torchx-nightly 2023.3.20__py3-none-any.whl → 2023.3.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

torchx/components/dist.py CHANGED
@@ -62,6 +62,7 @@ from typing import Dict, Iterable, List, Optional, Tuple
 
  import torchx
  import torchx.specs as specs
+ from torchx.components.structured_arg import StructuredJArgument, StructuredNameArgument
  from torchx.specs import macros
 
  _TORCH_DEBUG_FLAGS: Dict[str, str] = {
@@ -80,12 +81,88 @@ These are commonly set environment variables to debug PyTorch execution.
  """
 
 
+ def spmd(
+     *args: str,
+     script: Optional[str] = None,
+     m: Optional[str] = None,
+     image: str = torchx.IMAGE,
+     name: str = "/",
+     h: str = "gpu.small",
+     j: str = "1x1",
+     env: Optional[Dict[str, str]] = None,
+     max_retries: int = 0,
+     mounts: Optional[List[str]] = None,
+     debug: bool = False,
+ ) -> specs.AppDef:
+     """
+     Usage (by script): torchx run spmd -j 2x8 -h aws_p4d.24xlarge --name my_experiment/trial_1 --script path/to/my/trainer.py -foo bar
+
+     Usage (by module): torchx run spmd -j 2x8 -h aws_p4d.24xlarge --name my_experiment/trial_1 -m path.to.my.trainer -foo bar
+
+     Usage (infer GPU count): torchx run spmd -j 2 -h aws_p4d.24xlarge ... (same as -j 2x8)
+
+     Creates a torchx.specs.AppDef (Job Definition) for a Single-Program-Multiple-Data (SPMD)
+     style application. See: https://en.wikipedia.org/wiki/Single_program,_multiple_data.
+
+     SPMD launches `n x m` (set via the `-j nxm` option) copies of the same program,
+     where `n` is the number of nodes (hosts) and `m` is the number of processes on each node.
+
+     If you have a distributed PyTorch script (DDP, FSDP, RPC) use this component to launch
+     the distributed application. You can also use `-j 1x1` to launch a single process application,
+     which would be equivalent to launching with regular `python` except that your application
+     can safely call `torch.distributed.init_process_group(backend)`.
+
+     Note: For multi-node distributed runs, the hosts MUST have a network route to each other
+     AND port 29500 should be open on all hosts. Please check your security group settings.
+
+     Args:
+         args: the arguments to the main module or script (e.g. my/trainer.py -foo bar)
+             (for docker based runs) the script path must be relative to the WORKDIR of the image
+         script: the path to the main script
+         m: the main module name (e.g. my.module.trainer). When this option is used, the `script_args` are passed
+             as the arguments to the main module. Invoking the main module is useful when the relative/absolute path
+             of the main script is unknown w.r.t the WORKDIR of the image. Use this option when it makes sense to
+             invoke the main script via `python -m <MAIN.MODULE>`.
+         image: the base docker image of the workspace; if workspace is disabled, then the image of the job
+         name: ``{experimentname}/{runname}`` or ``{experimentname}/`` or ``/{runname}`` or ``{runname}``
+         h: the type of host to run on (e.g. aws_p4d.24xlarge). Must be one of the registered named resources
+         j: {nnodes}x{nproc_per_node}. For GPU hosts, omitting nproc_per_node will infer it from the GPU count on the host
+         env: environment variables to be passed to the run (e.g. ENV1=v1,ENV2=v2,ENV3=v3)
+         max_retries: the number of scheduler retries allowed
+         rdzv_port: the port on rank0's host to use for hosting the c10d store used for rendezvous.
+             Only takes effect when running multi-node. When running single node, this parameter
+             is ignored and a random free port is chosen.
+         mounts: (for docker based runs only) mounts to mount into the worker environment/container
+             (ex. type=<bind/volume>,src=/host,dst=/job[,readonly]).
+         debug: whether to run with preset debug flags enabled
+
+     """
+
+     if env is None:
+         env = {}
+
+     return ddp(
+         *args,
+         script=script,
+         m=m,
+         image=image,
+         name=name,
+         h=h,
+         j=str(StructuredJArgument.parse_from(h, j)),
+         env=env,
+         max_retries=max_retries,
+         mounts=mounts,
+         debug=debug,
+     )
+
+
  def ddp(
      *script_args: str,
      script: Optional[str] = None,
      m: Optional[str] = None,
      image: str = torchx.IMAGE,
-     name: Optional[str] = None,
+     name: str = "/",
      h: Optional[str] = None,
      cpu: int = 2,
      gpu: int = 0,
@@ -114,7 +191,8 @@ def ddp(
          script: script or binary to run within the image
          m: the python module path to run
          image: image (e.g. docker)
-         name: job name override (uses the script name if not specified)
+         name: job name override in the following format: ``{experimentname}/{runname}`` or ``{experimentname}/`` or ``/{runname}`` or ``{runname}``.
+             Uses the script or module name if ``{runname}`` not specified.
          cpu: number of cpus per replica
          gpu: number of gpus per replica
          memMB: cpu memory in MB per replica
@@ -138,14 +216,6 @@ def ddp(
      # nproc_per_node: number of processes on each node
      min_nnodes, max_nnodes, nproc_per_node, nnodes_rep = parse_nnodes(j)
 
-     if script:
-         # script name/module no extension
-         role_name = Path(script).stem
-     elif m:
-         role_name = m.rpartition(".")[2]
-     else:
-         raise ValueError("failed to compute role_name")
-
      rdzv_backend = "c10d"
      if max_nnodes == 1:
          # using port 0 makes elastic choose a free random port which is ok
@@ -165,8 +235,16 @@ def ddp(
 
      if env is None:
          env = {}
-     env.setdefault("LOGLEVEL", os.getenv("LOGLEVEL", "WARNING"))
 
+     argname = StructuredNameArgument.parse_from(
+         name=name,
+         m=m,
+         script=script,
+     )
+
+     env["TORCHX_TRACKING_EXPERIMENT_NAME"] = argname.experiment_name
+
+     env.setdefault("LOGLEVEL", os.getenv("LOGLEVEL", "WARNING"))
      if debug:
          env.update(_TORCH_DEBUG_FLAGS)
 
@@ -193,10 +271,10 @@ def ddp(
          cmd += ["-m", m]
      cmd += script_args
      return specs.AppDef(
-         name=name or role_name,
+         name=argname.run_name,
          roles=[
              specs.Role(
-                 name=role_name,
+                 name=get_role_name(script, m),
                  image=image,
                  min_replicas=min_nnodes,
                  entrypoint="bash",
@@ -214,6 +292,17 @@ def ddp(
      )
 
 
+ def get_role_name(script: Optional[str], m: Optional[str]) -> str:
+     if script:
+         # script name/module no extension
+         role_name = Path(script).stem
+     elif m:
+         role_name = m.rpartition(".")[2]
+     else:
+         raise ValueError("failed to compute role_name")
+     return role_name
+
+
  def _args_join(args: Iterable[str]) -> str:
      """
      _args_join is like shlex.join but if the argument is wrapped in _noquote
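
For orientation, a minimal sketch of invoking the new `spmd` component programmatically; the trainer script `train.py` and the argument values are hypothetical:

    from torchx.components.dist import spmd

    # CLI equivalent: torchx run spmd -j 2 -h gpu.xlarge --name exp1/trial1 --script train.py --lr 0.01
    app = spmd(
        "--lr", "0.01",      # forwarded to the trainer script
        script="train.py",   # hypothetical trainer (relative to the image WORKDIR for docker runs)
        h="gpu.xlarge",      # generic named resource with 8 GPUs per host
        j="2",               # nproc_per_node inferred from the host GPU count -> "2x8"
        name="exp1/trial1",  # {experiment_name}/{run_name}
    )
    assert app.name == "trial1"  # the run name parsed out of `name`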
torchx/components/structured_arg.py ADDED
@@ -0,0 +1,236 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Defines methods for structured (higher order) component argument parsing.
+ Use the functionalities defined in this module to author components
+ in such a way that the structured component arguments are consistent across the board.
+
+ A structured component argument is a function argument to a component (a function that returns an ``AppDef``)
+ that is human-friendly (less typing in the CLI or better readability) but technically embeds multiple
+ primitive arguments.
+
+ Examples:
+
+ #. ``-j {NNODES}x{NPROC_PER_NODE}`` (e.g. ``-j 1x2``): Compactly represents the number of
+    nodes and the number of workers per node for a distributed application. Otherwise these would have had
+    to be taken as two separate arguments: ``--nnodes 1 --nproc_per_node 8``
+ #. ``--name {EXPERIMENT_NAME}/{RUN_NAME}`` (e.g. ``--name t5_modeling/bfloat16_trial``): Uses a single
+    ``--name`` parameter to parse experiment and run names for logging experiments and trials (runs)
+    with an experiment tracker. The ``/`` delimiter is a natural way to group runs within experiments.
+
+ """
+ import warnings
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Optional
+
+ from pyre_extensions import none_throws
+
+ from torchx import specs
+
+
+ @dataclass
+ class StructuredNameArgument:
+     experiment_name: str
+     run_name: str
+
+     def __str__(self) -> str:
+         return f"{self.experiment_name or ''}/{self.run_name}"
+
+     @staticmethod
+     def parse_from(
+         name: str,
+         m: Optional[str] = None,
+         script: Optional[str] = None,
+         default_experiment_name: str = "default-experiment",
+     ) -> "StructuredNameArgument":
+         """
+         Creates a :py:class:`StructuredNameArgument` from the component arguments:
+         ``name``, ``m`` (main module), ``script`` (main script).
+
+         The ``name`` SHOULD be of the form ``{EXPERIMENT_NAME}/{RUN_NAME}`` where either or both
+         ``{EXPERIMENT_NAME}`` and ``{RUN_NAME}`` may be left empty.
+         A ``name`` without the ``/`` delimiter is treated as a run name.
+
+         For instance:
+
+         #. ``foo/``: specifies an experiment name but no run name
+         #. ``/bar``: specifies a run name but no experiment name
+         #. ``foo``: specifies a run name but no experiment name
+         #. ``foo/bar``: specifies both experiment and run names
+         #. ``/``: specifies neither an experiment nor a run name
+
+         If the run name is left empty then one is derived from either ``m``
+         or ``script``, whichever is not null.
+
+         .. important::
+             Exactly one of ``m`` (main module) or ``script`` (path to script) must be provided.
+             If both or neither are provided, then this function throws.
+
+         Examples:
+             .. doctest::
+
+                 >>> import os
+                 >>> from torchx.components.structured_arg import StructuredNameArgument
+                 >>> StructuredNameArgument.parse_from(name="foo/bar", script="bar/baz.py")
+                 StructuredNameArgument(experiment_name='foo', run_name='bar')
+
+                 >>> StructuredNameArgument.parse_from(name="foo/", script="bar/baz.py")
+                 StructuredNameArgument(experiment_name='foo', run_name='baz')
+
+                 >>> StructuredNameArgument.parse_from(name="/bar", script="bar/baz.py")
+                 StructuredNameArgument(experiment_name='default-experiment', run_name='bar')
+
+                 >>> StructuredNameArgument.parse_from(name="foobar", m="foo.bar")
+                 StructuredNameArgument(experiment_name='default-experiment', run_name='foobar')
+
+                 >>> StructuredNameArgument.parse_from(name="foo/bar", m="foo.bar.baz")
+                 StructuredNameArgument(experiment_name='foo', run_name='bar')
+
+                 >>> StructuredNameArgument.parse_from(name="foo/", m="foo.bar.baz")
+                 StructuredNameArgument(experiment_name='foo', run_name='baz')
+
+                 >>> StructuredNameArgument.parse_from(name="/bar", m="foo.bar.baz")
+                 StructuredNameArgument(experiment_name='default-experiment', run_name='bar')
+
+                 >>> StructuredNameArgument.parse_from(name="foo/bar")
+                 Traceback (most recent call last):
+                 ...
+                 ValueError: No main module or script specified.
+
+         Arguments:
+             name: ``{EXPERIMENT_NAME}/{RUN_NAME}``, ``/{RUN_NAME}``, ``{EXPERIMENT_NAME}/``, or ``{RUN_NAME}``
+             m: the main module (e.g. ``foo.bar.baz`` for ``foo/bar/baz.py``)
+             script: path to the main script
+
+         Raises:
+             ValueError: if both ``m`` and ``script`` are empty or both are non-empty.
+         """
+         if not m and not script:
+             raise ValueError(
+                 "No main module or script specified. Specify either a main module or a script path"
+             )
+         if m and script:
+             raise ValueError(
+                 "Both main module and script set. Specify exactly one of: main module or script, but not both"
+             )
+
+         run_name = ""
+         experiment_name = ""
+
+         delim_idx = name.find("/")
+         # just assume that name is the run_name (experiment name should default)
+         if delim_idx < 0:
+             run_name = name
+         elif delim_idx >= 0 and delim_idx < len(name) - 1:
+             # deal with:
+             #  1. /FOO (only run name)
+             #  2. FOO/BAR (both exp and run name)
+             #
+             # FOO/ (only exp name) will not enter this branch
+             # and will end up with an empty run_name (as declared above)
+             run_name = name[delim_idx + 1 :]
+
+         if delim_idx > 0:
+             experiment_name = name[:delim_idx]
+
+         if not run_name:
+             if m:  # use the last module name
+                 run_name = m.rpartition(".")[2]
+             else:  # use script name w/ no extension
+                 run_name = Path(none_throws(script)).stem
+         return StructuredNameArgument(
+             experiment_name or default_experiment_name, run_name
+         )
+
+
+ @dataclass
+ class StructuredJArgument:
+     nnodes: int
+     nproc_per_node: int
+
+     def __str__(self) -> str:
+         return f"{self.nnodes}x{self.nproc_per_node}"
+
+     @staticmethod
+     def parse_from(h: str, j: str) -> "StructuredJArgument":
+         """
+         Creates a :py:class:`StructuredJArgument` instance given the ``h`` (host) and ``j`` (nnodes x nproc_per_node)
+         component arguments.
+
+         If the host has GPUs and ``j`` only specifies nnodes (e.g. ``-j 2`` versus ``-j 2x8``), then
+         nproc_per_node is set equal to the number of GPUs on the host. If nproc_per_node was explicitly
+         specified, then it is honored even if it does not match the number of GPUs on the host.
+         However, a warning message is displayed reminding the user that there is a mismatch between
+         the GPU count on the host and the configured nproc_per_node.
+
+         Example (GPU):
+
+         .. doctest::
+
+             >>> from torchx.components.structured_arg import StructuredJArgument
+             >>> str(StructuredJArgument.parse_from(h="aws_p4d.24xlarge", j="2"))
+             '2x8'
+
+             >>> str(StructuredJArgument.parse_from(h="aws_p4d.24xlarge", j="2x4"))
+             '2x4'
+
+         For hosts with no GPU devices, one MUST specify nproc_per_node. Otherwise this function will
+         raise an error.
+
+         Example (CPU or Trainium):
+
+         .. doctest::
+
+             >>> str(StructuredJArgument.parse_from(h="aws_trn1.32xl", j="2"))
+             Traceback (most recent call last):
+             ...
+             ValueError: nproc_per_node cannot be inferred from GPU count. `aws_trn1.32xl` is not a GPU instance. ...
+
+             >>> str(StructuredJArgument.parse_from(h="aws_trn1.32xl", j="2x16"))
+             '2x16'
+
+         """
+         nums = j.split("x")
+         num_gpus = specs.named_resources[h].gpu
+         if len(nums) == 1:  # -j 1
+             nnodes = int(nums[0])
+             # infer nproc_per_node from the number of GPUs on the host
+             if num_gpus > 0:
+                 nproc_per_node = num_gpus
+             else:
+                 raise ValueError(
+                     f"nproc_per_node cannot be inferred from GPU count."
+                     f" `{h}` is not a GPU instance."
+                     f" You must specify `-j $NNODESx$NPROCS_PER_NODE` (e.g. `-j {nnodes}x8`)"
+                 )
+         elif len(nums) == 2:  # -j 1x2
+             nnodes = int(nums[0])
+             nproc_per_node = int(nums[1])
+
+             if nproc_per_node != num_gpus:
+                 warnings.warn(
+                     f"In `-j {j}` you specified nproc_per_node={nproc_per_node}"
+                     f" which does not equal the number of GPUs on a {h}: {num_gpus}."
+                     f" This may lead to under-utilization or an error."
+                     f" If this was intentional, ignore this warning."
+                     f" Otherwise set `-j {nnodes}` to auto-set nproc_per_node"
+                     f" to the number of GPUs on the host."
+                 )
+         else:
+             raise ValueError(
+                 f"Invalid format for `-j $NNODESx$NPROCS_PER_NODE` (e.g. `-j 1x8`). Given: {j}"
+             )
+
+         return StructuredJArgument(nnodes=nnodes, nproc_per_node=nproc_per_node)
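
Taken together, a component author parses both structured arguments up front; a minimal sketch (host and script names hypothetical):

    from torchx.components.structured_arg import StructuredJArgument, StructuredNameArgument

    j = StructuredJArgument.parse_from(h="gpu.medium", j="4")  # gpu.medium has 2 GPUs -> nnodes=4, nproc_per_node=2
    name = StructuredNameArgument.parse_from(name="exp/", script="train.py")
    print(str(j), name.experiment_name, name.run_name)  # "4x2" "exp" "train"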
torchx/distributed/__init__.py ADDED
@@ -0,0 +1,270 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Convenience methods to use ``torch.distributed``.
+ """
+
+ import logging
+ import os
+ import warnings
+ from contextlib import contextmanager
+ from typing import Any, Iterator
+
+ import torch
+ import torch.distributed as dist
+ from typing_extensions import Literal
+
+ log: logging.Logger = logging.getLogger(__name__)
+
+
+ def local_rank() -> int:
+     """
+     Returns the local rank (aka rank within the node) of this process.
+     Typically the local rank is used to set the CUDA device on the node.
+
+     .. warning::
+         This function only works correctly if the invoker of the program sets the ``LOCAL_RANK`` env var
+         or invokes the program with ``torchrun`` (aka ``torch.distributed.run``) or ``torchx``.
+         If ``LOCAL_RANK`` is not set or the process group is not initialized
+         then this function assumes that the process is not distributed and trivially returns 0.
+
+     """
+     if not dist.is_initialized():
+         return 0
+
+     if "LOCAL_RANK" not in os.environ:
+         warnings.warn(
+             "\n"
+             "==============================================================================================\n"
+             "`LOCAL_RANK` environment variable is not set. Will trivially return 0 for local_rank.\n"
+             " It is recommended to use torchrun/torchx to run your script or set the `LOCAL_RANK` manually.\n"
+             " For additional details see:\n"
+             " 1) https://pytorch.org/torchx/latest/components/distributed.html\n"
+             " 2) https://pytorch.org/docs/stable/elastic/run.html\n"
+             "=============================================================================================="
+         )
+         return 0
+     else:
+         return int(os.environ["LOCAL_RANK"])
+
+
+ def local_cuda_device() -> torch.device:
+     """
+     Returns the CUDA device (as a ``torch.device``) based on the local rank.
+
+     See Also: :py:func:`local_rank`.
+     """
+     return torch.device(f"cuda:{local_rank()}")
+
+
+ def rank() -> int:
+     """
+     A non-distributed-safe get_rank call. Unlike ``torch.distributed.get_rank()``,
+     this method will not fail if invoked from a non-distributed (e.g. process group not initialized)
+     context. Therefore, this method is safe to use in internal methods that may be used
+     in non-distributed contexts as well.
+
+     Returns:
+         If a process group has been initialized, the value returned by ``torch.distributed.get_rank()``.
+         Otherwise, 0 (trivial rank).
+
+     """
+     return dist.get_rank() if dist.is_initialized() else 0
+
+
+ def world_size() -> int:
+     """
+     A non-distributed-safe get_world_size call. Unlike ``torch.distributed.get_world_size()``,
+     this method will not fail if invoked from a non-distributed (e.g. process group not initialized)
+     context. Therefore, this method is safe to use in internal methods that may be used
+     in non-distributed contexts as well.
+
+     Returns:
+         If a process group has been initialized, the value returned by ``torch.distributed.get_world_size()``.
+         Otherwise, 1 (trivial world_size).
+
+     """
+     return dist.get_world_size() if dist.is_initialized() else 1
+
+
+ def is_rank0() -> bool:
+     """
+     Returns ``True`` if the caller is rank 0 (in a distributed setting).
+     If no process group has been initialized, then this method assumes
+     that the caller is a single process (aka not distributed) and trivially returns ``True``.
+     That is, for a non-distributed job there is only one process and hence that process
+     is trivially rank 0.
+
+     .. note::
+         To initialize the process group prefer to use :py:func:`init_pg` over
+         ``torch.distributed.init_process_group()`` since the former can be called from
+         both distributed and non-distributed scripts.
+
+     """
+     return rank() == 0
+
+
+ def is_local_rank0() -> bool:
+     """
+     Returns ``True`` if this process is local rank 0 and ``False`` otherwise.
+     Used to perform an action just once per node. Example:
+
+     .. code-block:: python
+
+         if is_local_rank0():
+             # download a file just once per node
+             download_file("s3://...")
+
+     """
+     return local_rank() == 0
+
+
+ Backend = Literal["nccl", "gloo", "auto"]
+
+
+ def init_pg(backend: Backend = "auto", **kwargs: Any) -> torch.device:
+     """
+     A convenience wrapper around ``torch.distributed.init_process_group()``
+     that makes initializing a trivial (single world_size) process group easy.
+
+     Useful when you want to make your code portable across launching with
+     simple python or with ``torchrun`` (aka ``torch.distributed.run``).
+
+     Usage:
+
+     .. doctest::
+
+         >>> from torchx.distributed import init_pg
+         >>> init_pg(backend="gloo")  # or nccl  # doctest: +SKIP
+         device(type='cpu')
+
+     The example above works to initialize a pytorch process group
+     for the trivial (``world_size = 1``) and distributed (``world_size > 1``)
+     cases without you having to write an explicit check with an if-else branch statement.
+
+     You can pass ``backend="auto"`` to have this function select ``"nccl"``
+     if there is a cuda device available, otherwise ``"gloo"`` (for CPU):
+
+     .. doctest::
+
+         >>> from torchx.distributed import init_pg
+         >>> device = init_pg(backend="auto")  # doctest: +SKIP
+
+     In the code above, ``device`` will be ``cuda:{LOCAL_RANK}`` if the host has CUDA devices (GPUs)
+     and ``cpu`` if not.
+
+     Returns:
+         The cuda device that this rank should be using, or the cpu device if ``backend="gloo"``
+         or if ``backend="auto"`` and no GPUs are available on the host.
+
+     """
+     if backend == "auto":
+         backend = (
+             "nccl"
+             if torch.cuda.is_available()  # returns True if gpu-torch was installed even on a CPU host
+             and (
+                 torch.cuda.device_count() > 0
+             )  # so need to check for CUDA devices explicitly
+             and dist.is_nccl_available()
+             else "gloo"
+         )
+
+     # this means that the script was launched as a single python process;
+     # initialize a trivial process group
+     if not dist.is_torchelastic_launched():
+         os.environ["MASTER_ADDR"] = "localhost"
+         os.environ["MASTER_PORT"] = "0"  # port selection - selects a free random port
+         dist.init_process_group(backend=backend, rank=0, world_size=1, **kwargs)
+     else:
+         dist.init_process_group(backend=backend, **kwargs)
+
+     if backend == "nccl":
+         return local_cuda_device()
+     else:
+         return torch.device("cpu")
+
+
+ @contextmanager
+ def on_rank0_first() -> Iterator[None]:
+     """
+     Runs the piece of code that is wrapped in this context manager
+     first on rank0, then on the rest of the ranks.
+
+     Example:
+
+     .. code-block:: python
+
+         import time
+         from torchx.distributed import on_rank0_first, rank
+
+         with on_rank0_first():
+             print(f"Running on rank {rank()} at {int(time.monotonic())}")
+             time.sleep(10)
+
+     Would print:
+
+     .. code-block::
+
+         Running on rank 0 at 12534774
+         Running on rank 1 at 12534784  # at least +10 seconds on the other ranks
+         Running on rank 2 at 12534784
+         ...
+
+     To run ONLY on rank0 use an if-statement as such:
+
+     .. code-block:: python
+
+         if is_rank0():
+             print(f"Running on rank {dist.get_rank()}")
+
+     The code above would only print once, on rank 0.
+
+     """
+     if dist.is_initialized() and not is_rank0():
+         dist.barrier()
+
+     try:
+         yield
+     finally:
+         if dist.is_initialized() and is_rank0():
+             dist.barrier()
+
+
+ @contextmanager
+ def on_local_rank0_first() -> Iterator[None]:
+     """
+     Runs the piece of code that is wrapped in this context manager
+     first on local rank 0, then on the rest of the ranks.
+
+     The behavior is exactly the same as :py:func:`torchx.distributed.on_rank0_first`
+     except that the barrier is on each local rank on each node (versus a global barrier on rank0).
+
+     This is useful in situations where a node-local action (that would otherwise cause races)
+     needs to be done first from a representative worker on each node. For instance,
+     downloading a checkpoint file to a tmp dir on each node once, then having all the
+     workers read off the downloaded file.
+
+     .. note::
+         For actions that need to be run first at a job level
+         use :py:func:`torchx.distributed.on_rank0_first`.
+
+     """
+     if dist.is_initialized() and not is_local_rank0():
+         dist.barrier()
+
+     try:
+         yield
+     finally:
+         if dist.is_initialized() and is_local_rank0():
+             dist.barrier()
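
A minimal sketch of a training script that stays portable between plain `python` and `torchrun` using the new helpers (the node-local download step is hypothetical):

    import torch
    from torchx.distributed import init_pg, is_rank0, local_rank, on_local_rank0_first

    device = init_pg(backend="auto")  # "nccl" on GPU hosts, otherwise "gloo"

    with on_local_rank0_first():
        # e.g. download a checkpoint once per node before other local ranks proceed
        pass

    model = torch.nn.Linear(8, 8).to(device)
    if is_rank0():
        print(f"initialized on {device} (local_rank={local_rank()})")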
torchx/specs/__init__.py CHANGED
@@ -14,6 +14,9 @@ import difflib
  from typing import Callable, Dict, Optional
 
  from torchx.specs.named_resources_aws import NAMED_RESOURCES as AWS_NAMED_RESOURCES
+ from torchx.specs.named_resources_generic import (
+     NAMED_RESOURCES as GENERIC_NAMED_RESOURCES,
+ )
  from torchx.util.entrypoints import load_group
 
  from .api import (  # noqa: F401 F403
@@ -55,12 +58,16 @@ GiB: int = 1024
  def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
      resource_methods = load_group("torchx.named_resources", default={})
      materialized_resources: Dict[str, Callable[[], Resource]] = {}
-     default = AWS_NAMED_RESOURCES
-     for name, resource in default.items():
+
+     for name, resource in {
+         **GENERIC_NAMED_RESOURCES,
+         **AWS_NAMED_RESOURCES,
+         **resource_methods,
+     }.items():
          materialized_resources[name] = resource
-     for resource_name, resource_method in resource_methods.items():
-         materialized_resources[resource_name] = resource_method
+
      materialized_resources["NULL"] = lambda: NULL_RESOURCE
+     materialized_resources["MISSING"] = lambda: NULL_RESOURCE
      return materialized_resources
 
 
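The dict-merge above gives entry-point registrations the last word: a resource registered under the `torchx.named_resources` entry point group overrides an AWS resource of the same name, which in turn overrides a generic one. A sketch of the resulting lookup:

    from torchx import specs

    small = specs.named_resources["gpu.small"]       # generic default (1 GPU)
    p4d = specs.named_resources["aws_p4d.24xlarge"]  # AWS-specific (8 GPUs)
    # a package exposing its own "gpu.small" factory via the entry point
    # group would shadow the generic definition above
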
torchx/specs/named_resources_generic.py ADDED
@@ -0,0 +1,58 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Defines generic named resources that are not specific to any cloud provider's
+ instance types. These generic named resources are meant to be used as
+ default values for components and examples and are NOT meant to be used
+ long term as the specific capabilities (e.g. number of cpu, gpu, memMB)
+ are subject to change.
+
+ .. note::
+     The named resources in this file DO NOT map device capabilities such as
+     special network interfaces (e.g. EFA devices on AWS).
+
+ .. warning::
+     Do not use for launching applications that require specific capabilities
+     (e.g. needs exactly 4 x A100 GPUs with 40GB of memory connected with NVLink).
+
+ Different cloud providers offer different
+ instance types, hence practically speaking one should register their own
+ named resources that accurately capture the instances they have at their disposal
+ rather than using these defaults long term.
+
+ .. note::
+     The cpu/gpu/memory ratios in these default resources are based on current
+     HW trends and do not map exactly to a particular instance type!
+
+ .. warning::
+     The specific capabilities of these default resources are subject to change
+     at any time based on current hardware spec trends.
+     Therefore, the user should NEVER assume that the specific number of cpu, gpu, and memMB
+     will always remain the same. For instance, never assume that ``gpu.small`` will always
+     have 8 cpus.
+
+ """
+ from typing import Callable, Mapping
+
+ from torchx.specs.api import Resource
+
+ GiB: int = 1024
+
+ NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
+     # typically system CPU memory is >= GPU memory (most modern GPUs have 32GB device mem)
+     # most cloud providers offer 1, 2, 4, 8 GPUs per host
+     "gpu.small": lambda: Resource(cpu=8, gpu=1, memMB=32 * GiB),
+     "gpu.medium": lambda: Resource(cpu=16, gpu=2, memMB=64 * GiB),
+     "gpu.large": lambda: Resource(cpu=32, gpu=4, memMB=128 * GiB),
+     "gpu.xlarge": lambda: Resource(cpu=64, gpu=8, memMB=256 * GiB),
+     # for cpu defaults - based on AWS's T2 (general purpose) instance type
+     "cpu.nano": lambda: Resource(cpu=1, gpu=0, memMB=512),
+     "cpu.micro": lambda: Resource(cpu=1, gpu=0, memMB=1 * GiB),
+     "cpu.small": lambda: Resource(cpu=1, gpu=0, memMB=2 * GiB),
+     "cpu.medium": lambda: Resource(cpu=2, gpu=0, memMB=4 * GiB),
+     "cpu.large": lambda: Resource(cpu=2, gpu=0, memMB=8 * GiB),
+     "cpu.xlarge": lambda: Resource(cpu=8, gpu=0, memMB=32 * GiB),
+ }
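
To replace these placeholders with resources that match real hardware, register factories under the `torchx.named_resources` entry point group (which `_load_named_resources` above loads via `load_group`); a sketch with a hypothetical package:

    # my_resources.py (hypothetical module)
    from torchx.specs.api import Resource

    def my_gpu_node() -> Resource:
        # a 4 x A100 on-prem host (hypothetical capabilities)
        return Resource(cpu=96, gpu=4, memMB=1024 * 1024)

    # setup.cfg of the hypothetical package:
    #   [options.entry_points]
    #   torchx.named_resources =
    #       my_gpu_node = my_resources:my_gpu_node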
torchx/tracker/mlflow.py ADDED
@@ -0,0 +1,342 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import dataclasses
+ import os
+ import socket
+ from getpass import getuser
+ from logging import getLogger, Logger
+ from pathlib import Path
+ from tempfile import gettempdir
+ from typing import Any, Dict, Iterable, Mapping, Optional, Sequence
+
+ import mlflow
+ from mlflow import MlflowClient
+ from mlflow.entities import Experiment, Run
+
+ from torchx.distributed import on_rank0_first
+ from torchx.runner.config import get_configs
+ from torchx.tracker.api import Lineage, TrackerArtifact, TrackerBase, TrackerSource
+
+ log: Logger = getLogger(__name__)
+ TAG_ARTIFACT_MD_PREFIX = "torchx.artifact.metadata"
+
+
+ class MLflowTracker(TrackerBase):
+     """
+     An implementation of a ``Tracker`` that uses mlflow as the backend.
+     Don't forget to call the ``close()`` method for orderly shutdown.
+     This ensures that the run state in mlflow is properly marked as ``FINISHED``,
+     otherwise the run will remain in ``UNFINISHED`` status.
+
+     .. important::
+         TorchX's run_id is used as mlflow's run_name! The run_id in TorchX
+         is the job name. The job name in TorchX is made unique by adding
+         a short random hash to the user-provided job name prefix. This is
+         done because certain job schedulers supported by TorchX require
+         that the job name on the submitted job definition is globally unique
+         (rather than the scheduler returning a unique job id as the return result
+         of the job submission API).
+
+     .. warning::
+         APIs on this class may only be called with the same ``run_name``.
+         Typically the user does not have to worry about manually setting
+         the run_name as it is picked up by default from the environment variable
+         ``TORCHX_APP_NAME``.
+
+     """
+
+     def __init__(
+         self,
+         experiment_name: Optional[str] = None,
+         tracking_uri: str = f"file://{Path(gettempdir()) / 'torchx' / 'mlruns'}",
+         artifact_location: Optional[str] = None,
+     ) -> None:
+         if experiment_name is None:
+             experiment_name = self.default_experiment_name()
+
+         self.tracking_uri = tracking_uri
+         mlflow.set_tracking_uri(tracking_uri)
+         log.info(
+             f"MLflow tracking_uri={tracking_uri}, artifact_location={artifact_location}"
+         )
+         with on_rank0_first():
+             existing_experiment = mlflow.get_experiment_by_name(experiment_name)
+             if existing_experiment:
+                 self.experiment: Experiment = existing_experiment
+                 log.info(
+                     f"Found existing experiment `{experiment_name}` (id={self.experiment_id})"
+                 )
+             else:
+                 experiment_id = mlflow.create_experiment(
+                     name=experiment_name,
+                     artifact_location=artifact_location,
+                 )
+                 self.experiment = mlflow.get_experiment(experiment_id)
+                 log.info(
+                     f"Created new experiment `{experiment_name}` (id={experiment_id})"
+                 )
+
+     @staticmethod
+     def default_experiment_name() -> str:
+         return f"default-experiment/{getuser()}/{socket.getfqdn()}"
+
+     @property
+     def experiment_id(self) -> str:
+         return self.experiment.experiment_id
+
+     @property
+     def experiment_name(self) -> str:
+         return self.experiment.name
+
+     @property
+     def artifact_location(self) -> str:
+         return self.experiment.artifact_location
+
+     def get_run(self, run_name: str) -> Run:
+         """
+         Gets mlflow's ``Run`` object for the given ``run_name`` in the current experiment.
+         If no such run exists, this method creates a new run under
+         this experiment and starts the run so that subsequent calls to
+         mlflow log metadata, metrics, and artifacts to the newly created run.
+
+         .. warning::
+             This method should only be called with the same run_name!
+             This is because mlflow APIs work by setting an "active run";
+             subsequent mlflow logging APIs are made against the current
+             active run in the stack. If you call ``mlflow.start_run()`` directly
+             or pass different run names, then you may be logging into two different
+             mlflow runs from the same job!
+
+         Args:
+             run_name: equal to torchx's run_id
+
+         Returns: mlflow's ``Run`` object for the ``run_name``
+
+         """
+         active_run = mlflow.active_run()
+
+         if active_run is None:
+             search_result = mlflow.search_runs(
+                 experiment_ids=[self.experiment_id],
+                 output_format="list",
+                 filter_string=f"tags.`mlflow.runName` = '{run_name}'",
+             )
+             if not search_result:
+                 return mlflow.start_run(
+                     experiment_id=self.experiment_id, run_name=run_name
+                 )
+             elif len(search_result) == 1:
+                 return search_result[0]
+             else:  # len(search_result) > 1
+                 raise RuntimeError(
+                     f"More than 1 run found for run_name `{run_name}` in experiment `{self.experiment_name}`."
+                     f" Did you manually create runs with the same name under this experiment?"
+                     f" Remove duplicate run names and try again"
+                 )
+         else:
+             # need to query mlflow again so that the run reflects any newly written logs
+             return mlflow.get_run(active_run.info.run_id)
+
+     def get_run_id(self, run_name: str) -> str:
+         """
+         Gets the mlflow run's run_id for the given ``run_name`` and additionally sets
+         this run as the active run. Hence this method has a side-effect where all subsequent
+         calls to mlflow log APIs are against the run for the given ``run_name``.
+         """
+         return self.get_run(run_name).info.run_id
+
+     def close(self) -> None:
+         mlflow.end_run()
+
+     def add_artifact(
+         self,
+         run_id: str,
+         name: str,
+         path: str,
+         metadata: Optional[Mapping[str, object]] = None,
+     ) -> None:
+         self.get_run(run_id)
+         # stores the artifact in {artifact_location}/{name} (e.g. s3://bucket/prefix/{name})
+         mlflow.log_artifact(local_path=path, artifact_path=name)
+
+         # add artifact metadata with the torchx.artifact.metadata.{name}.* tag prefix
+         if metadata:
+             mlflow.set_tags(
+                 tags={
+                     f"{TAG_ARTIFACT_MD_PREFIX}.{name}.{k}": v
+                     for k, v in metadata.items()
+                 }
+             )
+
+     def artifacts(self, run_id: str) -> Mapping[str, TrackerArtifact]:
+         artifacts: Dict[str, TrackerArtifact] = {}
+         mlflow_client: MlflowClient = MlflowClient(self.tracking_uri)
+
+         def get_artifacts(path: Optional[str] = None) -> None:
+             for artifact_info in mlflow_client.list_artifacts(
+                 self.get_run(run_id).info.run_id, path=path
+             ):
+                 if artifact_info.is_dir:
+                     get_artifacts(path=artifact_info.path)
+                 else:
+                     # we stored the artifact using the name as the path
+                     # so path should never be `None` when we get to this point
+                     # (e.g. the root of `artifact_location` will only have directories
+                     # where the directory names are the artifact names)
+                     name = path or "<SHOULD_NOT_HAPPEN>"
+
+                     # artifact metadata is stored as run tags with the `torchx.artifact.metadata.*` prefix
+                     tag_prefix = f"{TAG_ARTIFACT_MD_PREFIX}.{name}."
+                     metadata = {
+                         # k.removeprefix() only avail in python 3.9+
+                         k[len(tag_prefix) :]: v
+                         for k, v in self.metadata(run_id).items()
+                         if k.startswith(tag_prefix)
+                     }
+
+                     # add some additional metadata about the artifact
+                     metadata["mlflow.file_size"] = artifact_info.file_size
+
+                     artifacts[name] = TrackerArtifact(
+                         name=name,
+                         path=f"{self.artifact_location}/{artifact_info.path}",
+                         metadata=metadata,
+                     )
+
+         get_artifacts()
+         return artifacts
+
+     def add_metadata(self, run_id: str, **kwargs: object) -> None:
+         self.get_run(run_id)
+         mlflow.set_tags(tags={k: v for k, v in kwargs.items()})
+
+     def metadata(self, run_id: str) -> Mapping[str, object]:
+         return self.get_run(run_id).data.tags
+
+     def run_ids(self, **kwargs: str) -> Iterable[str]:
+         runs = mlflow.search_runs(
+             experiment_ids=[self.experiment_id], output_format="list"
+         )
+         return [r.info.run_name for r in runs]
+
+     def log_params_flat(
+         self, run_name: str, cfg: Any, key: str = ""  # pyre-ignore[2]
+     ) -> None:
+         """
+         Designed to be primarily used with hydra-style config objects (e.g. dataclasses),
+         logs the given ``cfg``, which is one of: ``@dataclass``, ``Sequence`` (e.g. ``list``, ``tuple``, ``set``),
+         or ``Mapping`` (e.g. ``dict``), where the fields of ``cfg`` are flattened recursively and logged as
+         the run's ``Parameter``s in mlflow.
+
+         For example if ``cfg`` is:
+
+         .. code-block:: python
+
+             @dataclass
+             class Config2:
+                 foo: str = "bar"
+
+             @dataclass
+             class Config:
+                 i: int = 1
+                 f: float = 2.1
+                 s: str = "string"
+                 l: List[str] = field(default_factory=lambda: ["a", "b", "c"])
+                 cfg_list: List[Config2] = field(default_factory=lambda: [Config2(foo="hello"), Config2(foo="world")])
+                 cfg2: Config2 = Config2()
+
+         Then this function logs the following parameters:
+
+         .. code-block::
+
+             i: "1"
+             f: "2.1"
+             s: "string"
+             l: ["a", "b", "c"]
+             cfg_list._0.foo = "hello"
+             cfg_list._1.foo = "world"
+             cfg2.foo = "bar"
+
+         As shown above, primitive sequence containers are logged directly (e.g. ``l: ["a", "b", "c"]``)
+         whereas nested sequence containers will be logged per element where the key is suffixed with
+         ``_{INDEX}`` (e.g. ``cfg_list._0.foo = "hello"``).
+
+         """
+         if dataclasses.is_dataclass(cfg):
+             cfg = dataclasses.asdict(cfg)
+         self.get_run(run_name)
+
+         def is_primitive(v: Any) -> bool:  # pyre-ignore[2]
+             return isinstance(v, (str, int, float, bool))
+
+         key_prefix = f"{key}." if key else ""
+
+         if not cfg:
+             # empty container; log as is
+             mlflow.log_param(key, cfg)
+         else:
+             # non-empty container; check types
+             if isinstance(cfg, (Sequence, set)):
+                 # assume list/set elements are homogeneous;
+                 # need only check the first element for type
+                 elem = next(iter(cfg))
+                 if is_primitive(elem):
+                     mlflow.log_param(key, cfg)
+                 else:
+                     for i, e in enumerate(cfg):
+                         self.log_params_flat(run_name, e, f"{key_prefix}_{i}")
+             elif isinstance(cfg, Mapping):
+                 for k, v in cfg.items():
+                     if is_primitive(v):
+                         mlflow.log_param(f"{key_prefix}{k}", v)
+                     else:
+                         self.log_params_flat(run_name, v, f"{key_prefix}{k}")
+
+     def add_source(
+         self, run_id: str, source_id: str, artifact_name: Optional[str] = None
+     ) -> None:
+         raise NotImplementedError(
+             f"Job's tracker sources is currently unsupported for {self.__class__.__qualname__}"
+         )
+
+     def sources(
+         self, run_id: str, artifact_name: Optional[str] = None
+     ) -> Iterable[TrackerSource]:
+         raise NotImplementedError(
+             f"Job's tracker sources is currently unsupported for {self.__class__.__qualname__}"
+         )
+
+     def lineage(self, run_id: str) -> Lineage:
+         raise NotImplementedError(
+             f"Job's lineage is currently unsupported for {self.__class__.__qualname__}"
+         )
+
+
+ def create_tracker(config: str) -> MLflowTracker:
+     ctor_args = get_configs(
+         prefix="tracker",
+         name="mlflow",
+         dirs=[config],
+     )
+
+     # remove the "config" key since that one is reserved for torchx.tracker usage
+     ctor_args.pop("config", None)
+
+     # pass configs read from the .torchxconfig [tracker:mlflow] section as kwargs;
+     # get the experiment name from an env var (set in torchx.components.dist:spmd);
+     # if no such env var exists, then default the experiment_name to the one
+     # specified in .torchxconfig
+     return MLflowTracker(
+         experiment_name=os.getenv(
+             "TORCHX_TRACKING_EXPERIMENT_NAME",
+             default=ctor_args.pop("experiment_name", None),
+         ),
+         **ctor_args,
+     )
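
A sketch of wiring the tracker up through `.torchxconfig` as `create_tracker` reads it (section keys and values hypothetical):

    # .torchxconfig
    #   [tracker:mlflow]
    #   tracking_uri = file:///tmp/torchx/mlruns
    #   experiment_name = my-experiment

    from torchx.tracker.mlflow import create_tracker

    tracker = create_tracker(config=".")         # directory containing .torchxconfig
    tracker.add_metadata("myjob-a1b2c3", k="v")  # run_name == torchx job name (hypothetical)
    tracker.close()                              # marks the mlflow run FINISHED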
torchx/util/strings.py CHANGED
@@ -10,7 +10,7 @@ import re
  def normalize_str(data: str) -> str:
      """
      Invokes ``lower`` on the string and removes all
-     characters that do not satisfy ``[a-z0-9]`` pattern.
+     characters that do not satisfy ``[a-z0-9\\-]`` pattern.
      This method is mostly used to make sure the kubernetes and gcp_batch schedulers get
      a job name that does not violate their restrictions.
      """
torchx_nightly-2023.3.20.dist-info/METADATA → torchx_nightly-2023.3.22.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: torchx-nightly
- Version: 2023.3.20
+ Version: 2023.3.22
  Summary: TorchX SDK and Components
  Home-page: https://github.com/pytorch/torchx
  Author: TorchX Devs
@@ -42,6 +42,7 @@ Requires-Dist: google-cloud-runtimeconfig (>=0.33.2) ; extra == 'dev'
  Requires-Dist: hydra-core ; extra == 'dev'
  Requires-Dist: ipython ; extra == 'dev'
  Requires-Dist: kfp (==1.8.9) ; extra == 'dev'
+ Requires-Dist: mlflow-skinny ; extra == 'dev'
  Requires-Dist: moto (==4.1.3) ; extra == 'dev'
  Requires-Dist: pyre-extensions ; extra == 'dev'
  Requires-Dist: pyre-check ; extra == 'dev'
torchx_nightly-2023.3.20.dist-info/RECORD → torchx_nightly-2023.3.22.dist-info/RECORD RENAMED
@@ -24,15 +24,17 @@ torchx/cli/colors.py,sha256=bVN_jEDwLgvypnDMeCHKn0q0ZDDhQjBJnyVfZHAE6nc,553
  torchx/cli/main.py,sha256=ysAQh0vPn0hC3JeVzRP0i7-E6dTO2D4rLN2B5Ok3abw,3442
  torchx/components/__init__.py,sha256=6-TQ4SY-Tn56os_1lOs_HMabOoE7gkkud_8e1BgvfJw,12106
  torchx/components/component_test_base.py,sha256=eKOwBp5cRgiA4FgZd_FCvyJ-ppv2v3JN9AGXnaSK_Cw,4135
- torchx/components/dist.py,sha256=l_IEtblhTTu-_8lDpfWNo5Pc_9teE9bVWDhRy_3hJ7s,9931
+ torchx/components/dist.py,sha256=FA1Wxxdo4qZ1_zd2Y20JpHog2ZjEGCkjN0k7dz5ZLvE,14075
  torchx/components/interpret.py,sha256=g8gkKdDJvsBfX1ZrpVT7n2bMEtmwRV_1AqDyAnnQ_aA,697
  torchx/components/metrics.py,sha256=1gbp8BfzZWGa7PD1db5vRADlONzmae4qSBUUdCWayr0,2814
  torchx/components/serve.py,sha256=9RlpwlU2KOC7sMOZBeYwUpJIKDCXrU8xNo1SH-AT3fc,2141
+ torchx/components/structured_arg.py,sha256=uavcUeFDRnMP7cWAqcxR3ujJYi6JEsClz0_Rd4Dgxj4,9542
  torchx/components/train.py,sha256=vtrQXRcD7bIcbb3lSeyD9BBlIe1mv1WNW6rnLK9R0Mw,1259
  torchx/components/utils.py,sha256=m7mFe6du2AMHpViwcW9dF8rr_twQB6KHQuEzJyHwBXw,9025
  torchx/components/integration_tests/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
  torchx/components/integration_tests/component_provider.py,sha256=fcdVWJt6dewv16qHOLl4PKQUK1PSfkPWy7d4dKMqZmo,3925
  torchx/components/integration_tests/integ_tests.py,sha256=OVgRvGrLWhDUNlqbbYj90ukGmkAwka2KubCWUR8pC7Y,5150
+ torchx/distributed/__init__.py,sha256=OAO1CIwVBOaclzbp2NjH_SMBq2WlK7aE9NVlNmDtVlQ,8786
  torchx/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  torchx/examples/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  torchx/examples/apps/datapreproc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -78,12 +80,13 @@ torchx/schedulers/streams.py,sha256=ObaKwEEcnsjrPyc6VZOp8cgZ_f2RFextAxeISxZUWeQ,
  torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
  torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
  torchx/schedulers/ray/ray_driver.py,sha256=0DL8Ad_hire-WgH8ZEYx1Q-mI2SUfZDk-6_6PICk8OQ,12282
- torchx/specs/__init__.py,sha256=lWdg9f9OVtJ5QmgMO_o6ek9N3gTb8XimP0HIlJVrZcY,5369
+ torchx/specs/__init__.py,sha256=fSA89Y0ZpdZLJmhIfEKNbjNNi6fbDR9k1bpIM7Xm7xo,5462
  torchx/specs/api.py,sha256=vFtvhYW18HvWzET8ob8ONNRC1MCuYwLMXohWDy0LbZI,33798
  torchx/specs/builders.py,sha256=dsKa80PD-cuIjXhFF2TsEY0eL8S01wux8aZaQrURIEE,8512
  torchx/specs/file_linter.py,sha256=LREWELpHJyE7YN3rc5ixf2ZydWFU9dlcSy5gGqdB5rA,11714
  torchx/specs/finder.py,sha256=RJI0PkG69esuVzhCp4w6Lotu2tSzIRh6PhWemCSQR7I,16234
  torchx/specs/named_resources_aws.py,sha256=6ID0jOGi5HVXn9BLylzCMD-SUQtxGeBHS5zU8PCten4,6361
+ torchx/specs/named_resources_generic.py,sha256=_xz0cRjy3fz-CVtX9G_MY7f3NX6n3AkP3xzAkuDevwk,2631
  torchx/specs/test/components/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
  torchx/specs/test/components/a/__init__.py,sha256=T7exlQ47Fak5ajCEGPg6_yOfChJCWpIMhWBmSVUnlrQ,546
  torchx/specs/test/components/a/b/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
@@ -92,6 +95,7 @@ torchx/specs/test/components/c/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFR
  torchx/specs/test/components/c/d.py,sha256=RH07jjo6uvFbzIaNFnAwmD_h24cEsT8kyZDTN-ezFio,531
  torchx/tracker/__init__.py,sha256=kaynfAwMYtdkkvXNrb4Rmin2mSd5vi9mg25iz1e0TJ8,4200
  torchx/tracker/api.py,sha256=qIgoHNlc_RsQbQdcHF8GDrKby_vLlZg84Tn6VfONrY4,11284
+ torchx/tracker/mlflow.py,sha256=P_mj7Yi-bZc7QOZ-6PJW4FHeWWGX0rjauNK0zXC25ig,13221
  torchx/tracker/backend/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
  torchx/tracker/backend/fsspec.py,sha256=JpSioMgn54mrxqqpY0kw5Gudqx9hhxkgDLaOFSEP2Ko,10425
  torchx/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -99,15 +103,15 @@ torchx/util/datetime.py,sha256=e-sO5Wjx1Utpln14C3qfJHl4v3KM-SMnn11hSyvkqFY,390
  torchx/util/entrypoints.py,sha256=C4A7cF1tPLlfyYWyZ7uZEtsKeuoOoLbMv0sOSxLhXs4,2710
  torchx/util/io.py,sha256=sxb6KI42Lq6n5z6_-YKW_mAhgPdC6CxzexlMyGheWSc,1792
  torchx/util/shlex.py,sha256=KzyWektMeU3oXS3Z5mFkNSPLItBTszVcvQ3EYfOMUYA,448
- torchx/util/strings.py,sha256=CfR2FtT0bNFQW-jeILL5gr85u3QShpBDR5_VIJmyYI0,660
+ torchx/util/strings.py,sha256=7CZe5WKHa7IQ6DuJCYeJ5FapUC4Fd1OGeq1yZAmjluw,663
  torchx/util/types.py,sha256=6ASuDKGO91UU3DCSuWhPX_C03341tApLCQEByUz8xpY,7016
  torchx/workspace/__init__.py,sha256=KbGEzJqqXaIxALm_EQO64aw-fE7MeDMFXcpU1mY650I,783
  torchx/workspace/api.py,sha256=Ej6DR__mNWaVyZgoVNAAOloDy1kTD5X1jz7pRtoVf80,5464
  torchx/workspace/dir_workspace.py,sha256=Fz-hKIx0KN8iJf2BsthNj0NvTkWlxP6WFsElPs_BaT0,2253
  torchx/workspace/docker_workspace.py,sha256=Yd8ut26bNfjyJQnmH8ANOrflfr-4VKcnOrIjbi_XIUY,9208
- torchx_nightly-2023.3.20.dist-info/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
- torchx_nightly-2023.3.20.dist-info/METADATA,sha256=0zjkrDxNcaB-NPPNX8dWihdDzDeexn1dVUCXm32ZIds,5376
- torchx_nightly-2023.3.20.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
- torchx_nightly-2023.3.20.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
- torchx_nightly-2023.3.20.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
- torchx_nightly-2023.3.20.dist-info/RECORD,,
+ torchx_nightly-2023.3.22.dist-info/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+ torchx_nightly-2023.3.22.dist-info/METADATA,sha256=UbHY2VUh7IvI4ty1vY1HD-0V8LCtrIakJVG77fVzEaU,5422
+ torchx_nightly-2023.3.22.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+ torchx_nightly-2023.3.22.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
+ torchx_nightly-2023.3.22.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+ torchx_nightly-2023.3.22.dist-info/RECORD,,