torchx-nightly 2023.3.21-py3-none-any.whl → 2023.3.22-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/components/dist.py +102 -13
- torchx/components/structured_arg.py +236 -0
- torchx/distributed/__init__.py +270 -0
- torchx/specs/__init__.py +11 -4
- torchx/specs/named_resources_generic.py +58 -0
- torchx/tracker/mlflow.py +342 -0
- torchx/util/strings.py +1 -1
- {torchx_nightly-2023.3.21.dist-info → torchx_nightly-2023.3.22.dist-info}/METADATA +2 -1
- {torchx_nightly-2023.3.21.dist-info → torchx_nightly-2023.3.22.dist-info}/RECORD +13 -9
- {torchx_nightly-2023.3.21.dist-info → torchx_nightly-2023.3.22.dist-info}/LICENSE +0 -0
- {torchx_nightly-2023.3.21.dist-info → torchx_nightly-2023.3.22.dist-info}/WHEEL +0 -0
- {torchx_nightly-2023.3.21.dist-info → torchx_nightly-2023.3.22.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2023.3.21.dist-info → torchx_nightly-2023.3.22.dist-info}/top_level.txt +0 -0
torchx/components/dist.py
CHANGED
@@ -62,6 +62,7 @@ from typing import Dict, Iterable, List, Optional, Tuple
 
 import torchx
 import torchx.specs as specs
+from torchx.components.structured_arg import StructuredJArgument, StructuredNameArgument
 from torchx.specs import macros
 
 _TORCH_DEBUG_FLAGS: Dict[str, str] = {
@@ -80,12 +81,88 @@ These are commonly set environment variables to debug PyTorch execution.
 """
 
 
+def spmd(
+    *args: str,
+    script: Optional[str] = None,
+    m: Optional[str] = None,
+    image: str = torchx.IMAGE,
+    name: str = "/",
+    h: str = "gpu.small",
+    j: str = "1x1",
+    env: Optional[Dict[str, str]] = None,
+    max_retries: int = 0,
+    mounts: Optional[List[str]] = None,
+    debug: bool = False,
+) -> specs.AppDef:
+    """
+    Usage (by script): torchx run spmd -j 2x8 -h aws_p4d.24xlarge --name my_experiment/trial_1 --script path/to/my/trainer.py -foo bar
+
+    Usage (by module): torchx run spmd -j 2x8 -h aws_p4d.24xlarge --name my_experiment/trial_1 -m path.to.my.trainer -foo bar
+
+    Usage (infer GPU count): torchx run spmd -j 2 -h p4d.24xlarge ... (same as -j 2x8)
+
+    Creates a torchx.specs.AppDef (Job Definition) for a Single-Program-Multiple-Data (SPMD)
+    style application. See: https://en.wikipedia.org/wiki/Single_program,_multiple_data.
+
+    SPMD launches `n x m` (set via the `-j nxm` option) copies of the same program,
+    where `n` is the number of nodes (hosts) and `m` is the number of processes on each node.
+
+    If you have a distributed PyTorch script (DDP, FSDP, RPC) use this component to launch
+    the distributed application. You can also use `-j 1x1` to launch a single process application
+    which would be equivalent to launching with regular `python` except that your application
+    can safely call `torch.distributed.init_process_group(backend)`.
+
+    Note: For multi-node distributed runs, the hosts MUST have a network route to each other
+    AND port 29500 should be open on all hosts. Please check your security group settings.
+
+
+    Args:
+        args: the arguments to the main module or script (e.g. my/trainer.py -foo bar)
+        script: (for docker based runs) the script path must be relative to the WORKDIR of the image
+        m: the main module name (e.g. my.module.trainer). When this option is used, the `script_args` are passed
+            as the arguments to the main module. Invoking the main module is useful when the relative/absolute path
+            of the main script is unknown w.r.t the WORKDIR of the image. Use this option when it makes sense to
+            invoke the main script via `python -m <MAIN.MODULE>`.
+        image: the base docker image of the workspace, if workspace is disabled, then the image of the job
+        name: ``{experimentname}/{runname}`` or ``{experimentname}/`` or ``/{runname}`` or ``{runname}``
+        h: the type of host to run on (e.g. aws_p4d.24xlarge). Must be one of the registered named resources
+        j: {nnodes}x{nproc_per_node}. For GPU hosts omitting nproc_per_node will infer it from the GPU count on the host
+        env: environment variables to be passed to the run (e.g. ENV1=v1,ENV2=v2,ENV3=v3)
+        max_retries: the number of scheduler retries allowed
+        rdzv_port: the port on rank0's host to use for hosting the c10d store used for rendezvous.
+            Only takes effect when running multi-node. When running single node, this parameter
+            is ignored and a random free port is chosen.
+        mounts: (for docker based runs only) mounts to mount into the worker environment/container
+            (ex. type=<bind/volume>,src=/host,dst=/job[,readonly]).
+        debug: whether to run with preset debug flags enabled
+
+    """
+
+    if env is None:
+        env = {}
+
+    return ddp(
+        *args,
+        script=script,
+        m=m,
+        image=image,
+        name=name,
+        h=h,
+        j=str(StructuredJArgument.parse_from(h, j)),
+        env=env,
+        max_retries=max_retries,
+        mounts=mounts,
+        debug=debug,
+    )
+
+
 def ddp(
     *script_args: str,
     script: Optional[str] = None,
     m: Optional[str] = None,
     image: str = torchx.IMAGE,
-    name:
+    name: str = "/",
     h: Optional[str] = None,
     cpu: int = 2,
     gpu: int = 0,
@@ -114,7 +191,8 @@ def ddp(
         script: script or binary to run within the image
         m: the python module path to run
         image: image (e.g. docker)
-        name: job name override
+        name: job name override in the following format: ``{experimentname}/{runname}`` or ``{experimentname}/`` or ``/{runname}`` or ``{runname}``.
+            Uses the script or module name if ``{runname}`` not specified.
         cpu: number of cpus per replica
         gpu: number of gpus per replica
         memMB: cpu memory in MB per replica
@@ -138,14 +216,6 @@ def ddp(
     # nproc_per_node: number of processes on each node
     min_nnodes, max_nnodes, nproc_per_node, nnodes_rep = parse_nnodes(j)
 
-    if script:
-        # script name/module no extension
-        role_name = Path(script).stem
-    elif m:
-        role_name = m.rpartition(".")[2]
-    else:
-        raise ValueError("failed to compute role_name")
-
     rdzv_backend = "c10d"
     if max_nnodes == 1:
         # using port 0 makes elastic chose a free random port which is ok
@@ -165,8 +235,16 @@ def ddp(
 
     if env is None:
         env = {}
-    env.setdefault("LOGLEVEL", os.getenv("LOGLEVEL", "WARNING"))
 
+    argname = StructuredNameArgument.parse_from(
+        name=name,
+        m=m,
+        script=script,
+    )
+
+    env["TORCHX_TRACKING_EXPERIMENT_NAME"] = argname.experiment_name
+
+    env.setdefault("LOGLEVEL", os.getenv("LOGLEVEL", "WARNING"))
     if debug:
         env.update(_TORCH_DEBUG_FLAGS)
 
@@ -193,10 +271,10 @@ def ddp(
         cmd += ["-m", m]
     cmd += script_args
     return specs.AppDef(
-        name=
+        name=argname.run_name,
         roles=[
             specs.Role(
-                name=
+                name=get_role_name(script, m),
                 image=image,
                 min_replicas=min_nnodes,
                 entrypoint="bash",
@@ -214,6 +292,17 @@
     )
 
 
+def get_role_name(script: Optional[str], m: Optional[str]) -> str:
+    if script:
+        # script name/module no extension
+        role_name = Path(script).stem
+    elif m:
+        role_name = m.rpartition(".")[2]
+    else:
+        raise ValueError("failed to compute role_name")
+    return role_name
+
+
 def _args_join(args: Iterable[str]) -> str:
     """
     _args_join is like shlex.join but if the argument is wrapped in _noquote
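To make the new component concrete, here is a minimal sketch of invoking `spmd` programmatically rather than through the CLI. The script name, trainer arguments, and experiment/run names below are illustrative placeholders, not values from this release:

    # Sketch (assumes torchx-nightly>=2023.3.22 is installed; ./train.py is a
    # placeholder trainer script relative to the image WORKDIR).
    from torchx.components.dist import spmd

    app = spmd(
        "--lr", "0.01",                # forwarded to the trainer as *args
        script="train.py",             # mutually exclusive with m=...
        name="my_experiment/trial_1",  # {experiment_name}/{run_name}
        h="gpu.small",                 # a registered named resource (see below)
        j="2x1",                       # 2 nodes x 1 process per node
    )
    print(app.name)  # -> "trial_1" (the run name parsed out of --name)

Note that `spmd` delegates to `ddp` after normalizing `j` through `StructuredJArgument`, which is defined in the new structured_arg.py module that follows.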
torchx/components/structured_arg.py
ADDED
@@ -0,0 +1,236 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Defines methods for structured (higher order) component argument parsing.
+Use the functionalities defined in this module to author components
+in such a way that the structured component arguments are consistent across the board.
+
+A structured component argument is a function argument to a component (a function that returns an ``AppDef``)
+that is human-friendly (less typing in the CLI or better readability) but technically embeds multiple
+primitive arguments.
+
+Examples:
+
+#. ``-j {NNODES}x{NPROC_PER_NODE}`` (e.g. ``-j 1x2``): Compactly represents the number of
+   nodes and number of workers per node for a distributed application. Otherwise would've had to be taken
+   as two separate arguments: ``--nnodes 1 --nproc_per_node 2``
+#. ``--name {EXPERIMENT_NAME}/{RUN_NAME}`` (e.g. ``--name t5_modeling/bfloat16_trial``): Uses a single
+   ``--name`` parameter to parse experiment and run names for logging experiments and trials (runs)
+   with an experiment tracker. The ``/`` delimiter is a natural way to group runs within experiments.
+
+"""
+import warnings
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+from pyre_extensions import none_throws
+
+from torchx import specs
+
+
+@dataclass
+class StructuredNameArgument:
+    experiment_name: str
+    run_name: str
+
+    def __str__(self) -> str:
+        return f"{self.experiment_name or ''}/{self.run_name}"
+
+    @staticmethod
+    def parse_from(
+        name: str,
+        m: Optional[str] = None,
+        script: Optional[str] = None,
+        default_experiment_name: str = "default-experiment",
+    ) -> "StructuredNameArgument":
+        """
+        Creates a :py:class:`StructuredNameArgument` from the component arguments:
+        ``name``, ``m`` (main module), ``script`` (main script).
+
+        The ``name`` MUST be of the form ``{EXPERIMENT_NAME}/{RUN_NAME}`` where either or both
+        ``{EXPERIMENT_NAME}`` and ``{RUN_NAME}`` may be left empty.
+        However, the ``name`` must include the ``/`` delimiter.
+
+        For instance:
+
+        #. ``foo/``: specifies an experiment name but no run name
+        #. ``/bar``: specifies a run name but no experiment name
+        #. ``foo``: specifies a run name but no experiment name
+        #. ``foo/bar``: specifies both experiment and run names
+        #. ``/``: does not specify experiment nor run name
+
+        If the run name is left empty then one is derived from either the ``m``
+        or ``script``, whichever is not null.
+
+        .. important::
+            Exactly one of ``m`` (main module) or ``script`` (path to script) must be provided.
+            If both or neither are provided, then this function throws.
+
+
+        Examples:
+        .. doctest::
+
+            >>> import os
+            >>> from torchx.components.structured_arg import StructuredNameArgument
+            >>> StructuredNameArgument.parse_from(name="foo/bar", script="bar/baz.py")
+            StructuredNameArgument(experiment_name='foo', run_name='bar')
+
+            >>> StructuredNameArgument.parse_from(name="foo/", script="bar/baz.py")
+            StructuredNameArgument(experiment_name='foo', run_name='baz')
+
+            >>> StructuredNameArgument.parse_from(name="/bar", script="bar/baz.py")
+            StructuredNameArgument(experiment_name='default-experiment', run_name='bar')
+
+            >>> StructuredNameArgument.parse_from(name="foobar", m="foo.bar")
+            StructuredNameArgument(experiment_name='default-experiment', run_name='foobar')
+
+            >>> StructuredNameArgument.parse_from(name="foo/bar", m="foo.bar.baz")
+            StructuredNameArgument(experiment_name='foo', run_name='bar')
+
+            >>> StructuredNameArgument.parse_from(name="foo/", m="foo.bar.baz")
+            StructuredNameArgument(experiment_name='foo', run_name='baz')
+
+            >>> StructuredNameArgument.parse_from(name="/bar", m="foo.bar.baz")
+            StructuredNameArgument(experiment_name='default-experiment', run_name='bar')
+
+            >>> StructuredNameArgument.parse_from(name="foo/bar")
+            Traceback (most recent call last):
+            ...
+            ValueError: No main module or script specified.
+
+
+        Arguments:
+            name: ``{EXPERIMENT_NAME}/{RUN_NAME}``, ``/{RUN_NAME}``, or ``{EXPERIMENT_NAME}/``, or ``{RUN_NAME}``
+            m: the main module (e.g. ``foo.bar.baz`` for ``foo/bar/baz.py``)
+            script: path to the main script
+
+
+        Raises:
+            ValueError: if both ``m`` and ``script`` are empty or both are non-empty.
+            ValueError: if the ``name`` does not contain the experiment/run name delimiter: ``/``.
+        """
+        if not m and not script:
+            raise ValueError(
+                "No main module or script specified. Specify either a main module or a script path"
+            )
+        if m and script:
+            raise ValueError(
+                "Both main module and script set. Specify exactly one of: main module or script, but not both"
+            )
+
+        run_name = ""
+        experiment_name = ""
+
+        delim_idx = name.find("/")
+        # just assume that name is the run_name (experiment name should default)
+        if delim_idx < 0:
+            run_name = name
+        elif delim_idx >= 0 and delim_idx < len(name) - 1:
+            # deal with:
+            #  1. /FOO (only run name)
+            #  2. FOO/BAR (both exp and run name)
+            #
+            # FOO/ (only exp name) will not enter this branch
+            # and end up getting an empty run_name (as declared above)
+            run_name = name[delim_idx + 1 :]
+
+        if delim_idx > 0:
+            experiment_name = name[:delim_idx]
+
+        if not run_name:
+            if m:  # use the last module name
+                run_name = m.rpartition(".")[2]
+            else:  # use script name w/ no extension
+                run_name = Path(none_throws(script)).stem
+        return StructuredNameArgument(
+            experiment_name or default_experiment_name, run_name
+        )
+
+
+@dataclass
+class StructuredJArgument:
+    nnodes: int
+    nproc_per_node: int
+
+    def __str__(self) -> str:
+        return f"{self.nnodes}x{self.nproc_per_node}"
+
+    @staticmethod
+    def parse_from(h: str, j: str) -> "StructuredJArgument":
+        """
+        Creates a :py:class:`StructuredJArgument` instance given the ``h`` (host) and ``j`` (nnodes x nproc_per_node)
+        component arguments.
+
+        If the host has GPUs and ``j`` only specifies nnodes (e.g. ``-j 2`` versus ``-j 2x8``), then
+        nproc_per_node is set equal to the number of GPUs on the host. If nproc_per_node was explicitly
+        specified, then it is honored even if it does not match the number of GPUs on the host.
+        However, a warning message is displayed reminding the user that there is a mismatch between
+        the GPU count on the host and the configured nproc_per_node.
+
+        Example (GPU):
+
+        .. doctest::
+
+            >>> from torchx.components.structured_arg import StructuredJArgument
+            >>> str(StructuredJArgument.parse_from(h="aws_p4d.24xlarge", j="2"))
+            '2x8'
+
+            >>> str(StructuredJArgument.parse_from(h="aws_p4d.24xlarge", j="2x4"))
+            '2x4'
+
+
+        For hosts with no GPU devices, one MUST specify nproc_per_node. Otherwise this function will
+        raise an Error.
+
+        Example (CPU or Trainium):
+
+        .. doctest::
+
+            >>> str(StructuredJArgument.parse_from(h="aws_trn1.32xl", j="2"))
+            Traceback (most recent call last):
+            ...
+            ValueError: nproc_per_node cannot be inferred from GPU count. `aws_trn1.32xl` is not a GPU instance. ...
+
+            >>> str(StructuredJArgument.parse_from(h="aws_trn1.32xl", j="2x16"))
+            '2x16'
+
+        """
+        nums = j.split("x")
+        num_gpus = specs.named_resources[h].gpu
+        if len(nums) == 1:  # -j 1
+            nnodes = int(nums[0])
+            # infer nproc_per_node from # of gpus in host
+
+            if num_gpus > 0:
+                nproc_per_node = num_gpus
+            else:
+                raise ValueError(
+                    f"nproc_per_node cannot be inferred from GPU count."
+                    f" `{h}` is not a GPU instance."
+                    f" You must specify `-j $NNODESx$NPROCS_PER_NODE` (e.g. `-j {nnodes}x8`)"
+                )
+
+        elif len(nums) == 2:  # -j 1x2
+            nnodes = int(nums[0])
+            nproc_per_node = int(nums[1])
+
+            if nproc_per_node != num_gpus:
+                warnings.warn(
+                    f"In `-j {j}` you specified nproc_per_node={nproc_per_node}"
+                    f" which does not equal the number of GPUs on a {h}: {num_gpus}."
+                    f" This may lead to under-utilization or an error."
+                    f" If this was intentional, ignore this warning."
+                    f" Otherwise set `-j {nnodes}` to auto-set nproc_per_node"
+                    f" to the number of GPUs on the host."
+                )
+        else:
+            raise ValueError(
+                f"Invalid format for `-j $NNODESx$NPROCS_PER_NODE` (e.g. `-j 1x8`). Given: {j}"
+            )
+
+        return StructuredJArgument(nnodes=nnodes, nproc_per_node=nproc_per_node)
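Pulling the two structured arguments together, a short sketch of their parsing behavior; the expected values follow directly from the doctests above (and assume `aws_p4d.24xlarge` is registered with 8 GPUs):

    # Sketch: parsing the -j and --name structured component arguments.
    from torchx.components.structured_arg import (
        StructuredJArgument,
        StructuredNameArgument,
    )

    # "-j 2" on an 8-GPU host expands to 2 nodes x 8 procs per node
    print(StructuredJArgument.parse_from(h="aws_p4d.24xlarge", j="2"))  # 2x8

    # the run name falls back to the script stem when {RUN_NAME} is omitted
    arg = StructuredNameArgument.parse_from(name="t5_modeling/", script="bar/baz.py")
    print(arg.experiment_name, arg.run_name)  # t5_modeling baz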
torchx/distributed/__init__.py
ADDED
@@ -0,0 +1,270 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Convenience methods to use ``torch.distributed``.
+"""
+
+import logging
+import os
+import warnings
+from contextlib import contextmanager
+from typing import Any, Iterator
+
+import torch
+import torch.distributed as dist
+from typing_extensions import Literal
+
+log: logging.Logger = logging.getLogger(__name__)
+
+
+def local_rank() -> int:
+    """
+    Returns the local rank (aka rank within the node) of this process.
+    Typically the local rank is used to set the CUDA device on the node.
+
+    .. warning::
+        This function only works correctly if the invoker of the program sets the ``LOCAL_RANK`` env var
+        or invokes the program with ``torchrun`` (aka ``torch.distributed.run``) or ``torchx``.
+        If ``LOCAL_RANK`` is not set or the process group is not initialized
+        then this function assumes that the process is not distributed and trivially returns 0.
+
+    """
+
+    if not dist.is_initialized():
+        return 0
+
+    if "LOCAL_RANK" not in os.environ:
+        warnings.warn(
+            "\n"
+            "==============================================================================================\n"
+            "`LOCAL_RANK` environment variable is not set. Will trivially return 0 for local_rank.\n"
+            " It is recommended to use torchrun/torchx to run your script or set the `LOCAL_RANK` manually.\n"
+            " For additional details see:\n"
+            "  1) https://pytorch.org/torchx/latest/components/distributed.html\n"
+            "  2) https://pytorch.org/docs/stable/elastic/run.html\n"
+            "=============================================================================================="
+        )
+        return 0
+    else:
+        return int(os.environ["LOCAL_RANK"])
+
+
+def local_cuda_device() -> torch.device:
+    """
+    Returns the CUDA device (as a ``torch.device``) based on the local rank.
+
+    See Also: :py:func:`local_rank`.
+    """
+    return torch.device(f"cuda:{local_rank()}")
+
+
+def rank() -> int:
+    """
+    A non-distributed-safe get_rank call. Unlike ``torch.distributed.get_rank()``
+    this method will not fail if being invoked from a non-distributed (e.g. process group not initialized)
+    context. Therefore, this method is safe to use in internal methods that may be used
+    in non-distributed contexts as well.
+
+    Returns:
+        If a process group has been initialized, returns the value returned by ``torch.distributed.get_rank()``.
+        Otherwise, returns 0 (trivial rank).
+
+    """
+    return dist.get_rank() if dist.is_initialized() else 0
+
+
+def world_size() -> int:
+    """
+    A non-distributed-safe get_world_size call. Unlike ``torch.distributed.get_world_size()``,
+    this method will not fail if being invoked from a non-distributed (e.g. process group not initialized)
+    context. Therefore, this method is safe to use in internal methods that may be used
+    in non-distributed contexts as well.
+
+    Returns:
+        If a process group has been initialized, returns the value returned by ``torch.distributed.get_world_size()``.
+        Otherwise, returns 1 (trivial world_size).
+
+    """
+    return dist.get_world_size() if dist.is_initialized() else 1
+
+
+def is_rank0() -> bool:
+    """
+    Returns ``True`` if the caller is rank 0 (in a distributed setting).
+    If no process group has been initialized, then this method assumes
+    that the caller is a single-process (aka not-distributed) and trivially returns ``True``.
+    That is, for a non-distributed job there is only one process and hence that process
+    is trivially rank 0.
+
+    .. note::
+        To initialize the process group prefer to use :py:func:`init_pg` over
+        ``torch.distributed.init_process_group()`` since the former can be called from
+        both distributed and non-distributed scripts.
+
+    """
+    return rank() == 0
+
+
+def is_local_rank0() -> bool:
+    """
+    Returns ``True`` if this process is local rank 0 and ``False`` otherwise.
+    Used to perform an action just once per node. Example:
+
+    .. code-block:: python
+
+        if is_local_rank0():
+            # download a file just once per node
+            download_file("s3://...")
+
+
+    """
+    return local_rank() == 0
+
+
+Backend = Literal["nccl", "gloo", "auto"]
+
+
+def init_pg(backend: Backend = "auto", **kwargs: Any) -> torch.device:
+    """
+    A convenience wrapper around ``torch.distributed.init_process_group()``
+    that makes initializing a trivial (single world_size) process group easy.
+
+    Useful when you want to make your code portable across launching with
+    simple python or with ``torchrun`` (aka ``torch.distributed.run``).
+
+    Usage:
+
+
+    .. doctest::
+
+        >>> from torchx.distributed import init_pg
+        >>> init_pg(backend="gloo")  # or nccl  # doctest: +SKIP
+        device(type='cpu')
+
+    The example above works to initialize a pytorch process group
+    for the trivial (``world_size = 1``) and distributed (``world_size > 1``)
+    cases without you having to write an explicit check with an if-else branch statement.
+
+    You can pass ``backend="auto"`` to have this function select ``"nccl"``
+    if there is a cuda device available, otherwise ``"gloo"`` (for CPU)
+
+
+    .. doctest::
+
+        >>> from torchx.distributed import init_pg
+        >>> device = init_pg(backend="auto")  # doctest: +SKIP
+
+
+    In the code above, ``device`` will be ``cuda:{LOCAL_RANK}`` if the host has CUDA devices (GPUs)
+    and ``cpu`` if not.
+
+    Returns:
+        The cuda device that this rank should be using or cpu device if ``backend="gloo"``
+        or if ``backend="auto"`` and no GPUs are available on the host.
+
+    """
+
+    if backend == "auto":
+        backend = (
+            "nccl"
+            if torch.cuda.is_available()  # returns True if gpu-torch was installed even on CPU host
+            and (
+                torch.cuda.device_count() > 0
+            )  # so need to check for CUDA devices explicitly
+            and dist.is_nccl_available()
+            else "gloo"
+        )
+
+    # this means that the script was launched as a single python process
+    # initialize a trivial process group
+    if not dist.is_torchelastic_launched():
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "0"  # port selection - selects free random port
+        dist.init_process_group(backend=backend, rank=0, world_size=1, **kwargs)
+    else:
+        dist.init_process_group(backend=backend, **kwargs)
+
+    if backend == "nccl":
+        return local_cuda_device()
+    else:
+        return torch.device("cpu")
+
+
+@contextmanager
+def on_rank0_first() -> Iterator[None]:
+    """
+    Runs the piece of code that is wrapped in this context manager
+    first on rank0 then on the rest of the ranks.
+
+    Example:
+
+    .. code-block:: python
+
+        import time
+        from torchx.distributed import on_rank0_first, rank
+
+        with on_rank0_first():
+            print(f"Running on rank {rank()} at {int(time.monotonic())}")
+            time.sleep(10)
+
+
+    Would print:
+
+    .. code-block::
+
+        Running on rank 0 at 12534774
+        Running on rank 1 at 12534784  # at least +10 seconds on the other ranks
+        Running on rank 2 at 12534784
+        ...
+
+    To run ONLY on rank0 use an if-statement as such:
+
+    .. code-block:: python
+
+        if is_rank0():
+            print(f"Running on rank {dist.get_rank()}")
+
+    The code above would only print once on rank 0.
+
+    """
+    if dist.is_initialized() and not is_rank0():
+        dist.barrier()
+
+    try:
+        yield
+    finally:
+        if dist.is_initialized() and is_rank0():
+            dist.barrier()
+
+
+@contextmanager
+def on_local_rank0_first() -> Iterator[None]:
+    """
+    Runs the piece of code that is wrapped in this context manager
+    first on local rank 0 then on the rest of the ranks.
+
+    The behavior is exactly the same as :py:func:`torchx.distributed.on_rank0_first`
+    except that the barrier is on each local rank on each node (versus a global barrier on rank0).
+
+    This is useful in situations where a node-local action (that would otherwise cause races)
+    needs to be done first from a representative worker on each node. For instance,
+    downloading a checkpoint file to a tmp dir on each node once, then having all the
+    workers read off the downloaded file.
+
+    .. note::
+        For actions that need to be run first at a job level
+        use :py:func:`torchx.distributed.on_rank0_first`
+
+    """
+    if dist.is_initialized() and not is_local_rank0():
+        dist.barrier()
+
+    try:
+        yield
+    finally:
+        if dist.is_initialized() and is_local_rank0():
+            dist.barrier()
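Taken together, the helpers above let a training script run unchanged under plain `python` and under `torchrun`. A minimal portable stub (the tensor math is a placeholder; only the torchx.distributed calls come from the diff above):

    # Sketch: portable across `python my_script.py` and `torchrun my_script.py`.
    import torch
    import torch.distributed as dist
    from torchx.distributed import init_pg, is_local_rank0, on_local_rank0_first, rank

    device = init_pg(backend="auto")  # trivial pg under python, elastic pg under torchrun

    with on_local_rank0_first():
        if is_local_rank0():
            pass  # e.g. download node-local data exactly once per node

    t = torch.ones(1, device=device)
    if dist.is_initialized():
        dist.all_reduce(t)  # sums across the world; a no-op sum when world_size=1
    print(f"rank={rank()} value={t.item()}")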
torchx/specs/__init__.py
CHANGED
@@ -14,6 +14,9 @@ import difflib
 from typing import Callable, Dict, Optional
 
 from torchx.specs.named_resources_aws import NAMED_RESOURCES as AWS_NAMED_RESOURCES
+from torchx.specs.named_resources_generic import (
+    NAMED_RESOURCES as GENERIC_NAMED_RESOURCES,
+)
 from torchx.util.entrypoints import load_group
 
 from .api import (  # noqa: F401 F403
@@ -55,12 +58,16 @@ GiB: int = 1024
 def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
     resource_methods = load_group("torchx.named_resources", default={})
     materialized_resources: Dict[str, Callable[[], Resource]] = {}
-
-    for name, resource in
+
+    for name, resource in {
+        **GENERIC_NAMED_RESOURCES,
+        **AWS_NAMED_RESOURCES,
+        **resource_methods,
+    }.items():
         materialized_resources[name] = resource
-
-        materialized_resources[resource_name] = resource_method
+
     materialized_resources["NULL"] = lambda: NULL_RESOURCE
+    materialized_resources["MISSING"] = lambda: NULL_RESOURCE
     return materialized_resources
 
 
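Note the merge order in `_load_named_resources`: with dict unpacking, later entries win, so entry-point-registered resources override the AWS set, which in turn overrides the new generic defaults. A tiny sketch of that precedence rule (the dict values are placeholders):

    # Sketch: "last one wins" semantics of {**a, **b, **c}.
    GENERIC = {"gpu.small": "generic-def"}
    AWS = {"gpu.small": "aws-def", "aws_p4d.24xlarge": "aws-def"}
    USER = {"gpu.small": "user-def"}  # e.g. via a torchx.named_resources entry point

    merged = {**GENERIC, **AWS, **USER}
    print(merged["gpu.small"])  # -> "user-def": user registrations shadow defaults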
torchx/specs/named_resources_generic.py
ADDED
@@ -0,0 +1,58 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Defines generic named resources that are not specific to any cloud provider's
+instance types. These generic named resources are meant to be used as
+default values for components and examples and are NOT meant to be used
+long term as the specific capabilities (e.g. number of cpu, gpu, memMB)
+are subject to change.
+
+.. note::
+    The named resources in this file DO NOT map device capabilities such as
+    special network interfaces (e.g. EFA devices on AWS).
+
+.. warning::
+    Do not use for launching applications that require specific capabilities
+    (e.g. needs exactly 4 x A100 GPUs with 40GB of memory connected with NVLink).
+
+    Different cloud providers offer different instance types, hence practically
+    speaking one should register their own named resources that accurately capture
+    the instances they have at their disposal rather than using these defaults long term.
+
+.. note::
+    The cpu/gpu/memory ratios in these default resources are based on current
+    HW trends and do not map exactly to a particular instance type!
+
+.. warning::
+    The specific capabilities of these default resources are subject to change
+    at any time based on current hardware spec trends.
+    Therefore, the user should NEVER assume that the specific number of cpu, gpu, and memMB
+    will always remain the same. For instance, never assume that ``gpu.small`` will always
+    have 8 cpus.
+
+"""
+from typing import Callable, Mapping
+
+from torchx.specs.api import Resource
+
+GiB: int = 1024
+
+NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
+    # typically system CPU memory is >= GPU memory (most modern GPUs have 32GB device mem)
+    # most cloud providers offer 1, 2, 4, 8 GPUs per host
+    "gpu.small": lambda: Resource(cpu=8, gpu=1, memMB=32 * GiB),
+    "gpu.medium": lambda: Resource(cpu=16, gpu=2, memMB=64 * GiB),
+    "gpu.large": lambda: Resource(cpu=32, gpu=4, memMB=128 * GiB),
+    "gpu.xlarge": lambda: Resource(cpu=64, gpu=8, memMB=256 * GiB),
+    # for cpu defaults - based on AWS's T2 (general purpose) instance type
+    "cpu.nano": lambda: Resource(cpu=1, gpu=0, memMB=512),
+    "cpu.micro": lambda: Resource(cpu=1, gpu=0, memMB=1 * GiB),
+    "cpu.small": lambda: Resource(cpu=1, gpu=0, memMB=2 * GiB),
+    "cpu.medium": lambda: Resource(cpu=2, gpu=0, memMB=4 * GiB),
+    "cpu.large": lambda: Resource(cpu=2, gpu=0, memMB=8 * GiB),
+    "cpu.xlarge": lambda: Resource(cpu=8, gpu=0, memMB=32 * GiB),
+}
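With these generic resources merged into the registry (see the torchx/specs/__init__.py change above), they resolve by name like any other named resource. A quick sketch; the printed values simply restate the table above:

    # Sketch: resolving a generic named resource.
    from torchx import specs

    res = specs.named_resources["gpu.small"]
    print(res.cpu, res.gpu, res.memMB)  # -> 8 1 32768 (i.e. 32 GiB)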
torchx/tracker/mlflow.py
ADDED
@@ -0,0 +1,342 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import dataclasses
+import os
+import socket
+from getpass import getuser
+from logging import getLogger, Logger
+from pathlib import Path
+from tempfile import gettempdir
+from typing import Any, Dict, Iterable, Mapping, Optional, Sequence
+
+import mlflow
+from mlflow import MlflowClient
+from mlflow.entities import Experiment, Run
+
+from torchx.distributed import on_rank0_first
+from torchx.runner.config import get_configs
+from torchx.tracker.api import Lineage, TrackerArtifact, TrackerBase, TrackerSource
+
+log: Logger = getLogger(__name__)
+TAG_ARTIFACT_MD_PREFIX = "torchx.artifact.metadata"
+
+
+class MLflowTracker(TrackerBase):
+    """
+    An implementation of a ``Tracker`` that uses mlflow as the backend.
+    Don't forget to call the ``close()`` method for orderly shutdown.
+    This ensures that the run state in mlflow is properly marked as ``FINISHED``,
+    otherwise the run will remain in ``UNFINISHED`` status.
+
+    .. important::
+        TorchX's run_id is used as mlflow's run_name! The run_id in TorchX
+        is the job name. The job name in TorchX is made unique by adding
+        a short random hash to the user-provided job name prefix. This is
+        done because certain job schedulers supported by TorchX require
+        that the job name on the submitted job definition is globally unique
+        (rather than the scheduler returning a unique job id as the return result
+        of the job submission API).
+
+    .. warning::
+        APIs on this class may only be called with the same ``run_name``.
+        Typically the user does not have to worry about manually setting
+        the run_name as it is picked up by default from the environment variable
+        ``TORCHX_APP_NAME``.
+
+    """
+
+    def __init__(
+        self,
+        experiment_name: Optional[str] = None,
+        tracking_uri: str = f"file://{Path(gettempdir()) / 'torchx' / 'mlruns'}",
+        artifact_location: Optional[str] = None,
+    ) -> None:
+        if experiment_name is None:
+            experiment_name = self.default_experiment_name()
+
+        self.tracking_uri = tracking_uri
+        mlflow.set_tracking_uri(tracking_uri)
+        log.info(
+            f"MLflow tracking_uri={tracking_uri}, artifact_location={artifact_location}"
+        )
+        with on_rank0_first():
+            existing_experiment = mlflow.get_experiment_by_name(experiment_name)
+            if existing_experiment:
+                self.experiment: Experiment = existing_experiment
+                log.info(
+                    f"Found existing experiment `{experiment_name}` (id={self.experiment_id})"
+                )
+            else:
+                experiment_id = mlflow.create_experiment(
+                    name=experiment_name,
+                    artifact_location=artifact_location,
+                )
+                self.experiment = mlflow.get_experiment(experiment_id)
+                log.info(
+                    f"Created new experiment `{experiment_name}` (id={experiment_id})"
+                )
+
+    @staticmethod
+    def default_experiment_name() -> str:
+        return f"default-experiment/{getuser()}/{socket.getfqdn()}"
+
+    @property
+    def experiment_id(self) -> str:
+        return self.experiment.experiment_id
+
+    @property
+    def experiment_name(self) -> str:
+        return self.experiment.name
+
+    @property
+    def artifact_location(self) -> str:
+        return self.experiment.artifact_location
+
+    def get_run(self, run_name: str) -> Run:
+        """
+        Gets mlflow's ``Run`` object for the given ``run_name`` in the current experiment.
+        If no such run exists, this method creates a new run under
+        this experiment and starts the run so that subsequent calls to
+        mlflow log metadata, metrics, and artifacts to the newly created run.
+
+        .. warning::
+            This method should only be called with the same run_name!
+            This is because mlflow APIs work by setting an "active run"
+            against which subsequent mlflow logging APIs are made
+            (the current active run in the stack). If you call ``mlflow.start_run()`` directly
+            or pass different run names, then you may be logging into two different
+            mlflow runs from the same job!
+
+        Args:
+            run_name: equal to torchx's run_id
+
+        Returns: mlflow's ``Run`` object for the ``run_name``
+
+        """
+
+        active_run = mlflow.active_run()
+
+        if active_run is None:
+            search_result = mlflow.search_runs(
+                experiment_ids=[self.experiment_id],
+                output_format="list",
+                filter_string=f"tags.`mlflow.runName` = '{run_name}'",
+            )
+            if not search_result:
+                return mlflow.start_run(
+                    experiment_id=self.experiment_id, run_name=run_name
+                )
+            elif len(search_result) == 1:
+                return search_result[0]
+            else:  # len(search_result) > 1
+                raise RuntimeError(
+                    f"More than 1 run found for run_name `{run_name}` in experiment `{self.experiment_name}`."
+                    f" Did you manually create runs with the same name under this experiment?"
+                    f" Remove duplicate run names and try again"
+                )
+        else:
+            # need to query mlflow again so that the run reflects any newly written logs
+            return mlflow.get_run(active_run.info.run_id)
+
+    def get_run_id(self, run_name: str) -> str:
+        """
+        Gets the mlflow run's run_id for the given ``run_name`` and additionally sets
+        this run as the active run. Hence this method has a side-effect where all subsequent
+        calls to mlflow log APIs are against the run for the given ``run_name``.
+        """
+        return self.get_run(run_name).info.run_id
+
+    def close(self) -> None:
+        mlflow.end_run()
+
+    def add_artifact(
+        self,
+        run_id: str,
+        name: str,
+        path: str,
+        metadata: Optional[Mapping[str, object]] = None,
+    ) -> None:
+        self.get_run(run_id)
+        # stores the artifact in {artifact_location}/{name} (e.g. s3://bucket/prefix/{name})
+        mlflow.log_artifact(local_path=path, artifact_path=name)
+
+        # add artifact metadata with torchx.artifact.metadata.{name}.* tag prefix
+        if metadata:
+            mlflow.set_tags(
+                tags={
+                    f"{TAG_ARTIFACT_MD_PREFIX}.{name}.{k}": v
+                    for k, v in metadata.items()
+                }
+            )
+
+    def artifacts(self, run_id: str) -> Mapping[str, TrackerArtifact]:
+        artifacts: Dict[str, TrackerArtifact] = {}
+        mlflow_client: MlflowClient = MlflowClient(self.tracking_uri)
+
+        def get_artifacts(path: Optional[str] = None) -> None:
+            for artifact_info in mlflow_client.list_artifacts(
+                self.get_run(run_id).info.run_id, path=path
+            ):
+                if artifact_info.is_dir:
+                    get_artifacts(path=artifact_info.path)
+                else:
+                    # we stored the artifact using the name as the path
+                    # so path should never be `None` when we get to this point
+                    # (e.g. the root of `artifact_location` will only have directories
+                    # where the directory names are the artifact names)
+                    name = path or "<SHOULD_NOT_HAPPEN>"
+
+                    # artifact metadata is stored as run tags with `torchx.artifact.metadata.*` prefix
+                    tag_prefix = f"{TAG_ARTIFACT_MD_PREFIX}.{name}."
+                    metadata = {
+                        # k.removeprefix() only avail in python 3.9+
+                        k[len(tag_prefix) :]: v
+                        for k, v in self.metadata(run_id).items()
+                        if k.startswith(tag_prefix)
+                    }
+
+                    # add some additional metadata about the artifact
+                    metadata["mlflow.file_size"] = artifact_info.file_size
+
+                    artifacts[name] = TrackerArtifact(
+                        name=name,
+                        path=f"{self.artifact_location}/{artifact_info.path}",
+                        metadata=metadata,
+                    )
+
+        get_artifacts()
+        return artifacts
+
+    def add_metadata(self, run_id: str, **kwargs: object) -> None:
+        self.get_run(run_id)
+        mlflow.set_tags(tags={k: v for k, v in kwargs.items()})
+
+    def metadata(self, run_id: str) -> Mapping[str, object]:
+        return self.get_run(run_id).data.tags
+
+    def run_ids(self, **kwargs: str) -> Iterable[str]:
+        runs = mlflow.search_runs(
+            experiment_ids=[self.experiment_id], output_format="list"
+        )
+        return [r.info.run_name for r in runs]
+
+    def log_params_flat(
+        self, run_name: str, cfg: Any, key: str = ""  # pyre-ignore[2]
+    ) -> None:
+        """
+        Designed to be primarily used with hydra-style config objects (e.g. dataclasses),
+        logs the given ``cfg``, which is one of: ``@dataclass``, ``Sequence`` (e.g. ``list``, ``tuple``, ``set``),
+        or ``Mapping`` (e.g. ``dict``), where the fields of ``cfg`` are flattened recursively and logged as
+        the run's ``Parameter`` in mlflow.
+
+        For example if ``cfg`` is:
+
+        .. code-block:: python
+
+            @dataclass
+            class Config2:
+                foo: str = "bar"
+
+            @dataclass
+            class Config:
+                i: int = 1
+                f: float = 2.1
+                s: str = "string"
+                l: List[str] = field(default_factory=lambda: ["a", "b", "c"])
+                cfg_list: List[Config2] = field(default_factory=lambda: [Config2(foo="hello"), Config2(foo="world")])
+                cfg2: Config2 = Config2()
+
+
+        Then this function logs the following parameters
+
+        .. code-block::
+
+            i: "1"
+            f: "2.1"
+            s: "string"
+            l: ["a", "b", "c"]
+            cfg_list._0.foo = "hello"
+            cfg_list._1.foo = "world"
+            cfg2.foo = "bar"
+
+        As shown above, primitive sequence containers are logged directly (e.g. ``l: ["a", "b", "c"]``)
+        whereas nested sequence containers will be logged per element where the key is suffixed with
+        ``_{INDEX}`` (e.g. ``cfg_list._0.foo = "hello"``).
+
+        """
+        if dataclasses.is_dataclass(cfg):
+            cfg = dataclasses.asdict(cfg)
+        self.get_run(run_name)
+
+        def is_primitive(v: Any) -> bool:  # pyre-ignore[2]
+            return isinstance(v, (str, int, float, bool))
+
+        key_prefix = f"{key}." if key else ""
+
+        if not cfg:
+            # empty container; log as is
+            mlflow.log_param(key, cfg)
+        else:
+            # non-empty container; check types
+            if isinstance(cfg, (Sequence, set)):
+                # assume list/set elements are homogeneous
+                # need only check first element for type
+                elem = next(iter(cfg))
+                if is_primitive(elem):
+                    mlflow.log_param(key, cfg)
+                else:
+                    for i, e in enumerate(cfg):
+                        self.log_params_flat(run_name, e, f"{key_prefix}_{i}")
+            elif isinstance(cfg, Mapping):
+                for k, v in cfg.items():
+                    if is_primitive(v):
+                        mlflow.log_param(f"{key_prefix}{k}", v)
+                    else:
+                        self.log_params_flat(run_name, v, f"{key_prefix}{k}")
+
+    def add_source(
+        self, run_id: str, source_id: str, artifact_name: Optional[str] = None
+    ) -> None:
+        raise NotImplementedError(
+            f"Job's tracker sources is currently unsupported for {self.__class__.__qualname__}"
+        )
+
+    def sources(
+        self, run_id: str, artifact_name: Optional[str] = None
+    ) -> Iterable[TrackerSource]:
+        raise NotImplementedError(
+            f"Job's tracker sources is currently unsupported for {self.__class__.__qualname__}"
+        )
+
+    def lineage(self, run_id: str) -> Lineage:
+        raise NotImplementedError(
+            f"Job's lineage is currently unsupported for {self.__class__.__qualname__}"
+        )
+
+
+def create_tracker(config: str) -> MLflowTracker:
+    ctor_args = get_configs(
+        prefix="tracker",
+        name="mlflow",
+        dirs=[config],
+    )
+
+    # remove "config" key since that one is reserved for torchx.tracker usage
+    ctor_args.pop("config", None)
+
+    # pass configs read from .torchxconfig [tracker:mlflow] section as kwargs
+    # get the experiment name from an env var (set in torchx.components.dist:spmd)
+    # if no such env var exists, then default the experiment_name to the one
+    # specified in .torchxconfig
+    return MLflowTracker(
+        experiment_name=os.getenv(
+            "TORCHX_TRACKING_EXPERIMENT_NAME",
+            default=ctor_args.pop("experiment_name", None),
+        ),
+        **ctor_args,
+    )
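A hedged sketch of using the new tracker directly, outside the `.torchxconfig`-driven `create_tracker` path. The tracking URI and run name below are illustrative placeholders; per the class docstring the run name would normally come from `TORCHX_APP_NAME`:

    # Sketch: direct use of MLflowTracker; all values are placeholders.
    from torchx.tracker.mlflow import MLflowTracker

    tracker = MLflowTracker(
        experiment_name="my_experiment",
        tracking_uri="file:///tmp/torchx/mlruns",
    )
    run_name = "trial_1-abc123"  # hypothetical torchx run_id (name prefix + hash)
    tracker.add_metadata(run_name, framework="pytorch")

    # nested mapping flattens to params: lr=0.01, optim.name=adam
    tracker.log_params_flat(run_name, {"lr": 0.01, "optim": {"name": "adam"}})

    print(tracker.metadata(run_name))  # run tags, incl. framework=pytorch
    tracker.close()  # marks the mlflow run FINISHED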
torchx/util/strings.py
CHANGED
@@ -10,7 +10,7 @@ import re
 def normalize_str(data: str) -> str:
     """
     Invokes ``lower`` on the string and removes all
-    characters that do not satisfy ``[a-z0-9]`` pattern.
+    characters that do not satisfy ``[a-z0-9\\-]`` pattern.
     This method is mostly used to make sure kubernetes and gcp_batch scheduler gets
     the job name that does not violate its restrictions.
     """
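Only the docstring changes here; the `\\-` in the pattern documents that the implementation already preserves hyphens. A sketch of the documented behavior:

    # Sketch: normalize_str per the corrected docstring.
    from torchx.util.strings import normalize_str

    print(normalize_str("My_Job-1"))  # -> "myjob-1" (lowercased, "_" dropped, "-" kept)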
{torchx_nightly-2023.3.21.dist-info → torchx_nightly-2023.3.22.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: torchx-nightly
-Version: 2023.3.21
+Version: 2023.3.22
 Summary: TorchX SDK and Components
 Home-page: https://github.com/pytorch/torchx
 Author: TorchX Devs
@@ -42,6 +42,7 @@ Requires-Dist: google-cloud-runtimeconfig (>=0.33.2) ; extra == 'dev'
 Requires-Dist: hydra-core ; extra == 'dev'
 Requires-Dist: ipython ; extra == 'dev'
 Requires-Dist: kfp (==1.8.9) ; extra == 'dev'
+Requires-Dist: mlflow-skinny ; extra == 'dev'
 Requires-Dist: moto (==4.1.3) ; extra == 'dev'
 Requires-Dist: pyre-extensions ; extra == 'dev'
 Requires-Dist: pyre-check ; extra == 'dev'
{torchx_nightly-2023.3.21.dist-info → torchx_nightly-2023.3.22.dist-info}/RECORD
CHANGED
@@ -24,15 +24,17 @@ torchx/cli/colors.py,sha256=bVN_jEDwLgvypnDMeCHKn0q0ZDDhQjBJnyVfZHAE6nc,553
 torchx/cli/main.py,sha256=ysAQh0vPn0hC3JeVzRP0i7-E6dTO2D4rLN2B5Ok3abw,3442
 torchx/components/__init__.py,sha256=6-TQ4SY-Tn56os_1lOs_HMabOoE7gkkud_8e1BgvfJw,12106
 torchx/components/component_test_base.py,sha256=eKOwBp5cRgiA4FgZd_FCvyJ-ppv2v3JN9AGXnaSK_Cw,4135
-torchx/components/dist.py,sha256=
+torchx/components/dist.py,sha256=FA1Wxxdo4qZ1_zd2Y20JpHog2ZjEGCkjN0k7dz5ZLvE,14075
 torchx/components/interpret.py,sha256=g8gkKdDJvsBfX1ZrpVT7n2bMEtmwRV_1AqDyAnnQ_aA,697
 torchx/components/metrics.py,sha256=1gbp8BfzZWGa7PD1db5vRADlONzmae4qSBUUdCWayr0,2814
 torchx/components/serve.py,sha256=9RlpwlU2KOC7sMOZBeYwUpJIKDCXrU8xNo1SH-AT3fc,2141
+torchx/components/structured_arg.py,sha256=uavcUeFDRnMP7cWAqcxR3ujJYi6JEsClz0_Rd4Dgxj4,9542
 torchx/components/train.py,sha256=vtrQXRcD7bIcbb3lSeyD9BBlIe1mv1WNW6rnLK9R0Mw,1259
 torchx/components/utils.py,sha256=m7mFe6du2AMHpViwcW9dF8rr_twQB6KHQuEzJyHwBXw,9025
 torchx/components/integration_tests/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
 torchx/components/integration_tests/component_provider.py,sha256=fcdVWJt6dewv16qHOLl4PKQUK1PSfkPWy7d4dKMqZmo,3925
 torchx/components/integration_tests/integ_tests.py,sha256=OVgRvGrLWhDUNlqbbYj90ukGmkAwka2KubCWUR8pC7Y,5150
+torchx/distributed/__init__.py,sha256=OAO1CIwVBOaclzbp2NjH_SMBq2WlK7aE9NVlNmDtVlQ,8786
 torchx/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 torchx/examples/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 torchx/examples/apps/datapreproc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -78,12 +80,13 @@ torchx/schedulers/streams.py,sha256=ObaKwEEcnsjrPyc6VZOp8cgZ_f2RFextAxeISxZUWeQ,
 torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
 torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
 torchx/schedulers/ray/ray_driver.py,sha256=0DL8Ad_hire-WgH8ZEYx1Q-mI2SUfZDk-6_6PICk8OQ,12282
-torchx/specs/__init__.py,sha256=
+torchx/specs/__init__.py,sha256=fSA89Y0ZpdZLJmhIfEKNbjNNi6fbDR9k1bpIM7Xm7xo,5462
 torchx/specs/api.py,sha256=vFtvhYW18HvWzET8ob8ONNRC1MCuYwLMXohWDy0LbZI,33798
 torchx/specs/builders.py,sha256=dsKa80PD-cuIjXhFF2TsEY0eL8S01wux8aZaQrURIEE,8512
 torchx/specs/file_linter.py,sha256=LREWELpHJyE7YN3rc5ixf2ZydWFU9dlcSy5gGqdB5rA,11714
 torchx/specs/finder.py,sha256=RJI0PkG69esuVzhCp4w6Lotu2tSzIRh6PhWemCSQR7I,16234
 torchx/specs/named_resources_aws.py,sha256=6ID0jOGi5HVXn9BLylzCMD-SUQtxGeBHS5zU8PCten4,6361
+torchx/specs/named_resources_generic.py,sha256=_xz0cRjy3fz-CVtX9G_MY7f3NX6n3AkP3xzAkuDevwk,2631
 torchx/specs/test/components/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
 torchx/specs/test/components/a/__init__.py,sha256=T7exlQ47Fak5ajCEGPg6_yOfChJCWpIMhWBmSVUnlrQ,546
 torchx/specs/test/components/a/b/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
@@ -92,6 +95,7 @@ torchx/specs/test/components/c/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFR
 torchx/specs/test/components/c/d.py,sha256=RH07jjo6uvFbzIaNFnAwmD_h24cEsT8kyZDTN-ezFio,531
 torchx/tracker/__init__.py,sha256=kaynfAwMYtdkkvXNrb4Rmin2mSd5vi9mg25iz1e0TJ8,4200
 torchx/tracker/api.py,sha256=qIgoHNlc_RsQbQdcHF8GDrKby_vLlZg84Tn6VfONrY4,11284
+torchx/tracker/mlflow.py,sha256=P_mj7Yi-bZc7QOZ-6PJW4FHeWWGX0rjauNK0zXC25ig,13221
 torchx/tracker/backend/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
 torchx/tracker/backend/fsspec.py,sha256=JpSioMgn54mrxqqpY0kw5Gudqx9hhxkgDLaOFSEP2Ko,10425
 torchx/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -99,15 +103,15 @@ torchx/util/datetime.py,sha256=e-sO5Wjx1Utpln14C3qfJHl4v3KM-SMnn11hSyvkqFY,390
 torchx/util/entrypoints.py,sha256=C4A7cF1tPLlfyYWyZ7uZEtsKeuoOoLbMv0sOSxLhXs4,2710
 torchx/util/io.py,sha256=sxb6KI42Lq6n5z6_-YKW_mAhgPdC6CxzexlMyGheWSc,1792
 torchx/util/shlex.py,sha256=KzyWektMeU3oXS3Z5mFkNSPLItBTszVcvQ3EYfOMUYA,448
-torchx/util/strings.py,sha256=
+torchx/util/strings.py,sha256=7CZe5WKHa7IQ6DuJCYeJ5FapUC4Fd1OGeq1yZAmjluw,663
 torchx/util/types.py,sha256=6ASuDKGO91UU3DCSuWhPX_C03341tApLCQEByUz8xpY,7016
 torchx/workspace/__init__.py,sha256=KbGEzJqqXaIxALm_EQO64aw-fE7MeDMFXcpU1mY650I,783
 torchx/workspace/api.py,sha256=Ej6DR__mNWaVyZgoVNAAOloDy1kTD5X1jz7pRtoVf80,5464
 torchx/workspace/dir_workspace.py,sha256=Fz-hKIx0KN8iJf2BsthNj0NvTkWlxP6WFsElPs_BaT0,2253
 torchx/workspace/docker_workspace.py,sha256=Yd8ut26bNfjyJQnmH8ANOrflfr-4VKcnOrIjbi_XIUY,9208
-torchx_nightly-2023.3.
-torchx_nightly-2023.3.
-torchx_nightly-2023.3.
-torchx_nightly-2023.3.
-torchx_nightly-2023.3.
-torchx_nightly-2023.3.
+torchx_nightly-2023.3.22.dist-info/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+torchx_nightly-2023.3.22.dist-info/METADATA,sha256=UbHY2VUh7IvI4ty1vY1HD-0V8LCtrIakJVG77fVzEaU,5422
+torchx_nightly-2023.3.22.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+torchx_nightly-2023.3.22.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
+torchx_nightly-2023.3.22.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2023.3.22.dist-info/RECORD,,
The remaining dist-info files (LICENSE, WHEEL, entry_points.txt, top_level.txt) are unchanged.