torchx-nightly 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of torchx-nightly has been flagged as potentially problematic.
- torchx/__init__.py +2 -0
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/apps/serve/serve.py +2 -0
- torchx/apps/utils/booth_main.py +2 -0
- torchx/apps/utils/copy_main.py +2 -0
- torchx/apps/utils/process_monitor.py +2 -0
- torchx/cli/__init__.py +2 -0
- torchx/cli/argparse_util.py +38 -3
- torchx/cli/cmd_base.py +2 -0
- torchx/cli/cmd_cancel.py +2 -0
- torchx/cli/cmd_configure.py +2 -0
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_describe.py +2 -0
- torchx/cli/cmd_list.py +8 -4
- torchx/cli/cmd_log.py +6 -24
- torchx/cli/cmd_run.py +269 -45
- torchx/cli/cmd_runopts.py +2 -0
- torchx/cli/cmd_status.py +12 -1
- torchx/cli/cmd_tracker.py +3 -1
- torchx/cli/colors.py +2 -0
- torchx/cli/main.py +4 -0
- torchx/components/__init__.py +3 -8
- torchx/components/component_test_base.py +2 -0
- torchx/components/dist.py +18 -7
- torchx/components/integration_tests/component_provider.py +4 -2
- torchx/components/integration_tests/integ_tests.py +2 -0
- torchx/components/serve.py +2 -0
- torchx/components/structured_arg.py +4 -3
- torchx/components/utils.py +15 -4
- torchx/distributed/__init__.py +2 -4
- torchx/examples/apps/datapreproc/datapreproc.py +2 -0
- torchx/examples/apps/lightning/data.py +5 -3
- torchx/examples/apps/lightning/model.py +7 -6
- torchx/examples/apps/lightning/profiler.py +7 -4
- torchx/examples/apps/lightning/train.py +11 -2
- torchx/examples/torchx_out_of_sync_training.py +11 -0
- torchx/notebook.py +2 -0
- torchx/runner/__init__.py +2 -0
- torchx/runner/api.py +167 -60
- torchx/runner/config.py +43 -10
- torchx/runner/events/__init__.py +57 -13
- torchx/runner/events/api.py +14 -3
- torchx/runner/events/handlers.py +2 -0
- torchx/runtime/tracking/__init__.py +2 -0
- torchx/runtime/tracking/api.py +2 -0
- torchx/schedulers/__init__.py +16 -15
- torchx/schedulers/api.py +70 -14
- torchx/schedulers/aws_batch_scheduler.py +75 -6
- torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
- torchx/schedulers/devices.py +17 -4
- torchx/schedulers/docker_scheduler.py +43 -11
- torchx/schedulers/ids.py +29 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +9 -7
- torchx/schedulers/kubernetes_scheduler.py +383 -38
- torchx/schedulers/local_scheduler.py +100 -27
- torchx/schedulers/lsf_scheduler.py +5 -4
- torchx/schedulers/slurm_scheduler.py +336 -20
- torchx/schedulers/streams.py +2 -0
- torchx/specs/__init__.py +89 -12
- torchx/specs/api.py +418 -30
- torchx/specs/builders.py +176 -38
- torchx/specs/file_linter.py +143 -57
- torchx/specs/finder.py +68 -28
- torchx/specs/named_resources_aws.py +181 -4
- torchx/specs/named_resources_generic.py +2 -0
- torchx/specs/overlays.py +106 -0
- torchx/specs/test/components/__init__.py +2 -0
- torchx/specs/test/components/a/__init__.py +2 -0
- torchx/specs/test/components/a/b/__init__.py +2 -0
- torchx/specs/test/components/a/b/c.py +2 -0
- torchx/specs/test/components/c/__init__.py +2 -0
- torchx/specs/test/components/c/d.py +2 -0
- torchx/tracker/__init__.py +12 -6
- torchx/tracker/api.py +15 -18
- torchx/tracker/backend/fsspec.py +2 -0
- torchx/util/cuda.py +2 -0
- torchx/util/datetime.py +2 -0
- torchx/util/entrypoints.py +39 -15
- torchx/util/io.py +2 -0
- torchx/util/log_tee_helpers.py +210 -0
- torchx/util/modules.py +65 -0
- torchx/util/session.py +42 -0
- torchx/util/shlex.py +2 -0
- torchx/util/strings.py +3 -1
- torchx/util/types.py +90 -29
- torchx/version.py +4 -2
- torchx/workspace/__init__.py +2 -0
- torchx/workspace/api.py +136 -6
- torchx/workspace/dir_workspace.py +2 -0
- torchx/workspace/docker_workspace.py +30 -2
- torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
- torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
- torchx/pipelines/kfp/__init__.py +0 -28
- torchx/pipelines/kfp/adapter.py +0 -271
- torchx/pipelines/kfp/version.py +0 -17
- torchx/schedulers/gcp_batch_scheduler.py +0 -487
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -453
- torchx_nightly-2024.1.6.dist-info/METADATA +0 -176
- torchx_nightly-2024.1.6.dist-info/RECORD +0 -118
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
torchx/cli/cmd_run.py
CHANGED
@@ -4,23 +4,29 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 import argparse
+import json
 import logging
 import os
 import sys
 import threading
-from
+from collections import Counter
+from dataclasses import asdict, dataclass, field, fields, MISSING as DATACLASS_MISSING
+from itertools import groupby
 from pathlib import Path
 from pprint import pformat
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import torchx.specs as specs
-from torchx.cli.argparse_util import torchxconfig_run
+from torchx.cli.argparse_util import ArgOnceAction, torchxconfig_run
 from torchx.cli.cmd_base import SubCommand
 from torchx.cli.cmd_log import get_logs
 from torchx.runner import config, get_runner, Runner
 from torchx.runner.config import load_sections
 from torchx.schedulers import get_default_scheduler_name, get_scheduler_factories
+from torchx.specs import CfgVal, Workspace
 from torchx.specs.finder import (
     _Component,
     ComponentNotFoundException,
@@ -28,6 +34,7 @@ from torchx.specs.finder import (
     get_builtin_source,
     get_components,
 )
+from torchx.util.log_tee_helpers import tee_logs
 from torchx.util.types import none_throws
 
 
@@ -35,10 +42,81 @@ MISSING_COMPONENT_ERROR_MSG = (
     "missing component name, either provide it from the CLI or in .torchxconfig"
 )
 
+LOCAL_SCHEDULER_WARNING_MSG = (
+    "`local` scheduler is deprecated and will be"
+    " removed in the near future,"
+    " please use other variants of the local scheduler"
+    " (e.g. `local_cwd`)"
+)
 
 logger: logging.Logger = logging.getLogger(__name__)
 
 
+@dataclass
+class TorchXRunArgs:
+    component_name: str
+    scheduler: str
+    scheduler_args: Dict[str, Any]
+    scheduler_cfg: Dict[str, CfgVal] = field(default_factory=dict)
+    dryrun: bool = False
+    wait: bool = False
+    log: bool = False
+    workspace: str = ""
+    parent_run_id: Optional[str] = None
+    tee_logs: bool = False
+    component_args: Dict[str, Any] = field(default_factory=dict)
+    component_args_str: List[str] = field(default_factory=list)
+
+
+def torchx_run_args_from_json(json_data: Dict[str, Any]) -> TorchXRunArgs:
+    all_fields = [f.name for f in fields(TorchXRunArgs)]
+    required_fields = {
+        f.name
+        for f in fields(TorchXRunArgs)
+        if f.default is DATACLASS_MISSING and f.default_factory is DATACLASS_MISSING
+    }
+    missing_fields = required_fields - json_data.keys()
+    if missing_fields:
+        raise ValueError(
+            f"The following required fields are missing: {', '.join(missing_fields)}"
+        )
+
+    # Fail if there are fields that aren't part of the run command
+    filtered_json_data = {k: v for k, v in json_data.items() if k in all_fields}
+    extra_fields = set(json_data.keys()) - set(all_fields)
+    if extra_fields:
+        raise ValueError(
+            f"The following fields are not part of the run command: {', '.join(extra_fields)}.",
+            "Please check your JSON and try launching again.",
+        )
+
+    torchx_args = TorchXRunArgs(**filtered_json_data)
+    if torchx_args.workspace == "":
+        torchx_args.workspace = f"{Path.cwd()}"
+    return torchx_args
+
+
+def torchx_run_args_from_argparse(
+    args: argparse.Namespace,
+    component_name: str,
+    component_args: List[str],
+    scheduler_cfg: Dict[str, CfgVal],
+) -> TorchXRunArgs:
+    return TorchXRunArgs(
+        component_name=component_name,
+        scheduler=args.scheduler,
+        scheduler_args={},
+        scheduler_cfg=scheduler_cfg,
+        dryrun=args.dryrun,
+        wait=args.wait,
+        log=args.log,
+        workspace=args.workspace,
+        parent_run_id=args.parent_run_id,
+        tee_logs=args.tee_logs,
+        component_args_str=component_args,
+    )
+
+
 def _parse_component_name_and_args(
     component_name_and_args: List[str],
     subparser: argparse.ArgumentParser,
@@ -82,6 +160,20 @@ def _parse_component_name_and_args(
     component = args[0]
     component_args = args[1:]
 
+    # Error if there are repeated command line arguments each group of arguments,
+    # where the groups are separated by "--"
+    arg_groups = [list(g) for _, g in groupby(component_args, key=lambda x: x == "--")]
+    for arg_group in arg_groups:
+        all_options = [
+            x
+            for x in arg_group
+            if x.startswith("-") and x.strip() != "-" and x.strip() != "--"
+        ]
+        arg_count = Counter(all_options)
+        duplicates = [arg for arg, count in arg_count.items() if count > 1]
+        if len(duplicates) > 0:
+            subparser.error(f"Repeated Command Line Arguments: {duplicates}")
+
     if not component:
         subparser.error(MISSING_COMPONENT_ERROR_MSG)
 
@@ -114,6 +206,7 @@ class CmdBuiltins(SubCommand):
 class CmdRun(SubCommand):
     def __init__(self) -> None:
         self._subparser: Optional[argparse.ArgumentParser] = None
+        self._stdin_data_json: Optional[Dict[str, Any]] = None
 
     def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
         scheduler_names = get_scheduler_factories().keys()
@@ -131,6 +224,7 @@ class CmdRun(SubCommand):
             "-cfg",
             "--scheduler_args",
             type=str,
+            action=ArgOnceAction,
             help="Arguments to pass to the scheduler (Ex:`cluster=foo,user=bar`)."
             " For a list of scheduler run options run: `torchx runopts`",
         )
@@ -156,46 +250,54 @@ class CmdRun(SubCommand):
         subparser.add_argument(
             "--workspace",
             "--buck-target",
-            default=f"
+            default=f"{Path.cwd()}",
            action=torchxconfig_run,
             help="local workspace to build/patch (buck-target of main binary if using buck)",
         )
         subparser.add_argument(
             "--parent_run_id",
             type=str,
+            action=ArgOnceAction,
             help="optional parent run ID that this run belongs to."
             " It can be used to group runs for experiment tracking purposes",
         )
+        subparser.add_argument(
+            "--tee_logs",
+            action="store_true",
+            default=False,
+            help="Add additional prefix to log lines to indicate which replica is printing the log",
+        )
+        subparser.add_argument(
+            "--stdin",
+            action="store_true",
+            default=False,
+            help="Read JSON input from stdin to parse into torchx run args and run the component.",
+        )
         subparser.add_argument(
             "component_name_and_args",
             nargs=argparse.REMAINDER,
         )
 
-    def
+    def _run_inner(self, runner: Runner, args: TorchXRunArgs) -> None:
         if args.scheduler == "local":
-            logger.warning(
-                "`local` scheduler is deprecated and will be"
-                " removed in the near future,"
-                " please use other variants of the local scheduler"
-                " (e.g. `local_cwd`)"
-            )
-
-        scheduler_opts = runner.scheduler_run_opts(args.scheduler)
-        cfg = scheduler_opts.cfg_from_str(args.scheduler_args)
-        config.apply(scheduler=args.scheduler, cfg=cfg)
+            logger.warning(LOCAL_SCHEDULER_WARNING_MSG)
 
-
-
-
+        config.apply(scheduler=args.scheduler, cfg=args.scheduler_cfg)
+        component_args = (
+            args.component_args_str
+            if args.component_args_str != []
+            else args.component_args
         )
         try:
+            workspace = Workspace.from_str(args.workspace) if args.workspace else None
+
             if args.dryrun:
                 dryrun_info = runner.dryrun_component(
-
+                    args.component_name,
                     component_args,
                     args.scheduler,
-                    workspace=
-                    cfg=
+                    workspace=workspace,
+                    cfg=args.scheduler_cfg,
                     parent_run_id=args.parent_run_id,
                 )
                 print(
@@ -206,40 +308,143 @@ class CmdRun(SubCommand):
                 print("\n=== SCHEDULER REQUEST ===\n" f"{dryrun_info}")
             else:
                 app_handle = runner.run_component(
-
+                    args.component_name,
                     component_args,
                     args.scheduler,
                     workspace=args.workspace,
-                    cfg=
+                    cfg=args.scheduler_cfg,
                     parent_run_id=args.parent_run_id,
                 )
                 # DO NOT delete this line. It is used by slurm tests to retrieve the app id
                 print(app_handle)
 
                 if args.scheduler.startswith("local"):
-                    self._wait_and_exit(
+                    self._wait_and_exit(
+                        runner, app_handle, log=True, tee_logs=args.tee_logs
+                    )
                 else:
                     logger.info(f"Launched app: {app_handle}")
                     app_status = runner.status(app_handle)
                     if app_status:
                         logger.info(app_status.format())
                     if args.wait or args.log:
-                        self._wait_and_exit(
+                        self._wait_and_exit(
+                            runner, app_handle, log=args.log, tee_logs=args.tee_logs
+                        )
 
         except (ComponentValidationException, ComponentNotFoundException) as e:
-            error_msg =
+            error_msg = (
+                f"\nFailed to run component `{args.component_name}` got errors: \n {e}"
+            )
             logger.error(error_msg)
             sys.exit(1)
         except specs.InvalidRunConfigException as e:
             error_msg = (
-
-
-
-                "
+                "Invalid scheduler configuration: %s\n"
+                "To configure scheduler options, either:\n"
+                " 1. Use the `-cfg` command-line argument, e.g., `-cfg key1=value1,key2=value2`\n"
+                " 2. Set up a `.torchxconfig` file. For more details, visit: https://meta-pytorch.org/torchx/main/runner.config.html\n"
+                "Run `torchx runopts %s` to check all available configuration options for the "
+                "`%s` scheduler."
             )
-
+            print(error_msg % (e, args.scheduler, args.scheduler), file=sys.stderr)
             sys.exit(1)
 
+    def _run_from_cli_args(self, runner: Runner, args: argparse.Namespace) -> None:
+        scheduler_opts = runner.scheduler_run_opts(args.scheduler)
+        cfg = scheduler_opts.cfg_from_str(args.scheduler_args)
+
+        component, component_args = _parse_component_name_and_args(
+            args.component_name_and_args,
+            none_throws(self._subparser),
+        )
+        torchx_run_args = torchx_run_args_from_argparse(
+            args, component, component_args, cfg
+        )
+        self._run_inner(runner, torchx_run_args)
+
+    def _run_from_stdin_args(self, runner: Runner, stdin_data: Dict[str, Any]) -> None:
+        torchx_run_args = torchx_run_args_from_json(stdin_data)
+        scheduler_opts = runner.scheduler_run_opts(torchx_run_args.scheduler)
+        cfg = scheduler_opts.cfg_from_json_repr(
+            json.dumps(torchx_run_args.scheduler_args)
+        )
+        torchx_run_args.scheduler_cfg = cfg
+        self._run_inner(runner, torchx_run_args)
+
+    def _get_torchx_stdin_args(
+        self, args: argparse.Namespace
+    ) -> Optional[Dict[str, Any]]:
+        if not args.stdin:
+            return None
+        if self._stdin_data_json is None:
+            self._stdin_data_json = self.torchx_json_from_stdin(args)
+        return self._stdin_data_json
+
+    def torchx_json_from_stdin(
+        self, args: Optional[argparse.Namespace] = None
+    ) -> Dict[str, Any]:
+        try:
+            stdin_data_json = json.load(sys.stdin)
+            if args and args.dryrun:
+                stdin_data_json["dryrun"] = True
+            if not isinstance(stdin_data_json, dict):
+                logger.error(
+                    "Invalid JSON input for `torchx run` command. Expected a dictionary."
+                )
+                sys.exit(1)
+            return stdin_data_json
+        except (json.JSONDecodeError, EOFError):
+            logger.error(
+                "Unable to parse JSON input for `torchx run` command, please make sure it's a valid JSON input."
+            )
+            sys.exit(1)
+
+    def verify_no_extra_args(self, args: argparse.Namespace) -> None:
+        """
+        Verifies that only --stdin was provided when using stdin mode.
+        """
+        if not args.stdin:
+            return
+
+        subparser = none_throws(self._subparser)
+        conflicting_args = []
+
+        # Check each argument against its default value
+        for action in subparser._actions:
+            if action.dest == "stdin":  # Skip stdin itself
+                continue
+            if action.dest == "help":  # Skip help
+                continue
+            if action.dest == "dryrun":  # Skip dryrun
+                continue
+
+            current_value = getattr(args, action.dest, None)
+            default_value = action.default
+
+            # For arguments that differ from default
+            if current_value != default_value:
+                # Handle special cases where non-default doesn't mean explicitly set
+                if action.dest == "component_name_and_args" and current_value == []:
+                    continue  # Empty list is still default
+                print(f"*********\n {default_value} = {current_value}")
+                conflicting_args.append(f"--{action.dest.replace('_', '-')}")
+
+        if conflicting_args:
+            subparser.error(
+                f"Cannot specify {', '.join(conflicting_args)} when using --stdin. "
+                "All configuration should be provided in JSON input."
+            )
+
+    def _run(self, runner: Runner, args: argparse.Namespace) -> None:
+        self.verify_no_extra_args(args)
+        if args.stdin:
+            stdin_data_json = self._get_torchx_stdin_args(args)
+            if stdin_data_json is not None:
+                self._run_from_stdin_args(runner, stdin_data_json)
+        else:
+            self._run_from_cli_args(runner, args)
+
     def run(self, args: argparse.Namespace) -> None:
         os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "cli_run")
         component_defaults = load_sections(prefix="component")
@@ -247,10 +452,16 @@ class CmdRun(SubCommand):
         with get_runner(component_defaults=component_defaults) as runner:
             self._run(runner, args)
 
-    def _wait_and_exit(
+    def _wait_and_exit(
+        self, runner: Runner, app_handle: str, log: bool, tee_logs: bool = False
+    ) -> None:
         logger.info("Waiting for the app to finish...")
 
-        log_thread =
+        log_thread = (
+            self._start_log_thread(runner, app_handle, tee_logs_enabled=tee_logs)
+            if log
+            else None
+        )
 
         status = runner.wait(app_handle, wait_interval=1)
         if not status:
@@ -267,17 +478,30 @@ class CmdRun(SubCommand):
         else:
             logger.debug(status)
 
-    def _start_log_thread(
-
-
-
-
-
-
-
-
-
-
+    def _start_log_thread(
+        self, runner: Runner, app_handle: str, tee_logs_enabled: bool = False
+    ) -> threading.Thread:
+        if tee_logs_enabled:
+            thread = tee_logs(
+                dst=sys.stderr,
+                app_handle=app_handle,
+                regex=None,
+                runner=runner,
+                should_tail=True,
+                streams=None,
+                colorize=not sys.stderr.closed and sys.stderr.isatty(),
+            )
+        else:
+            thread = threading.Thread(
+                target=get_logs,
+                kwargs={
+                    "file": sys.stderr,
+                    "runner": runner,
+                    "identifier": app_handle,
+                    "regex": None,
+                    "should_tail": True,
+                },
+            )
+        thread.daemon = True
         thread.start()
         return thread
torchx/cli/cmd_runopts.py
CHANGED
torchx/cli/cmd_status.py
CHANGED
@@ -5,7 +5,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 import argparse
+import json
 import logging
 import sys
 from typing import List, Optional
@@ -44,6 +47,11 @@ class CmdStatus(SubCommand):
         subparser.add_argument(
             "--roles", type=str, default="", help="comma separated roles to filter"
         )
+        subparser.add_argument(
+            "--json",
+            action="store_true",
+            help="output the status in JSON format",
+        )
 
     def run(self, args: argparse.Namespace) -> None:
         app_handle = args.app_handle
@@ -52,7 +60,10 @@ class CmdStatus(SubCommand):
             app_status = runner.status(app_handle)
             filter_roles = parse_list_arg(args.roles)
             if app_status:
-
+                if args.json:
+                    print(json.dumps(app_status.to_json(filter_roles)))
+                else:
+                    print(app_status.format(filter_roles))
             else:
                 logger.error(
                     f"AppDef: {app_id},"
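With the new `--json` flag, `torchx status` prints the result of `AppStatus.to_json(filter_roles)` as a single JSON document instead of the formatted text view. A small consumption sketch follows; the app handle is a placeholder and the keys of the emitted dictionary are not shown in this diff, so the example only parses and pretty-prints whatever comes back.

# Minimal sketch (not part of the package): read machine-readable status from
# the new `torchx status --json` flag. The handle below is a placeholder.
import json
import subprocess

app_handle = "local_cwd://torchx/my_app-abc123"  # placeholder app handle

proc = subprocess.run(
    ["torchx", "status", "--json", app_handle],
    capture_output=True,
    text=True,
    check=True,
)
status = json.loads(proc.stdout)  # dictionary produced by AppStatus.to_json()
print(json.dumps(status, indent=2))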
torchx/cli/cmd_tracker.py
CHANGED
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 import argparse
 import logging
 
@@ -43,7 +45,7 @@ class CmdTracker(SubCommand):
         else:
             raise RuntimeError(
                 "No trackers configured."
-                " See: https://pytorch.org/torchx/latest/runtime/tracking.html"
+                " See: https://meta-pytorch.org/torchx/latest/runtime/tracking.html"
             )
 
     def add_list_job_arguments(self, subparser: argparse.ArgumentParser) -> None:
torchx/cli/colors.py
CHANGED
torchx/cli/main.py
CHANGED
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 import logging
 import os
 import sys
@@ -14,6 +16,7 @@ import torchx
 from torchx.cli.cmd_base import SubCommand
 from torchx.cli.cmd_cancel import CmdCancel
 from torchx.cli.cmd_configure import CmdConfigure
+from torchx.cli.cmd_delete import CmdDelete
 from torchx.cli.cmd_describe import CmdDescribe
 from torchx.cli.cmd_list import CmdList
 from torchx.cli.cmd_log import CmdLog
@@ -35,6 +38,7 @@ def get_default_sub_cmds() -> Dict[str, SubCommand]:
         "builtins": CmdBuiltins(),
         "cancel": CmdCancel(),
         "configure": CmdConfigure(),
+        "delete": CmdDelete(),
         "describe": CmdDescribe(),
         "list": CmdList(),
         "log": CmdLog(),
torchx/components/__init__.py
CHANGED
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 """
 This module contains a collection of builtin TorchX components. The directory
 structure is organized by component category. Components are simply
@@ -179,7 +181,7 @@ To validate that you've defined your component correctly you can either:
 
 1. (easiest) Dryrun your component's ``--help`` with the cli: ``torchx run --dryrun ~/component.py:train --help``
 2. Use the component :ref:`linter<specs:Component Linter>`
-   (see `dist_test.py <https://github.com/pytorch/torchx/blob/main/torchx/components/test/dist_test.py>`_ as an example)
+   (see `dist_test.py <https://github.com/meta-pytorch/torchx/blob/main/torchx/components/test/dist_test.py>`_ as an example)
 
 
 Running as a Job
@@ -296,13 +298,6 @@ imagine the component is defined as:
 * ``*args=["--help"]``: ``torchx run comp.py:f -- --help``
 * ``*args=["--i", "2"]``: ``torchx run comp.py:f --i 1 -- --i 2``
 
-Run in a Pipeline
---------------------------------
-
-The :ref:`torchx.pipelines<pipelines:torchx.pipelines>` define adapters that
-convert a torchx component into the object that represents a pipeline "stage" in the
-target pipeline platform (see :ref:`Pipelines` for a list of supported pipeline orchestrators).
-
 Additional Resources
 -----------------------
 
torchx/components/component_test_base.py
CHANGED
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 """
 You can unit test the component definitions as you would normal Python code
 since they are valid Python definitions.