torchx-nightly 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchx-nightly might be problematic. Click here for more details.
- torchx/__init__.py +2 -0
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/apps/serve/serve.py +2 -0
- torchx/apps/utils/booth_main.py +2 -0
- torchx/apps/utils/copy_main.py +2 -0
- torchx/apps/utils/process_monitor.py +2 -0
- torchx/cli/__init__.py +2 -0
- torchx/cli/argparse_util.py +38 -3
- torchx/cli/cmd_base.py +2 -0
- torchx/cli/cmd_cancel.py +2 -0
- torchx/cli/cmd_configure.py +2 -0
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_describe.py +2 -0
- torchx/cli/cmd_list.py +8 -4
- torchx/cli/cmd_log.py +6 -24
- torchx/cli/cmd_run.py +269 -45
- torchx/cli/cmd_runopts.py +2 -0
- torchx/cli/cmd_status.py +12 -1
- torchx/cli/cmd_tracker.py +3 -1
- torchx/cli/colors.py +2 -0
- torchx/cli/main.py +4 -0
- torchx/components/__init__.py +3 -8
- torchx/components/component_test_base.py +2 -0
- torchx/components/dist.py +18 -7
- torchx/components/integration_tests/component_provider.py +4 -2
- torchx/components/integration_tests/integ_tests.py +2 -0
- torchx/components/serve.py +2 -0
- torchx/components/structured_arg.py +4 -3
- torchx/components/utils.py +15 -4
- torchx/distributed/__init__.py +2 -4
- torchx/examples/apps/datapreproc/datapreproc.py +2 -0
- torchx/examples/apps/lightning/data.py +5 -3
- torchx/examples/apps/lightning/model.py +7 -6
- torchx/examples/apps/lightning/profiler.py +7 -4
- torchx/examples/apps/lightning/train.py +11 -2
- torchx/examples/torchx_out_of_sync_training.py +11 -0
- torchx/notebook.py +2 -0
- torchx/runner/__init__.py +2 -0
- torchx/runner/api.py +167 -60
- torchx/runner/config.py +43 -10
- torchx/runner/events/__init__.py +57 -13
- torchx/runner/events/api.py +14 -3
- torchx/runner/events/handlers.py +2 -0
- torchx/runtime/tracking/__init__.py +2 -0
- torchx/runtime/tracking/api.py +2 -0
- torchx/schedulers/__init__.py +16 -15
- torchx/schedulers/api.py +70 -14
- torchx/schedulers/aws_batch_scheduler.py +75 -6
- torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
- torchx/schedulers/devices.py +17 -4
- torchx/schedulers/docker_scheduler.py +43 -11
- torchx/schedulers/ids.py +29 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +9 -7
- torchx/schedulers/kubernetes_scheduler.py +383 -38
- torchx/schedulers/local_scheduler.py +100 -27
- torchx/schedulers/lsf_scheduler.py +5 -4
- torchx/schedulers/slurm_scheduler.py +336 -20
- torchx/schedulers/streams.py +2 -0
- torchx/specs/__init__.py +89 -12
- torchx/specs/api.py +418 -30
- torchx/specs/builders.py +176 -38
- torchx/specs/file_linter.py +143 -57
- torchx/specs/finder.py +68 -28
- torchx/specs/named_resources_aws.py +181 -4
- torchx/specs/named_resources_generic.py +2 -0
- torchx/specs/overlays.py +106 -0
- torchx/specs/test/components/__init__.py +2 -0
- torchx/specs/test/components/a/__init__.py +2 -0
- torchx/specs/test/components/a/b/__init__.py +2 -0
- torchx/specs/test/components/a/b/c.py +2 -0
- torchx/specs/test/components/c/__init__.py +2 -0
- torchx/specs/test/components/c/d.py +2 -0
- torchx/tracker/__init__.py +12 -6
- torchx/tracker/api.py +15 -18
- torchx/tracker/backend/fsspec.py +2 -0
- torchx/util/cuda.py +2 -0
- torchx/util/datetime.py +2 -0
- torchx/util/entrypoints.py +39 -15
- torchx/util/io.py +2 -0
- torchx/util/log_tee_helpers.py +210 -0
- torchx/util/modules.py +65 -0
- torchx/util/session.py +42 -0
- torchx/util/shlex.py +2 -0
- torchx/util/strings.py +3 -1
- torchx/util/types.py +90 -29
- torchx/version.py +4 -2
- torchx/workspace/__init__.py +2 -0
- torchx/workspace/api.py +136 -6
- torchx/workspace/dir_workspace.py +2 -0
- torchx/workspace/docker_workspace.py +30 -2
- torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
- torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
- torchx/pipelines/kfp/__init__.py +0 -28
- torchx/pipelines/kfp/adapter.py +0 -271
- torchx/pipelines/kfp/version.py +0 -17
- torchx/schedulers/gcp_batch_scheduler.py +0 -487
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -453
- torchx_nightly-2024.1.6.dist-info/METADATA +0 -176
- torchx_nightly-2024.1.6.dist-info/RECORD +0 -118
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
|
@@ -5,6 +5,8 @@
|
|
|
5
5
|
# This source code is licensed under the BSD-style license found in the
|
|
6
6
|
# LICENSE file in the root directory of this source tree.
|
|
7
7
|
|
|
8
|
+
# pyre-strict
|
|
9
|
+
|
|
8
10
|
"""
|
|
9
11
|
This contains the TorchX local scheduler which can be used to run TorchX
|
|
10
12
|
components locally via subprocesses.
|
|
@@ -27,10 +29,21 @@ import warnings
|
|
|
27
29
|
from dataclasses import asdict, dataclass
|
|
28
30
|
from datetime import datetime
|
|
29
31
|
from types import FrameType
|
|
30
|
-
from typing import Any, BinaryIO, Callable, Dict, Iterable, List, Optional, TextIO, Tuple
|
|
32
|
+
from typing import (
|
|
33
|
+
Any,
|
|
34
|
+
BinaryIO,
|
|
35
|
+
Callable,
|
|
36
|
+
Dict,
|
|
37
|
+
Iterable,
|
|
38
|
+
List,
|
|
39
|
+
Optional,
|
|
40
|
+
Protocol,
|
|
41
|
+
TextIO,
|
|
42
|
+
Tuple,
|
|
43
|
+
TypedDict,
|
|
44
|
+
)
|
|
31
45
|
|
|
32
46
|
from torchx.schedulers.api import (
|
|
33
|
-
AppDryRunInfo,
|
|
34
47
|
DescribeAppResponse,
|
|
35
48
|
filter_regex,
|
|
36
49
|
ListAppResponse,
|
|
@@ -40,10 +53,10 @@ from torchx.schedulers.api import (
|
|
|
40
53
|
)
|
|
41
54
|
from torchx.schedulers.ids import make_unique
|
|
42
55
|
from torchx.schedulers.streams import Tee
|
|
56
|
+
from torchx.specs import AppDryRunInfo
|
|
43
57
|
from torchx.specs.api import AppDef, AppState, is_terminal, macros, NONE, Role, runopts
|
|
44
58
|
|
|
45
59
|
from torchx.util.types import none_throws
|
|
46
|
-
from typing_extensions import TypedDict
|
|
47
60
|
|
|
48
61
|
log: logging.Logger = logging.getLogger(__name__)
|
|
49
62
|
|
|
@@ -252,6 +265,26 @@ AppName = str
|
|
|
252
265
|
RoleName = str
|
|
253
266
|
|
|
254
267
|
|
|
268
|
+
class PopenProtocol(Protocol):
|
|
269
|
+
"""
|
|
270
|
+
Protocol wrapper around python's ``subprocess.Popen``. Keeps track of
|
|
271
|
+
the a list of interface methods that the process scheduled by the `LocalScheduler`
|
|
272
|
+
must implement.
|
|
273
|
+
"""
|
|
274
|
+
|
|
275
|
+
@property
|
|
276
|
+
def pid(self) -> int: ...
|
|
277
|
+
|
|
278
|
+
@property
|
|
279
|
+
def returncode(self) -> int: ...
|
|
280
|
+
|
|
281
|
+
def wait(self, timeout: Optional[float] = None) -> int: ...
|
|
282
|
+
|
|
283
|
+
def poll(self) -> Optional[int]: ...
|
|
284
|
+
|
|
285
|
+
def kill(self) -> None: ...
|
|
286
|
+
|
|
287
|
+
|
|
255
288
|
@dataclass
|
|
256
289
|
class _LocalReplica:
|
|
257
290
|
"""
|
|
@@ -260,8 +293,7 @@ class _LocalReplica:
|
|
|
260
293
|
|
|
261
294
|
role_name: RoleName
|
|
262
295
|
replica_id: int
|
|
263
|
-
|
|
264
|
-
proc: subprocess.Popen
|
|
296
|
+
proc: PopenProtocol
|
|
265
297
|
|
|
266
298
|
# IO streams:
|
|
267
299
|
# None means no log_dir (out to console)
|
|
@@ -598,7 +630,7 @@ class LocalScheduler(Scheduler[LocalOpts]):
|
|
|
598
630
|
)
|
|
599
631
|
return opts
|
|
600
632
|
|
|
601
|
-
def _validate(self, app: AppDef, scheduler: str) -> None:
|
|
633
|
+
def _validate(self, app: AppDef, scheduler: str, cfg: LocalOpts) -> None:
|
|
602
634
|
# Skip validation step for local application
|
|
603
635
|
pass
|
|
604
636
|
|
|
@@ -658,6 +690,55 @@ class LocalScheduler(Scheduler[LocalOpts]):
|
|
|
658
690
|
as file name ``str`` rather than a file-like obj.
|
|
659
691
|
"""
|
|
660
692
|
|
|
693
|
+
stdout_, stderr_, combined_ = self._get_replica_output_handles(replica_params)
|
|
694
|
+
|
|
695
|
+
args_pfmt = pprint.pformat(asdict(replica_params), indent=2, width=80)
|
|
696
|
+
log.debug(f"Running {role_name} (replica {replica_id}):\n {args_pfmt}")
|
|
697
|
+
env = self._get_replica_env(replica_params)
|
|
698
|
+
|
|
699
|
+
proc = self.run_local_job(
|
|
700
|
+
args=replica_params.args,
|
|
701
|
+
env=env,
|
|
702
|
+
stdout=stdout_,
|
|
703
|
+
stderr=stderr_,
|
|
704
|
+
cwd=replica_params.cwd,
|
|
705
|
+
)
|
|
706
|
+
return _LocalReplica(
|
|
707
|
+
role_name,
|
|
708
|
+
replica_id,
|
|
709
|
+
proc,
|
|
710
|
+
stdout=stdout_,
|
|
711
|
+
stderr=stderr_,
|
|
712
|
+
combined=combined_,
|
|
713
|
+
error_file=env.get("TORCHELASTIC_ERROR_FILE", "<N/A>"),
|
|
714
|
+
)
|
|
715
|
+
|
|
716
|
+
def run_local_job(
|
|
717
|
+
self,
|
|
718
|
+
args: List[str],
|
|
719
|
+
env: Dict[str, str],
|
|
720
|
+
stdout: Optional[io.FileIO],
|
|
721
|
+
stderr: Optional[io.FileIO],
|
|
722
|
+
cwd: Optional[str] = None,
|
|
723
|
+
) -> "subprocess.Popen[bytes]":
|
|
724
|
+
return subprocess.Popen(
|
|
725
|
+
args=args,
|
|
726
|
+
env=env,
|
|
727
|
+
stdout=stdout,
|
|
728
|
+
stderr=stderr,
|
|
729
|
+
start_new_session=True,
|
|
730
|
+
cwd=cwd,
|
|
731
|
+
)
|
|
732
|
+
|
|
733
|
+
def _get_replica_output_handles(
|
|
734
|
+
self,
|
|
735
|
+
replica_params: ReplicaParam,
|
|
736
|
+
) -> Tuple[Optional[io.FileIO], Optional[io.FileIO], Optional[Tee]]:
|
|
737
|
+
"""
|
|
738
|
+
Returns the stdout, stderr, and combined outputs of the replica.
|
|
739
|
+
If the combined output file is not specified, then the combined output is ``None``.
|
|
740
|
+
"""
|
|
741
|
+
|
|
661
742
|
stdout_ = self._get_file_io(replica_params.stdout)
|
|
662
743
|
stderr_ = self._get_file_io(replica_params.stderr)
|
|
663
744
|
combined_: Optional[Tee] = None
|
|
@@ -668,6 +749,15 @@ class LocalScheduler(Scheduler[LocalOpts]):
|
|
|
668
749
|
none_throws(replica_params.stdout),
|
|
669
750
|
none_throws(replica_params.stderr),
|
|
670
751
|
)
|
|
752
|
+
return stdout_, stderr_, combined_
|
|
753
|
+
|
|
754
|
+
def _get_replica_env(
|
|
755
|
+
self,
|
|
756
|
+
replica_params: ReplicaParam,
|
|
757
|
+
) -> Dict[str, str]:
|
|
758
|
+
"""
|
|
759
|
+
Returns environment variables for the ``_LocalReplica``
|
|
760
|
+
"""
|
|
671
761
|
|
|
672
762
|
# inherit parent's env vars since 99.9% of the time we want this behavior
|
|
673
763
|
# just make sure we override the parent's env vars with the user_defined ones
|
|
@@ -679,26 +769,7 @@ class LocalScheduler(Scheduler[LocalOpts]):
|
|
|
679
769
|
# default to unbuffered python for faster responsiveness locally
|
|
680
770
|
env.setdefault("PYTHONUNBUFFERED", "x")
|
|
681
771
|
|
|
682
|
-
|
|
683
|
-
log.debug(f"Running {role_name} (replica {replica_id}):\n {args_pfmt}")
|
|
684
|
-
|
|
685
|
-
proc = subprocess.Popen(
|
|
686
|
-
args=replica_params.args,
|
|
687
|
-
env=env,
|
|
688
|
-
stdout=stdout_,
|
|
689
|
-
stderr=stderr_,
|
|
690
|
-
start_new_session=True,
|
|
691
|
-
cwd=replica_params.cwd,
|
|
692
|
-
)
|
|
693
|
-
return _LocalReplica(
|
|
694
|
-
role_name,
|
|
695
|
-
replica_id,
|
|
696
|
-
proc,
|
|
697
|
-
stdout=stdout_,
|
|
698
|
-
stderr=stderr_,
|
|
699
|
-
combined=combined_,
|
|
700
|
-
error_file=env.get("TORCHELASTIC_ERROR_FILE", "<N/A>"),
|
|
701
|
-
)
|
|
772
|
+
return env
|
|
702
773
|
|
|
703
774
|
def _get_app_log_dir(self, app_id: str, cfg: LocalOpts) -> str:
|
|
704
775
|
"""
|
|
@@ -1088,6 +1159,7 @@ class LogIterator:
|
|
|
1088
1159
|
self._check_finished() # check to see if app has finished running
|
|
1089
1160
|
|
|
1090
1161
|
if os.path.isfile(self._log_file):
|
|
1162
|
+
time.sleep(0.1) # fix timing issue
|
|
1091
1163
|
self._log_fp = open(
|
|
1092
1164
|
self._log_file,
|
|
1093
1165
|
mode="rt",
|
|
@@ -1129,11 +1201,12 @@ def create_scheduler(
|
|
|
1129
1201
|
session_name: str,
|
|
1130
1202
|
cache_size: int = 100,
|
|
1131
1203
|
extra_paths: Optional[List[str]] = None,
|
|
1204
|
+
image_provider_class: Callable[[LocalOpts], ImageProvider] = CWDImageProvider,
|
|
1132
1205
|
**kwargs: Any,
|
|
1133
1206
|
) -> LocalScheduler:
|
|
1134
1207
|
return LocalScheduler(
|
|
1135
1208
|
session_name=session_name,
|
|
1136
|
-
image_provider_class=CWDImageProvider,
|
|
1209
|
+
image_provider_class=image_provider_class,
|
|
1137
1210
|
cache_size=cache_size,
|
|
1138
1211
|
extra_paths=extra_paths,
|
|
1139
1212
|
)
|
|
@@ -5,6 +5,8 @@
|
|
|
5
5
|
# This source code is licensed under the BSD-style license found in the
|
|
6
6
|
# LICENSE file in the root directory of this source tree.
|
|
7
7
|
|
|
8
|
+
# pyre-strict
|
|
9
|
+
|
|
8
10
|
"""
|
|
9
11
|
This contains the TorchX LSF scheduler which can be used to run TorchX
|
|
10
12
|
components on a LSF cluster.
|
|
@@ -27,11 +29,10 @@ import subprocess
|
|
|
27
29
|
import tempfile
|
|
28
30
|
from dataclasses import dataclass
|
|
29
31
|
from datetime import datetime
|
|
30
|
-
from typing import Any, Dict, Iterable, List, Optional
|
|
32
|
+
from typing import Any, Dict, Iterable, List, Optional, TypedDict
|
|
31
33
|
|
|
32
34
|
import torchx
|
|
33
35
|
from torchx.schedulers.api import (
|
|
34
|
-
AppDryRunInfo,
|
|
35
36
|
DescribeAppResponse,
|
|
36
37
|
filter_regex,
|
|
37
38
|
ListAppResponse,
|
|
@@ -43,6 +44,7 @@ from torchx.schedulers.ids import make_unique
|
|
|
43
44
|
from torchx.schedulers.local_scheduler import LogIterator
|
|
44
45
|
from torchx.specs import (
|
|
45
46
|
AppDef,
|
|
47
|
+
AppDryRunInfo,
|
|
46
48
|
AppState,
|
|
47
49
|
BindMount,
|
|
48
50
|
DeviceMount,
|
|
@@ -55,7 +57,6 @@ from torchx.specs import (
|
|
|
55
57
|
VolumeMount,
|
|
56
58
|
)
|
|
57
59
|
from torchx.util import shlex
|
|
58
|
-
from typing_extensions import TypedDict
|
|
59
60
|
|
|
60
61
|
JOB_STATE: Dict[str, AppState] = {
|
|
61
62
|
"DONE": AppState.SUCCEEDED,
|
|
@@ -486,7 +487,7 @@ class LsfScheduler(Scheduler[LsfOpts]):
|
|
|
486
487
|
subprocess.run(req.cmd, stdout=subprocess.PIPE, check=True)
|
|
487
488
|
return req.app_id
|
|
488
489
|
|
|
489
|
-
def _validate(self, app: AppDef, scheduler: str) -> None:
|
|
490
|
+
def _validate(self, app: AppDef, scheduler: str, cfg: LsfOpts) -> None:
|
|
490
491
|
# Skip validation step for lsf
|
|
491
492
|
pass
|
|
492
493
|
|