torchx-nightly 2023.10.21__py3-none-any.whl → 2025.12.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchx-nightly might be problematic; consult the package registry's advisory page for more details.

Files changed (110)
  1. torchx/__init__.py +2 -0
  2. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  3. torchx/apps/serve/serve.py +2 -0
  4. torchx/apps/utils/booth_main.py +2 -0
  5. torchx/apps/utils/copy_main.py +2 -0
  6. torchx/apps/utils/process_monitor.py +2 -0
  7. torchx/cli/__init__.py +2 -0
  8. torchx/cli/argparse_util.py +38 -3
  9. torchx/cli/cmd_base.py +2 -0
  10. torchx/cli/cmd_cancel.py +2 -0
  11. torchx/cli/cmd_configure.py +2 -0
  12. torchx/cli/cmd_delete.py +30 -0
  13. torchx/cli/cmd_describe.py +2 -0
  14. torchx/cli/cmd_list.py +8 -4
  15. torchx/cli/cmd_log.py +6 -24
  16. torchx/cli/cmd_run.py +269 -45
  17. torchx/cli/cmd_runopts.py +2 -0
  18. torchx/cli/cmd_status.py +12 -1
  19. torchx/cli/cmd_tracker.py +3 -1
  20. torchx/cli/colors.py +2 -0
  21. torchx/cli/main.py +4 -0
  22. torchx/components/__init__.py +3 -8
  23. torchx/components/component_test_base.py +2 -0
  24. torchx/components/dist.py +18 -7
  25. torchx/components/integration_tests/component_provider.py +4 -2
  26. torchx/components/integration_tests/integ_tests.py +2 -0
  27. torchx/components/serve.py +2 -0
  28. torchx/components/structured_arg.py +7 -6
  29. torchx/components/utils.py +15 -4
  30. torchx/distributed/__init__.py +2 -4
  31. torchx/examples/apps/datapreproc/datapreproc.py +2 -0
  32. torchx/examples/apps/lightning/data.py +5 -3
  33. torchx/examples/apps/lightning/model.py +7 -6
  34. torchx/examples/apps/lightning/profiler.py +7 -4
  35. torchx/examples/apps/lightning/train.py +11 -2
  36. torchx/examples/torchx_out_of_sync_training.py +11 -0
  37. torchx/notebook.py +2 -0
  38. torchx/runner/__init__.py +2 -0
  39. torchx/runner/api.py +167 -60
  40. torchx/runner/config.py +43 -10
  41. torchx/runner/events/__init__.py +57 -13
  42. torchx/runner/events/api.py +14 -3
  43. torchx/runner/events/handlers.py +2 -0
  44. torchx/runtime/tracking/__init__.py +2 -0
  45. torchx/runtime/tracking/api.py +2 -0
  46. torchx/schedulers/__init__.py +16 -15
  47. torchx/schedulers/api.py +70 -14
  48. torchx/schedulers/aws_batch_scheduler.py +79 -5
  49. torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
  50. torchx/schedulers/devices.py +17 -4
  51. torchx/schedulers/docker_scheduler.py +43 -11
  52. torchx/schedulers/ids.py +29 -23
  53. torchx/schedulers/kubernetes_mcad_scheduler.py +10 -8
  54. torchx/schedulers/kubernetes_scheduler.py +383 -38
  55. torchx/schedulers/local_scheduler.py +100 -27
  56. torchx/schedulers/lsf_scheduler.py +5 -4
  57. torchx/schedulers/slurm_scheduler.py +336 -20
  58. torchx/schedulers/streams.py +2 -0
  59. torchx/specs/__init__.py +89 -12
  60. torchx/specs/api.py +431 -32
  61. torchx/specs/builders.py +176 -38
  62. torchx/specs/file_linter.py +143 -57
  63. torchx/specs/finder.py +68 -28
  64. torchx/specs/named_resources_aws.py +254 -22
  65. torchx/specs/named_resources_generic.py +2 -0
  66. torchx/specs/overlays.py +106 -0
  67. torchx/specs/test/components/__init__.py +2 -0
  68. torchx/specs/test/components/a/__init__.py +2 -0
  69. torchx/specs/test/components/a/b/__init__.py +2 -0
  70. torchx/specs/test/components/a/b/c.py +2 -0
  71. torchx/specs/test/components/c/__init__.py +2 -0
  72. torchx/specs/test/components/c/d.py +2 -0
  73. torchx/tracker/__init__.py +12 -6
  74. torchx/tracker/api.py +15 -18
  75. torchx/tracker/backend/fsspec.py +2 -0
  76. torchx/util/cuda.py +2 -0
  77. torchx/util/datetime.py +2 -0
  78. torchx/util/entrypoints.py +39 -15
  79. torchx/util/io.py +2 -0
  80. torchx/util/log_tee_helpers.py +210 -0
  81. torchx/util/modules.py +65 -0
  82. torchx/util/session.py +42 -0
  83. torchx/util/shlex.py +2 -0
  84. torchx/util/strings.py +3 -1
  85. torchx/util/types.py +90 -29
  86. torchx/version.py +4 -2
  87. torchx/workspace/__init__.py +2 -0
  88. torchx/workspace/api.py +136 -6
  89. torchx/workspace/dir_workspace.py +2 -0
  90. torchx/workspace/docker_workspace.py +30 -2
  91. torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
  92. torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
  93. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
  94. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
  95. torchx/examples/pipelines/__init__.py +0 -0
  96. torchx/examples/pipelines/kfp/__init__.py +0 -0
  97. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
  98. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
  99. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
  100. torchx/pipelines/kfp/__init__.py +0 -28
  101. torchx/pipelines/kfp/adapter.py +0 -271
  102. torchx/pipelines/kfp/version.py +0 -17
  103. torchx/schedulers/gcp_batch_scheduler.py +0 -487
  104. torchx/schedulers/ray/ray_common.py +0 -22
  105. torchx/schedulers/ray/ray_driver.py +0 -307
  106. torchx/schedulers/ray_scheduler.py +0 -453
  107. torchx_nightly-2023.10.21.dist-info/METADATA +0 -174
  108. torchx_nightly-2023.10.21.dist-info/RECORD +0 -118
  109. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
  110. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,8 @@
5
5
  # This source code is licensed under the BSD-style license found in the
6
6
  # LICENSE file in the root directory of this source tree.
7
7
 
8
+ # pyre-strict
9
+
8
10
  """
9
11
  This contains the TorchX local scheduler which can be used to run TorchX
10
12
  components locally via subprocesses.
@@ -27,10 +29,21 @@ import warnings
27
29
  from dataclasses import asdict, dataclass
28
30
  from datetime import datetime
29
31
  from types import FrameType
30
- from typing import Any, BinaryIO, Callable, Dict, Iterable, List, Optional, TextIO
32
+ from typing import (
33
+ Any,
34
+ BinaryIO,
35
+ Callable,
36
+ Dict,
37
+ Iterable,
38
+ List,
39
+ Optional,
40
+ Protocol,
41
+ TextIO,
42
+ Tuple,
43
+ TypedDict,
44
+ )
31
45
 
32
46
  from torchx.schedulers.api import (
33
- AppDryRunInfo,
34
47
  DescribeAppResponse,
35
48
  filter_regex,
36
49
  ListAppResponse,
@@ -40,10 +53,10 @@ from torchx.schedulers.api import (
40
53
  )
41
54
  from torchx.schedulers.ids import make_unique
42
55
  from torchx.schedulers.streams import Tee
56
+ from torchx.specs import AppDryRunInfo
43
57
  from torchx.specs.api import AppDef, AppState, is_terminal, macros, NONE, Role, runopts
44
58
 
45
59
  from torchx.util.types import none_throws
46
- from typing_extensions import TypedDict
47
60
 
48
61
  log: logging.Logger = logging.getLogger(__name__)
49
62
 
@@ -252,6 +265,26 @@ AppName = str
252
265
  RoleName = str
253
266
 
254
267
 
268
+ class PopenProtocol(Protocol):
269
+ """
270
+ Protocol wrapper around python's ``subprocess.Popen``. Keeps track of
271
+ the a list of interface methods that the process scheduled by the `LocalScheduler`
272
+ must implement.
273
+ """
274
+
275
+ @property
276
+ def pid(self) -> int: ...
277
+
278
+ @property
279
+ def returncode(self) -> int: ...
280
+
281
+ def wait(self, timeout: Optional[float] = None) -> int: ...
282
+
283
+ def poll(self) -> Optional[int]: ...
284
+
285
+ def kill(self) -> None: ...
286
+
287
+
255
288
  @dataclass
256
289
  class _LocalReplica:
257
290
  """
@@ -260,8 +293,7 @@ class _LocalReplica:
260
293
 
261
294
  role_name: RoleName
262
295
  replica_id: int
263
- # pyre-fixme[24]: Generic type `subprocess.Popen` expects 1 type parameter.
264
- proc: subprocess.Popen
296
+ proc: PopenProtocol
265
297
 
266
298
  # IO streams:
267
299
  # None means no log_dir (out to console)
@@ -598,7 +630,7 @@ class LocalScheduler(Scheduler[LocalOpts]):
598
630
  )
599
631
  return opts
600
632
 
601
- def _validate(self, app: AppDef, scheduler: str) -> None:
633
+ def _validate(self, app: AppDef, scheduler: str, cfg: LocalOpts) -> None:
602
634
  # Skip validation step for local application
603
635
  pass
604
636
 
@@ -658,6 +690,55 @@ class LocalScheduler(Scheduler[LocalOpts]):
658
690
  as file name ``str`` rather than a file-like obj.
659
691
  """
660
692
 
693
+ stdout_, stderr_, combined_ = self._get_replica_output_handles(replica_params)
694
+
695
+ args_pfmt = pprint.pformat(asdict(replica_params), indent=2, width=80)
696
+ log.debug(f"Running {role_name} (replica {replica_id}):\n {args_pfmt}")
697
+ env = self._get_replica_env(replica_params)
698
+
699
+ proc = self.run_local_job(
700
+ args=replica_params.args,
701
+ env=env,
702
+ stdout=stdout_,
703
+ stderr=stderr_,
704
+ cwd=replica_params.cwd,
705
+ )
706
+ return _LocalReplica(
707
+ role_name,
708
+ replica_id,
709
+ proc,
710
+ stdout=stdout_,
711
+ stderr=stderr_,
712
+ combined=combined_,
713
+ error_file=env.get("TORCHELASTIC_ERROR_FILE", "<N/A>"),
714
+ )
715
+
716
+ def run_local_job(
717
+ self,
718
+ args: List[str],
719
+ env: Dict[str, str],
720
+ stdout: Optional[io.FileIO],
721
+ stderr: Optional[io.FileIO],
722
+ cwd: Optional[str] = None,
723
+ ) -> "subprocess.Popen[bytes]":
724
+ return subprocess.Popen(
725
+ args=args,
726
+ env=env,
727
+ stdout=stdout,
728
+ stderr=stderr,
729
+ start_new_session=True,
730
+ cwd=cwd,
731
+ )
732
+
733
+ def _get_replica_output_handles(
734
+ self,
735
+ replica_params: ReplicaParam,
736
+ ) -> Tuple[Optional[io.FileIO], Optional[io.FileIO], Optional[Tee]]:
737
+ """
738
+ Returns the stdout, stderr, and combined outputs of the replica.
739
+ If the combined output file is not specified, then the combined output is ``None``.
740
+ """
741
+
661
742
  stdout_ = self._get_file_io(replica_params.stdout)
662
743
  stderr_ = self._get_file_io(replica_params.stderr)
663
744
  combined_: Optional[Tee] = None
@@ -668,6 +749,15 @@ class LocalScheduler(Scheduler[LocalOpts]):
668
749
  none_throws(replica_params.stdout),
669
750
  none_throws(replica_params.stderr),
670
751
  )
752
+ return stdout_, stderr_, combined_
753
+
754
+ def _get_replica_env(
755
+ self,
756
+ replica_params: ReplicaParam,
757
+ ) -> Dict[str, str]:
758
+ """
759
+ Returns environment variables for the ``_LocalReplica``
760
+ """
671
761
 
672
762
  # inherit parent's env vars since 99.9% of the time we want this behavior
673
763
  # just make sure we override the parent's env vars with the user_defined ones
@@ -679,26 +769,7 @@ class LocalScheduler(Scheduler[LocalOpts]):
679
769
  # default to unbuffered python for faster responsiveness locally
680
770
  env.setdefault("PYTHONUNBUFFERED", "x")
681
771
 
682
- args_pfmt = pprint.pformat(asdict(replica_params), indent=2, width=80)
683
- log.debug(f"Running {role_name} (replica {replica_id}):\n {args_pfmt}")
684
-
685
- proc = subprocess.Popen(
686
- args=replica_params.args,
687
- env=env,
688
- stdout=stdout_,
689
- stderr=stderr_,
690
- start_new_session=True,
691
- cwd=replica_params.cwd,
692
- )
693
- return _LocalReplica(
694
- role_name,
695
- replica_id,
696
- proc,
697
- stdout=stdout_,
698
- stderr=stderr_,
699
- combined=combined_,
700
- error_file=env.get("TORCHELASTIC_ERROR_FILE", "<N/A>"),
701
- )
772
+ return env
702
773
 
703
774
  def _get_app_log_dir(self, app_id: str, cfg: LocalOpts) -> str:
704
775
  """
@@ -1088,6 +1159,7 @@ class LogIterator:
1088
1159
  self._check_finished() # check to see if app has finished running
1089
1160
 
1090
1161
  if os.path.isfile(self._log_file):
1162
+ time.sleep(0.1) # fix timing issue
1091
1163
  self._log_fp = open(
1092
1164
  self._log_file,
1093
1165
  mode="rt",
@@ -1129,11 +1201,12 @@ def create_scheduler(
1129
1201
  session_name: str,
1130
1202
  cache_size: int = 100,
1131
1203
  extra_paths: Optional[List[str]] = None,
1204
+ image_provider_class: Callable[[LocalOpts], ImageProvider] = CWDImageProvider,
1132
1205
  **kwargs: Any,
1133
1206
  ) -> LocalScheduler:
1134
1207
  return LocalScheduler(
1135
1208
  session_name=session_name,
1136
- image_provider_class=CWDImageProvider,
1209
+ image_provider_class=image_provider_class,
1137
1210
  cache_size=cache_size,
1138
1211
  extra_paths=extra_paths,
1139
1212
  )
@@ -5,6 +5,8 @@
5
5
  # This source code is licensed under the BSD-style license found in the
6
6
  # LICENSE file in the root directory of this source tree.
7
7
 
8
+ # pyre-strict
9
+
8
10
  """
9
11
  This contains the TorchX LSF scheduler which can be used to run TorchX
10
12
  components on a LSF cluster.
@@ -27,11 +29,10 @@ import subprocess
27
29
  import tempfile
28
30
  from dataclasses import dataclass
29
31
  from datetime import datetime
30
- from typing import Any, Dict, Iterable, List, Optional
32
+ from typing import Any, Dict, Iterable, List, Optional, TypedDict
31
33
 
32
34
  import torchx
33
35
  from torchx.schedulers.api import (
34
- AppDryRunInfo,
35
36
  DescribeAppResponse,
36
37
  filter_regex,
37
38
  ListAppResponse,
@@ -43,6 +44,7 @@ from torchx.schedulers.ids import make_unique
43
44
  from torchx.schedulers.local_scheduler import LogIterator
44
45
  from torchx.specs import (
45
46
  AppDef,
47
+ AppDryRunInfo,
46
48
  AppState,
47
49
  BindMount,
48
50
  DeviceMount,
@@ -55,7 +57,6 @@ from torchx.specs import (
55
57
  VolumeMount,
56
58
  )
57
59
  from torchx.util import shlex
58
- from typing_extensions import TypedDict
59
60
 
60
61
  JOB_STATE: Dict[str, AppState] = {
61
62
  "DONE": AppState.SUCCEEDED,
@@ -486,7 +487,7 @@ class LsfScheduler(Scheduler[LsfOpts]):
486
487
  subprocess.run(req.cmd, stdout=subprocess.PIPE, check=True)
487
488
  return req.app_id
488
489
 
489
- def _validate(self, app: AppDef, scheduler: str) -> None:
490
+ def _validate(self, app: AppDef, scheduler: str, cfg: LsfOpts) -> None:
490
491
  # Skip validation step for lsf
491
492
  pass
492
493