torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  2. torchx/cli/cmd_delete.py +30 -0
  3. torchx/cli/cmd_list.py +1 -2
  4. torchx/cli/cmd_run.py +202 -28
  5. torchx/cli/cmd_tracker.py +1 -1
  6. torchx/cli/main.py +2 -0
  7. torchx/components/__init__.py +1 -8
  8. torchx/components/dist.py +9 -3
  9. torchx/components/integration_tests/component_provider.py +2 -2
  10. torchx/components/utils.py +1 -1
  11. torchx/distributed/__init__.py +1 -1
  12. torchx/runner/api.py +102 -81
  13. torchx/runner/config.py +3 -1
  14. torchx/runner/events/__init__.py +20 -10
  15. torchx/runner/events/api.py +1 -1
  16. torchx/schedulers/__init__.py +7 -10
  17. torchx/schedulers/api.py +66 -25
  18. torchx/schedulers/aws_batch_scheduler.py +47 -6
  19. torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
  20. torchx/schedulers/docker_scheduler.py +4 -3
  21. torchx/schedulers/ids.py +27 -23
  22. torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
  23. torchx/schedulers/kubernetes_scheduler.py +355 -36
  24. torchx/schedulers/local_scheduler.py +2 -1
  25. torchx/schedulers/lsf_scheduler.py +1 -1
  26. torchx/schedulers/slurm_scheduler.py +102 -27
  27. torchx/specs/__init__.py +40 -9
  28. torchx/specs/api.py +222 -12
  29. torchx/specs/builders.py +109 -28
  30. torchx/specs/file_linter.py +117 -53
  31. torchx/specs/finder.py +25 -37
  32. torchx/specs/named_resources_aws.py +13 -2
  33. torchx/specs/overlays.py +106 -0
  34. torchx/tracker/__init__.py +2 -2
  35. torchx/tracker/api.py +1 -1
  36. torchx/util/entrypoints.py +1 -6
  37. torchx/util/strings.py +1 -1
  38. torchx/util/types.py +12 -1
  39. torchx/version.py +2 -2
  40. torchx/workspace/api.py +102 -5
  41. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
  42. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
  43. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
  44. torchx/examples/pipelines/__init__.py +0 -0
  45. torchx/examples/pipelines/kfp/__init__.py +0 -0
  46. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
  47. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
  48. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
  49. torchx/pipelines/kfp/__init__.py +0 -30
  50. torchx/pipelines/kfp/adapter.py +0 -274
  51. torchx/pipelines/kfp/version.py +0 -19
  52. torchx/schedulers/gcp_batch_scheduler.py +0 -497
  53. torchx/schedulers/ray/ray_common.py +0 -22
  54. torchx/schedulers/ray/ray_driver.py +0 -307
  55. torchx/schedulers/ray_scheduler.py +0 -454
  56. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
  57. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
  58. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
torchx/pipelines/kfp/adapter.py
@@ -1,274 +0,0 @@
- #!/usr/bin/env python3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- # pyre-strict
-
- import json
- import os
- import os.path
- import shlex
- from typing import Mapping, Optional, Tuple
-
- import yaml
- from kfp import components, dsl
-
- # @manual=fbsource//third-party/pypi/kfp:kfp
- from kfp.components.structures import ComponentSpec, OutputSpec
- from kubernetes.client.models import (
-     V1ContainerPort,
-     V1EmptyDirVolumeSource,
-     V1Volume,
-     V1VolumeMount,
- )
- from torchx.schedulers.kubernetes_scheduler import app_to_resource, pod_labels
- from torchx.specs import api
- from typing_extensions import Protocol
-
- from .version import __version__ as __version__  # noqa F401
-
-
- def component_spec_from_app(app: api.AppDef) -> Tuple[str, api.Role]:
-     """
-     component_spec_from_app takes in a TorchX component and generates the yaml
-     spec for it. Notably this doesn't apply resources or port_maps since those
-     must be applied at runtime which is why it returns the role spec as well.
-
-     >>> from torchx import specs
-     >>> from torchx.pipelines.kfp.adapter import component_spec_from_app
-     >>> app_def = specs.AppDef(
-     ...     name="trainer",
-     ...     roles=[specs.Role("trainer", image="foo:latest")],
-     ... )
-     >>> component_spec_from_app(app_def)
-     ('description: ...', Role(...))
-     """
-     assert len(app.roles) == 1, f"KFP adapter only support one role, got {app.roles}"
-
-     role = app.roles[0]
-     assert (
-         role.num_replicas
-         == 1
-         # pyre-fixme[16]: `AppDef` has no attribute `num_replicas`.
-     ), f"KFP adapter only supports one replica, got {app.num_replicas}"
-
-     command = [role.entrypoint, *role.args]
-
-     spec = {
-         "name": f"{app.name}-{role.name}",
-         "description": f"KFP wrapper for TorchX component {app.name}, role {role.name}",
-         "implementation": {
-             "container": {
-                 "image": role.image,
-                 "command": command,
-                 "env": role.env,
-             }
-         },
-         "outputs": [],
-     }
-     return yaml.dump(spec), role
-
-
- class ContainerFactory(Protocol):
-     """
-     ContainerFactory is a protocol that represents a function that when called produces a
-     kfp.dsl.ContainerOp.
-     """
-
-     def __call__(self, *args: object, **kwargs: object) -> dsl.ContainerOp: ...
-
-
- class KFPContainerFactory(ContainerFactory, Protocol):
-     """
-     KFPContainerFactory is a ContainerFactory that also has some KFP metadata
-     attached to it.
-     """
-
-     component_spec: ComponentSpec
-
-
- METADATA_FILE = "/tmp/outputs/mlpipeline-ui-metadata/data.json"
-
-
- def component_from_app(
-     app: api.AppDef, ui_metadata: Optional[Mapping[str, object]] = None
- ) -> ContainerFactory:
-     """
-     component_from_app takes in a TorchX component/AppDef and returns a KFP
-     ContainerOp factory. This is equivalent to the
-     `kfp.components.load_component_from_*
-     <https://kubeflow-pipelines.readthedocs.io/en/1.8.22/source/kfp.components.html#kfp.components.load_component_from_text>`_
-     methods.
-
-     Args:
-         app: The AppDef to generate a KFP container factory for.
-         ui_metadata: KFP UI Metadata to output so you can have model results show
-             up in the UI. See
-             https://www.kubeflow.org/docs/components/pipelines/legacy-v1/sdk/output-viewer/
-             for more info on the format.
-
-     >>> from torchx import specs
-     >>> from torchx.pipelines.kfp.adapter import component_from_app
-     >>> app_def = specs.AppDef(
-     ...     name="trainer",
-     ...     roles=[specs.Role("trainer", image="foo:latest")],
-     ... )
-     >>> component_from_app(app_def)
-     <function component_from_app...>
-     """
-
-     role_spec: api.Role
-     spec, role_spec = component_spec_from_app(app)
-     resources: api.Resource = role_spec.resource
-     assert (
-         len(resources.capabilities) == 0
-     ), f"KFP doesn't support capabilities, got {resources.capabilities}"
-     component_factory: KFPContainerFactory = components.load_component_from_text(spec)
-
-     if ui_metadata is not None:
-         # pyre-fixme[16]: `ComponentSpec` has no attribute `outputs`
-         component_factory.component_spec.outputs.append(
-             OutputSpec(
-                 name="mlpipeline-ui-metadata",
-                 type="MLPipeline UI Metadata",
-                 description="ui metadata",
-             )
-         )
-
-     def factory_wrapper(*args: object, **kwargs: object) -> dsl.ContainerOp:
-         c = component_factory(*args, **kwargs)
-         container = c.container
-
-         if ui_metadata is not None:
-             # We generate the UI metadata from the sidecar so we need to make
-             # both the container and the sidecar share the same tmp directory so
-             # the outputs appear in the original container.
-             c.add_volume(V1Volume(name="tmp", empty_dir=V1EmptyDirVolumeSource()))
-             container.add_volume_mount(
-                 V1VolumeMount(
-                     name="tmp",
-                     mount_path="/tmp/",
-                 )
-             )
-             c.output_artifact_paths["mlpipeline-ui-metadata"] = METADATA_FILE
-             c.add_sidecar(_ui_metadata_sidecar(ui_metadata))
-
-         cpu = resources.cpu
-         if cpu >= 0:
-             cpu_str = f"{int(cpu*1000)}m"
-             container.set_cpu_request(cpu_str)
-             container.set_cpu_limit(cpu_str)
-         mem = resources.memMB
-         if mem >= 0:
-             mem_str = f"{int(mem)}M"
-             container.set_memory_request(mem_str)
-             container.set_memory_limit(mem_str)
-         gpu = resources.gpu
-         if gpu > 0:
-             container.set_gpu_limit(str(gpu))
-
-         for name, port in role_spec.port_map.items():
-             container.add_port(
-                 V1ContainerPort(
-                     name=name,
-                     container_port=port,
-                 ),
-             )
-
-         c.pod_labels.update(pod_labels(app, 0, role_spec, 0, app.name))
-
-         return c
-
-     return factory_wrapper
-
-
- def _ui_metadata_sidecar(
-     ui_metadata: Mapping[str, object], image: str = "alpine"
- ) -> dsl.Sidecar:
-     shell_encoded = shlex.quote(json.dumps(ui_metadata))
-     dirname = os.path.dirname(METADATA_FILE)
-     return dsl.Sidecar(
-         name="ui-metadata-sidecar",
-         image=image,
-         command=[
-             "sh",
-             "-c",
-             f"mkdir -p {dirname}; echo {shell_encoded} > {METADATA_FILE}",
-         ],
-         mirror_volume_mounts=True,
-     )
-
-
- def container_from_app(
-     app: api.AppDef,
-     *args: object,
-     ui_metadata: Optional[Mapping[str, object]] = None,
-     **kwargs: object,
- ) -> dsl.ContainerOp:
-     """
-     container_from_app transforms the app into a KFP component and returns a
-     corresponding ContainerOp instance.
-
-     See component_from_app for description on the arguments. Any unspecified
-     arguments are passed through to the KFP container factory method.
-
-     >>> import kfp
-     >>> from torchx import specs
-     >>> from torchx.pipelines.kfp.adapter import container_from_app
-     >>> app_def = specs.AppDef(
-     ...     name="trainer",
-     ...     roles=[specs.Role("trainer", image="foo:latest")],
-     ... )
-     >>> def pipeline():
-     ...     trainer = container_from_app(app_def)
-     ...     print(trainer)
-     >>> kfp.compiler.Compiler().compile(
-     ...     pipeline_func=pipeline,
-     ...     package_path="/tmp/pipeline.yaml",
-     ... )
-     {'ContainerOp': {... 'name': 'trainer-trainer', ...}}
-     """
-     factory = component_from_app(app, ui_metadata)
-     return factory(*args, **kwargs)
-
-
- def resource_from_app(
-     app: api.AppDef,
-     queue: str,
-     service_account: Optional[str] = None,
- ) -> dsl.ResourceOp:
-     """
-     resource_from_app generates a KFP ResourceOp from the provided app that uses
-     the Volcano job scheduler on Kubernetes to run distributed apps. See
-     https://volcano.sh/en/docs/ for more info on Volcano and how to install.
-
-     Args:
-         app: The torchx AppDef to adapt.
-         queue: the Volcano queue to schedule the operator in.
-
-     >>> import kfp
-     >>> from torchx import specs
-     >>> from torchx.pipelines.kfp.adapter import resource_from_app
-     >>> app_def = specs.AppDef(
-     ...     name="trainer",
-     ...     roles=[specs.Role("trainer", image="foo:latest", num_replicas=3)],
-     ... )
-     >>> def pipeline():
-     ...     trainer = resource_from_app(app_def, queue="test")
-     ...     print(trainer)
-     >>> kfp.compiler.Compiler().compile(
-     ...     pipeline_func=pipeline,
-     ...     package_path="/tmp/pipeline.yaml",
-     ... )
-     {'ResourceOp': {... 'name': 'trainer-0', ... 'name': 'trainer-1', ... 'name': 'trainer-2', ...}}
-     """
-     return dsl.ResourceOp(
-         name=app.name,
-         action="create",
-         success_condition="status.state.phase = Completed",
-         failure_condition="status.state.phase = Failed",
-         k8s_resource=app_to_resource(app, queue, service_account=service_account),
-     )
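
For orientation, the deleted adapter was driven exactly as its own doctest above shows. A minimal sketch of that usage pattern, assuming an older torchx wheel that still ships torchx.pipelines.kfp together with the KFP v1 SDK (kfp<2, where dsl.ContainerOp still exists):

# Sketch based on the doctest of the removed container_from_app; only works
# with the pre-removal torchx.pipelines.kfp package and the KFP v1 SDK.
import kfp
from torchx import specs
from torchx.pipelines.kfp.adapter import container_from_app

app_def = specs.AppDef(
    name="trainer",
    roles=[specs.Role("trainer", image="foo:latest")],
)

def pipeline() -> None:
    # Each call materializes a single-replica ContainerOp from the AppDef.
    trainer = container_from_app(app_def)
    print(trainer)

kfp.compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="/tmp/pipeline.yaml",
)

Multi-replica apps went through resource_from_app instead, which emitted a Volcano job as a KFP ResourceOp rather than a single container.
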
torchx/pipelines/kfp/version.py
@@ -1,19 +0,0 @@
- #!/usr/bin/env python3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- # pyre-strict
-
- # Follows PEP-0440 version scheme guidelines
- # https://www.python.org/dev/peps/pep-0440/#version-scheme
- #
- # Examples:
- # 0.1.0.devN # Developmental release
- # 0.1.0aN # Alpha release
- # 0.1.0bN # Beta release
- # 0.1.0rcN # Release Candidate
- # 0.1.0 # Final release
- __version__ = "0.1.0.dev0"
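
The comments in the removed version.py list the PEP 440 pre-release forms in ascending order. As a quick illustration (not part of the package), this is how those forms compare using the third-party packaging library:

# Sketch: PEP 440 ordering of the release forms listed above,
# checked with packaging.version.Version.
from packaging.version import Version

assert Version("0.1.0.dev0") < Version("0.1.0a1") < Version("0.1.0b1")
assert Version("0.1.0b1") < Version("0.1.0rc1") < Version("0.1.0")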