torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (58)
  1. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  2. torchx/cli/cmd_delete.py +30 -0
  3. torchx/cli/cmd_list.py +1 -2
  4. torchx/cli/cmd_run.py +202 -28
  5. torchx/cli/cmd_tracker.py +1 -1
  6. torchx/cli/main.py +2 -0
  7. torchx/components/__init__.py +1 -8
  8. torchx/components/dist.py +9 -3
  9. torchx/components/integration_tests/component_provider.py +2 -2
  10. torchx/components/utils.py +1 -1
  11. torchx/distributed/__init__.py +1 -1
  12. torchx/runner/api.py +102 -81
  13. torchx/runner/config.py +3 -1
  14. torchx/runner/events/__init__.py +20 -10
  15. torchx/runner/events/api.py +1 -1
  16. torchx/schedulers/__init__.py +7 -10
  17. torchx/schedulers/api.py +66 -25
  18. torchx/schedulers/aws_batch_scheduler.py +47 -6
  19. torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
  20. torchx/schedulers/docker_scheduler.py +4 -3
  21. torchx/schedulers/ids.py +27 -23
  22. torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
  23. torchx/schedulers/kubernetes_scheduler.py +355 -36
  24. torchx/schedulers/local_scheduler.py +2 -1
  25. torchx/schedulers/lsf_scheduler.py +1 -1
  26. torchx/schedulers/slurm_scheduler.py +102 -27
  27. torchx/specs/__init__.py +40 -9
  28. torchx/specs/api.py +222 -12
  29. torchx/specs/builders.py +109 -28
  30. torchx/specs/file_linter.py +117 -53
  31. torchx/specs/finder.py +25 -37
  32. torchx/specs/named_resources_aws.py +13 -2
  33. torchx/specs/overlays.py +106 -0
  34. torchx/tracker/__init__.py +2 -2
  35. torchx/tracker/api.py +1 -1
  36. torchx/util/entrypoints.py +1 -6
  37. torchx/util/strings.py +1 -1
  38. torchx/util/types.py +12 -1
  39. torchx/version.py +2 -2
  40. torchx/workspace/api.py +102 -5
  41. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
  42. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
  43. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
  44. torchx/examples/pipelines/__init__.py +0 -0
  45. torchx/examples/pipelines/kfp/__init__.py +0 -0
  46. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
  47. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
  48. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
  49. torchx/pipelines/kfp/__init__.py +0 -30
  50. torchx/pipelines/kfp/adapter.py +0 -274
  51. torchx/pipelines/kfp/version.py +0 -19
  52. torchx/schedulers/gcp_batch_scheduler.py +0 -497
  53. torchx/schedulers/ray/ray_common.py +0 -22
  54. torchx/schedulers/ray/ray_driver.py +0 -307
  55. torchx/schedulers/ray_scheduler.py +0 -454
  56. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
  57. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
  58. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
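
The most significant removals in this release are the KubeFlow Pipelines adapter (torchx/pipelines/kfp), the Ray scheduler, and the GCP Batch scheduler; the deleted KFP sources appear in full below. A minimal, illustrative sketch for checking at runtime which of these integrations an installed build still ships — the module names are taken from the file list above, and the probe itself is not part of TorchX:

import importlib

# Modules deleted in 2026.1.11 according to the file list above.
for mod in (
    "torchx.pipelines.kfp",
    "torchx.schedulers.ray_scheduler",
    "torchx.schedulers.gcp_batch_scheduler",
):
    try:
        importlib.import_module(mod)
        print(f"{mod}: present in this build")
    except ImportError:
        print(f"{mod}: not in this build")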
torchx/examples/pipelines/kfp/advanced_pipeline.py
@@ -1,289 +0,0 @@
- #!/usr/bin/env python3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- # pyre-strict
-
- """
- Advanced KubeFlow Pipelines Example
- ===================================
-
- This is an example pipeline using KubeFlow Pipelines built with only TorchX
- components.
-
- KFP adapters can be used transform the TorchX components directly into
- something that can be used within KFP.
- """
-
- # %%
- # Input Arguments
- # ###############
- # Lets first define some arguments for the pipeline.
-
- import argparse
- import os.path
- import sys
- from typing import Dict
-
- import kfp
- import torchx
- from torchx import specs
- from torchx.components.dist import ddp as dist_ddp
- from torchx.components.serve import torchserve
- from torchx.components.utils import copy as utils_copy, python as utils_python
- from torchx.pipelines.kfp.adapter import container_from_app
-
-
- parser = argparse.ArgumentParser(description="example kfp pipeline")
-
- # %%
- # TorchX components are built around images. Depending on what scheduler
- # you're using this can vary but for KFP these images are specified as
- # docker containers. We have one container for the example apps and one for
- # the standard built in apps. If you modify the torchx example code you'll
- # need to rebuild the container before launching it on KFP
-
-
- parser.add_argument(
-     "--image",
-     type=str,
-     help="docker image to use for the examples apps",
-     default=torchx.IMAGE,
- )
-
- # %%
- # Most TorchX components use
- # `fsspec <https://filesystem-spec.readthedocs.io/en/latest/>`_ to abstract
- # away dealing with remote filesystems. This allows the components to take
- # paths like ``s3://`` to make it easy to use cloud storage providers.
- parser.add_argument(
-     "--output_path",
-     type=str,
-     help="path to place the data",
-     required=True,
- )
- parser.add_argument("--load_path", type=str, help="checkpoint path to load from")
-
- # %%
- # This example uses the torchserve for inference so we need to specify some
- # options. This assumes you have a TorchServe instance running in the same
- # Kubernetes cluster with with the service name ``torchserve`` in the default
- # namespace.
- #
- # See https://github.com/pytorch/serve/blob/master/kubernetes/README.md for info
- # on how to setup TorchServe.
- parser.add_argument(
-     "--management_api",
-     type=str,
-     help="path to the torchserve management API",
-     default="http://torchserve.default.svc.cluster.local:8081",
- )
- parser.add_argument(
-     "--model_name",
-     type=str,
-     help="the name of the inference model",
-     default="tiny_image_net",
- )
-
- # %% Parse the arguments, you'll need to set these accordingly if running from a
- # notebook.
-
-
- if "NOTEBOOK" in globals():
-     argv = [
-         "--output_path",
-         "/tmp/output",
-     ]
- else:
-     argv = sys.argv[1:]
-
- args: argparse.Namespace = parser.parse_args(argv)
-
- # %%
- # Creating the Components
- # #######################
- # The first step is downloading the data to somewhere we can work on it. For
- # this we can just the builtin copy component. This component takes two valid
- # fsspec paths and copies them from one to another. In this case we're using
- # http as the source and a file under the output_path as the output.
-
-
- data_path: str = os.path.join(args.output_path, "tiny-imagenet-200.zip")
- copy_app: specs.AppDef = utils_copy(
-     "http://cs231n.stanford.edu/tiny-imagenet-200.zip",
-     data_path,
-     image=args.image,
- )
-
- # %%
- # The next component is for data preprocessing. This takes in the raw data from
- # the previous operator and runs some transforms on it for use with the trainer.
- #
- # datapreproc outputs the data to a specified fsspec path. These paths are all
- # specified ahead of time so we have a fully static pipeline.
-
-
- processed_data_path: str = os.path.join(args.output_path, "processed")
- datapreproc_app: specs.AppDef = utils_python(
-     "--output_path",
-     processed_data_path,
-     "--input_path",
-     data_path,
-     "--limit",
-     "100",
-     image=args.image,
-     m="torchx.examples.apps.datapreproc.datapreproc",
-     cpu=1,
-     memMB=1024,
- )
-
- # %%
- # Next we'll create the trainer component that takes in the training data from the
- # previous datapreproc component. We've defined this in a separate component
- # file as you normally would.
- #
- # Having a separate component file allows you to launch your trainer from the
- # TorchX CLI via ``torchx run`` for fast iteration as well as run it from a
- # pipeline in an automated fashion.
-
- # make sure examples is on the path
- if "__file__" in globals():
-     sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
-
-
- logs_path: str = os.path.join(args.output_path, "logs")
- models_path: str = os.path.join(args.output_path, "models")
-
- trainer_app: specs.AppDef = dist_ddp(
-     *(
-         "--output_path",
-         models_path,
-         "--load_path",
-         args.load_path or "",
-         "--log_path",
-         logs_path,
-         "--data_path",
-         processed_data_path,
-         "--epochs",
-         str(1),
-     ),
-     image=args.image,
-     m="torchx.examples.apps.lightning.train",
-     j="1x1",
-     # per node resource settings
-     cpu=1,
-     memMB=3000,
- )
-
- # %%
- # To have the tensorboard path show up in KFPs UI we need to some metadata so
- # KFP knows where to consume the metrics from.
- #
- # This will get used when we create the KFP container.
-
-
- ui_metadata: Dict[str, object] = {
-     "outputs": [
-         {
-             "type": "tensorboard",
-             "source": os.path.join(logs_path, "lightning_logs"),
-         }
-     ]
- }
-
- # %%
- # For the inference, we're leveraging one of the builtin TorchX components. This
- # component takes in a model and uploads it to the TorchServe management API
- # endpoints.
-
-
- serve_app: specs.AppDef = torchserve(
-     model_path=os.path.join(models_path, "model.mar"),
-     management_api=args.management_api,
-     image=args.image,
-     params={
-         "model_name": args.model_name,
-         # set this to allocate a worker
-         # "initial_workers": 1,
-     },
- )
-
- # %%
- # For model interpretability we're leveraging a custom component stored in it's
- # own component file. This component takes in the output from datapreproc and
- # train components and produces images with integrated gradient results.
-
- interpret_path: str = os.path.join(args.output_path, "interpret")
- interpret_app: specs.AppDef = utils_python(
-     *(
-         "--load_path",
-         os.path.join(models_path, "last.ckpt"),
-         "--data_path",
-         processed_data_path,
-         "--output_path",
-         interpret_path,
-     ),
-     image=args.image,
-     m="torchx.examples.apps.lightning.interpret",
- )
-
- # %%
- # Pipeline Definition
- # ###################
- # The last step is to define the actual pipeline using the torchx components via
- # the KFP adapter and export the pipeline package that can be uploaded to a KFP
- # cluster.
- #
- # The KFP adapter currently doesn't track the input and outputs so the
- # containers need to have their dependencies specified via `.after()`.
- #
- # We call `.set_tty()` to make the logs from the components more responsive for
- # example purposes.
-
-
- def pipeline() -> None:
-     # container_from_app creates a KFP container from the TorchX app
-     # definition.
-     copy = container_from_app(copy_app)
-     copy.container.set_tty()
-
-     datapreproc = container_from_app(datapreproc_app)
-     datapreproc.container.set_tty()
-     datapreproc.after(copy)
-
-     # For the trainer we want to log that UI metadata so you can access
-     # tensorboard from the UI.
-     trainer = container_from_app(trainer_app, ui_metadata=ui_metadata)
-     trainer.container.set_tty()
-     trainer.after(datapreproc)
-
-     if False:
-         serve = container_from_app(serve_app)
-         serve.container.set_tty()
-         serve.after(trainer)
-
-     if False:
-         # Serve and interpret only require the trained model so we can run them
-         # in parallel to each other.
-         interpret = container_from_app(interpret_app)
-         interpret.container.set_tty()
-         interpret.after(trainer)
-
-
- kfp.compiler.Compiler().compile(
-     pipeline_func=pipeline,
-     package_path="pipeline.yaml",
- )
-
- with open("pipeline.yaml", "rt") as f:
-     print(f.read())
-
- # %%
- # Once this has all run you should have a pipeline file (typically
- # pipeline.yaml) that you can upload to your KFP cluster via the UI or
- # a kfp.Client.
-
- # sphinx_gallery_thumbnail_path = '_static/img/gallery-kfp.png'
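
The pipeline above relies on explicit `.after()` calls because the adapter does not model data dependencies between containers. A stripped-down, illustrative sketch of that ordering pattern, assuming kfp 1.x and a torchx build (2025.8.5 or earlier) that still ships torchx.pipelines.kfp — the `_echo` helper is hypothetical:

import kfp
from torchx import specs
from torchx.pipelines.kfp.adapter import container_from_app


def _echo(name: str, msg: str) -> specs.AppDef:
    # Single-role, single-replica AppDef: the only shape the KFP adapter supports.
    return specs.AppDef(
        name=name,
        roles=[
            specs.Role(name=name, image="alpine", entrypoint="/bin/echo", args=[msg])
        ],
    )


def two_step_pipeline() -> None:
    first = container_from_app(_echo("step1", "one"))
    second = container_from_app(_echo("step2", "two"))
    # The adapter does not infer dependencies, so order the steps explicitly.
    second.after(first)


kfp.compiler.Compiler().compile(
    pipeline_func=two_step_pipeline,
    package_path="two_step.yaml",
)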
torchx/examples/pipelines/kfp/dist_pipeline.py
@@ -1,71 +0,0 @@
- #!/usr/bin/env python3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- # pyre-strict
-
- """
- Distributed KubeFlow Pipelines Example
- ======================================
-
- This is an example KFP pipeline that uses resource_from_app to launch a
- distributed operator using the kubernetes/volcano job scheduler. This only works
- in Kubernetes KFP clusters with https://volcano.sh/en/docs/ installed on them.
- """
-
- import kfp
- from torchx import specs
- from torchx.pipelines.kfp.adapter import resource_from_app
-
-
- def pipeline() -> None:
-     # First we define our AppDef for the component, we set
-     echo_app = specs.AppDef(
-         name="test-dist",
-         roles=[
-             specs.Role(
-                 name="dist-echo",
-                 image="alpine",
-                 entrypoint="/bin/echo",
-                 args=["hello dist!"],
-                 num_replicas=3,
-             ),
-         ],
-     )
-
-     # To convert the TorchX AppDef into a KFP container we use
-     # the resource_from_app adapter. This takes generates a KFP Kubernetes
-     # resource operator definition from the TorchX app def and instantiates it.
-     echo_container: kfp.dsl.BaseOp = resource_from_app(echo_app, queue="default")
-
-
- # %%
- # To generate the pipeline definition file we need to call into the KFP compiler
- # with our pipeline function.
-
- kfp.compiler.Compiler().compile(
-     pipeline_func=pipeline,
-     package_path="pipeline.yaml",
- )
-
- with open("pipeline.yaml", "rt") as f:
-     print(f.read())
-
- # %%
- # Once this has all run you should have a pipeline file (typically
- # pipeline.yaml) that you can upload to your KFP cluster via the UI or
- # a kfp.Client.
- #
- # See the
- # `KFP SDK Examples <https://www.kubeflow.org/docs/components/pipelines/legacy-v1/tutorials/sdk-examples/#examples>`_
- # for more info on launching KFP pipelines.
-
- # %%
- # See the :ref:`examples_pipelines/kfp/advanced_pipeline:Advanced KubeFlow Pipelines Example` for how to chain multiple
- # components together and use builtin components.
-
-
- # sphinx_gallery_thumbnail_path = '_static/img/gallery-kfp.png'
torchx/examples/pipelines/kfp/intro_pipeline.py
@@ -1,83 +0,0 @@
- #!/usr/bin/env python3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- # pyre-strict
-
- """
- Intro KubeFlow Pipelines Example
- ================================
-
- This an introductory pipeline using KubeFlow Pipelines built with only TorchX
- components.
-
- TorchX is intended to allow making cross platform components. As such, we have
- a standard definition that uses adapters to convert it to the specific
- pipeline platform. This is an example of using the KFP adapter to run a TorchX
- component as part of a KubeFlow Pipeline.
-
- TorchX tries to leverage standard mechanisms wherever possible. For KFP we use
- the existing KFP pipeline definition syntax and add a single
- `component_from_app` conversion step to convert a TorchX component into one
- KFP can understand.
-
- Typically you have a separate component file but for this example we define the
- AppDef inline.
- """
-
- import kfp
- from torchx import specs
- from torchx.pipelines.kfp.adapter import container_from_app
-
-
- def pipeline() -> None:
-     # First we define our AppDef for the component. AppDef is a core part of TorchX
-     # and can be used to describe complex distributed multi container apps or
-     # just a single node component like here.
-     echo_app: specs.AppDef = specs.AppDef(
-         name="examples-intro",
-         roles=[
-             specs.Role(
-                 name="worker",
-                 entrypoint="/bin/echo",
-                 args=["Hello TorchX!"],
-                 image="alpine",
-             )
-         ],
-     )
-
-     # To convert the TorchX AppDef into a KFP container we use
-     # the container_from_app adapter. This takes generates a KFP component
-     # definition from the TorchX app def and instantiates it into a container.
-     echo_container: kfp.dsl.ContainerOp = container_from_app(echo_app)
-
-
- # %%
- # To generate the pipeline definition file we need to call into the KFP compiler
- # with our pipeline function.
-
- kfp.compiler.Compiler().compile(
-     pipeline_func=pipeline,
-     package_path="pipeline.yaml",
- )
-
- with open("pipeline.yaml", "rt") as f:
-     print(f.read())
-
- # %%
- # Once this has all run you should have a pipeline file (typically
- # pipeline.yaml) that you can upload to your KFP cluster via the UI or
- # a kfp.Client.
- #
- # See the
- # `KFP SDK Examples <https://www.kubeflow.org/docs/components/pipelines/legacy-v1/tutorials/sdk-examples/#examples>`_
- # for more info on launching KFP pipelines.
-
- # %%
- # See the :ref:`examples_pipelines/kfp/advanced_pipeline:Advanced KubeFlow Pipelines Example` for how to chain multiple
- # components together and use builtin components.
-
- # sphinx_gallery_thumbnail_path = '_static/img/gallery-kfp.png'
torchx/pipelines/kfp/__init__.py
@@ -1,30 +0,0 @@
- #!/usr/bin/env python3
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- # pyre-strict
-
- """
- This module contains adapters for converting TorchX components into KubeFlow
- Pipeline components.
-
- The current KFP adapters only support single node (1 role and 1 replica)
- components.
- """
-
- import kfp
-
- from .version import __version__ as __version__ # noqa F401
-
-
- def _check_kfp_version() -> None:
-     if not kfp.__version__.startswith("1."):
-         raise ImportError(
-             f"Only kfp version 1.x.x is supported! kfp version {kfp.__version__}"
-         )
-
-
- _check_kfp_version()
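
Because `_check_kfp_version()` runs at module import time, the guard above fails fast rather than at first use. A small illustrative sketch of the resulting behavior, assuming a pre-removal torchx build (2025.8.5 or earlier) with kfp 2.x installed:

# Hypothetical session: importing the adapter package under an unsupported kfp.
try:
    import torchx.pipelines.kfp  # the module-level _check_kfp_version() runs here
except ImportError as err:
    print(f"unsupported kfp: {err}")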