torchx-nightly 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.

Potentially problematic release.

Files changed (110)
  1. torchx/__init__.py +2 -0
  2. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  3. torchx/apps/serve/serve.py +2 -0
  4. torchx/apps/utils/booth_main.py +2 -0
  5. torchx/apps/utils/copy_main.py +2 -0
  6. torchx/apps/utils/process_monitor.py +2 -0
  7. torchx/cli/__init__.py +2 -0
  8. torchx/cli/argparse_util.py +38 -3
  9. torchx/cli/cmd_base.py +2 -0
  10. torchx/cli/cmd_cancel.py +2 -0
  11. torchx/cli/cmd_configure.py +2 -0
  12. torchx/cli/cmd_delete.py +30 -0
  13. torchx/cli/cmd_describe.py +2 -0
  14. torchx/cli/cmd_list.py +8 -4
  15. torchx/cli/cmd_log.py +6 -24
  16. torchx/cli/cmd_run.py +269 -45
  17. torchx/cli/cmd_runopts.py +2 -0
  18. torchx/cli/cmd_status.py +12 -1
  19. torchx/cli/cmd_tracker.py +3 -1
  20. torchx/cli/colors.py +2 -0
  21. torchx/cli/main.py +4 -0
  22. torchx/components/__init__.py +3 -8
  23. torchx/components/component_test_base.py +2 -0
  24. torchx/components/dist.py +18 -7
  25. torchx/components/integration_tests/component_provider.py +4 -2
  26. torchx/components/integration_tests/integ_tests.py +2 -0
  27. torchx/components/serve.py +2 -0
  28. torchx/components/structured_arg.py +4 -3
  29. torchx/components/utils.py +15 -4
  30. torchx/distributed/__init__.py +2 -4
  31. torchx/examples/apps/datapreproc/datapreproc.py +2 -0
  32. torchx/examples/apps/lightning/data.py +5 -3
  33. torchx/examples/apps/lightning/model.py +7 -6
  34. torchx/examples/apps/lightning/profiler.py +7 -4
  35. torchx/examples/apps/lightning/train.py +11 -2
  36. torchx/examples/torchx_out_of_sync_training.py +11 -0
  37. torchx/notebook.py +2 -0
  38. torchx/runner/__init__.py +2 -0
  39. torchx/runner/api.py +167 -60
  40. torchx/runner/config.py +43 -10
  41. torchx/runner/events/__init__.py +57 -13
  42. torchx/runner/events/api.py +14 -3
  43. torchx/runner/events/handlers.py +2 -0
  44. torchx/runtime/tracking/__init__.py +2 -0
  45. torchx/runtime/tracking/api.py +2 -0
  46. torchx/schedulers/__init__.py +16 -15
  47. torchx/schedulers/api.py +70 -14
  48. torchx/schedulers/aws_batch_scheduler.py +75 -6
  49. torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
  50. torchx/schedulers/devices.py +17 -4
  51. torchx/schedulers/docker_scheduler.py +43 -11
  52. torchx/schedulers/ids.py +29 -23
  53. torchx/schedulers/kubernetes_mcad_scheduler.py +9 -7
  54. torchx/schedulers/kubernetes_scheduler.py +383 -38
  55. torchx/schedulers/local_scheduler.py +100 -27
  56. torchx/schedulers/lsf_scheduler.py +5 -4
  57. torchx/schedulers/slurm_scheduler.py +336 -20
  58. torchx/schedulers/streams.py +2 -0
  59. torchx/specs/__init__.py +89 -12
  60. torchx/specs/api.py +418 -30
  61. torchx/specs/builders.py +176 -38
  62. torchx/specs/file_linter.py +143 -57
  63. torchx/specs/finder.py +68 -28
  64. torchx/specs/named_resources_aws.py +181 -4
  65. torchx/specs/named_resources_generic.py +2 -0
  66. torchx/specs/overlays.py +106 -0
  67. torchx/specs/test/components/__init__.py +2 -0
  68. torchx/specs/test/components/a/__init__.py +2 -0
  69. torchx/specs/test/components/a/b/__init__.py +2 -0
  70. torchx/specs/test/components/a/b/c.py +2 -0
  71. torchx/specs/test/components/c/__init__.py +2 -0
  72. torchx/specs/test/components/c/d.py +2 -0
  73. torchx/tracker/__init__.py +12 -6
  74. torchx/tracker/api.py +15 -18
  75. torchx/tracker/backend/fsspec.py +2 -0
  76. torchx/util/cuda.py +2 -0
  77. torchx/util/datetime.py +2 -0
  78. torchx/util/entrypoints.py +39 -15
  79. torchx/util/io.py +2 -0
  80. torchx/util/log_tee_helpers.py +210 -0
  81. torchx/util/modules.py +65 -0
  82. torchx/util/session.py +42 -0
  83. torchx/util/shlex.py +2 -0
  84. torchx/util/strings.py +3 -1
  85. torchx/util/types.py +90 -29
  86. torchx/version.py +4 -2
  87. torchx/workspace/__init__.py +2 -0
  88. torchx/workspace/api.py +136 -6
  89. torchx/workspace/dir_workspace.py +2 -0
  90. torchx/workspace/docker_workspace.py +30 -2
  91. torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
  92. torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
  93. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
  94. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
  95. torchx/examples/pipelines/__init__.py +0 -0
  96. torchx/examples/pipelines/kfp/__init__.py +0 -0
  97. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
  98. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
  99. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
  100. torchx/pipelines/kfp/__init__.py +0 -28
  101. torchx/pipelines/kfp/adapter.py +0 -271
  102. torchx/pipelines/kfp/version.py +0 -17
  103. torchx/schedulers/gcp_batch_scheduler.py +0 -487
  104. torchx/schedulers/ray/ray_common.py +0 -22
  105. torchx/schedulers/ray/ray_driver.py +0 -307
  106. torchx/schedulers/ray_scheduler.py +0 -453
  107. torchx_nightly-2024.1.6.dist-info/METADATA +0 -176
  108. torchx_nightly-2024.1.6.dist-info/RECORD +0 -118
  109. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
  110. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
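
TorchX discovers pluggable backends such as schedulers through Python entry points (note the one-line change to `entry_points.txt` above, consistent with the removed integrations). As a rough illustration of that discovery pattern, here is a minimal standard-library sketch; the group name `torchx.schedulers` follows TorchX's documented custom-scheduler registration and is an assumption, not something shown in this diff:

```python
# Minimal sketch: discover scheduler factories advertised via package entry
# points (Python 3.10+ importlib.metadata API). The group name is assumed.
from importlib.metadata import entry_points


def load_scheduler_factories(group: str = "torchx.schedulers") -> dict:
    factories = {}
    for ep in entry_points(group=group):
        # an entry looks like: ray = torchx.schedulers.ray_scheduler:create_scheduler
        try:
            factories[ep.name] = ep.load()  # imports the backing module lazily
        except ImportError:
            pass  # the backend's optional dependency (e.g. ray) is not installed
    return factories
```
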
torchx/schedulers/ray_scheduler.py
@@ -1,453 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the BSD-style license found in the
- # LICENSE file in the root directory of this source tree.
-
- import dataclasses
- import json
- import logging
- import os
- import re
- import tempfile
- import time
- from dataclasses import dataclass, field
- from datetime import datetime
- from shutil import copy2, rmtree
- from typing import Any, cast, Dict, Final, Iterable, List, Optional, Tuple  # noqa
-
- import urllib3
-
- from torchx.schedulers.api import (
-     AppDryRunInfo,
-     AppState,
-     DescribeAppResponse,
-     filter_regex,
-     ListAppResponse,
-     Scheduler,
-     split_lines,
-     Stream,
- )
- from torchx.schedulers.ids import make_unique
- from torchx.schedulers.ray.ray_common import RayActor, TORCHX_RANK0_HOST
- from torchx.specs import AppDef, macros, NONE, ReplicaStatus, Role, RoleStatus, runopts
- from torchx.workspace.dir_workspace import TmpDirWorkspaceMixin
- from typing_extensions import TypedDict
-
-
- try:
-     from ray.autoscaler import sdk as ray_autoscaler_sdk
-     from ray.dashboard.modules.job.common import JobStatus
-     from ray.dashboard.modules.job.sdk import JobSubmissionClient
-
-     _has_ray = True
-
- except ImportError:
-     _has_ray = False
-
-
- def has_ray() -> bool:
-     """Indicates whether Ray is installed in the current Python environment."""
-     return _has_ray
-
-
- class RayOpts(TypedDict, total=False):
-     cluster_config_file: Optional[str]
-     cluster_name: Optional[str]
-     dashboard_address: Optional[str]
-     working_dir: Optional[str]
-     requirements: Optional[str]
-
-
- if _has_ray:
-     _logger: logging.Logger = logging.getLogger(__name__)
-
-     _ray_status_to_torchx_appstate: Dict[JobStatus, AppState] = {
-         JobStatus.PENDING: AppState.PENDING,
-         JobStatus.RUNNING: AppState.RUNNING,
-         JobStatus.SUCCEEDED: AppState.SUCCEEDED,
-         JobStatus.FAILED: AppState.FAILED,
-         JobStatus.STOPPED: AppState.CANCELLED,
-     }
-
-     class _EnhancedJSONEncoder(json.JSONEncoder):
-         def default(self, o: RayActor):  # pyre-ignore[3]
-             if dataclasses.is_dataclass(o):
-                 return dataclasses.asdict(o)
-             return super().default(o)
-
-     def serialize(
-         actors: List[RayActor], dirpath: str, output_filename: str = "actors.json"
-     ) -> None:
-         actors_json = json.dumps(actors, cls=_EnhancedJSONEncoder)
-         with open(os.path.join(dirpath, output_filename), "w") as tmp:
-             json.dump(actors_json, tmp)
-
-     @dataclass
-     class RayJob:
-         """Represents a job that should be run on a Ray cluster.
-
-         Attributes:
-             app_id:
-                 The unique ID of the application (a.k.a. job).
-             cluster_config_file:
-                 The Ray cluster configuration file.
-             cluster_name:
-                 The cluster name to use.
-             dashboard_address:
-                 The existing dashboard IP address to connect to.
-             working_dir:
-                 The working directory to copy to the cluster.
-             requirements:
-                 The libraries to install on the cluster per requirements.txt.
-             actors:
-                 The Ray actors which represent the job to be run. This attribute is
-                 dumped to a JSON file and copied to the cluster where `ray_driver.py`
-                 uses it to initiate the job.
-         """
-
-         app_id: str
-         working_dir: str
-         cluster_config_file: Optional[str] = None
-         cluster_name: Optional[str] = None
-         dashboard_address: Optional[str] = None
-         requirements: Optional[str] = None
-         actors: List[RayActor] = field(default_factory=list)
-
-     class RayScheduler(TmpDirWorkspaceMixin, Scheduler[RayOpts]):
-         """
-         RayScheduler is a TorchX scheduling interface to Ray. The job def
-         workers will be launched as Ray actors.
-
-         The job environment is specified by the TorchX workspace. Any files in
-         the workspace will be present in the Ray job unless specified in
-         ``.torchxignore``. Python dependencies will be read from the
-         ``requirements.txt`` file located at the root of the workspace unless
-         it's overridden via ``-c ...,requirements=foo/requirements.txt``.
-
-         **Config Options**
-
-         .. runopts::
-             class: torchx.schedulers.ray_scheduler.create_scheduler
-
-         **Compatibility**
-
-         .. compatibility::
-             type: scheduler
-             features:
-                 cancel: true
-                 logs: |
-                     Partial support. Ray only supports a single log stream so
-                     only a dummy "ray/0" combined log role is supported.
-                     Tailing and time seeking are not supported.
-                 distributed: true
-                 describe: |
-                     Partial support. RayScheduler will return job status but
-                     does not provide the complete original AppSpec.
-                 workspaces: true
-                 mounts: false
-                 elasticity: Partial support. Multi role jobs are not supported.
-
-         """
-
-         def __init__(
-             self, session_name: str, ray_client: Optional[JobSubmissionClient] = None
-         ) -> None:
-             # NOTE: make sure any new init options are supported in create_scheduler(...)
-             super().__init__("ray", session_name)
-
-             # w/o Final, the None check in _get_ray_client does not work as pyre assumes mutability
-             self._ray_client: Final[Optional[JobSubmissionClient]] = ray_client
-
-         def _get_ray_client(
-             self, job_submission_netloc: Optional[str] = None
-         ) -> JobSubmissionClient:
-             if self._ray_client is not None:
-                 client_netloc = urllib3.util.parse_url(
-                     self._ray_client.get_address()
-                 ).netloc
-                 if job_submission_netloc and job_submission_netloc != client_netloc:
-                     raise ValueError(
-                         f"client netloc ({client_netloc}) does not match job netloc ({job_submission_netloc})"
-                     )
-                 return self._ray_client
-             elif os.getenv("RAY_ADDRESS"):
-                 return JobSubmissionClient(os.getenv("RAY_ADDRESS"))
-             elif not job_submission_netloc:
-                 raise Exception(
-                     "RAY_ADDRESS env variable or a scheduler with an attached Ray JobSubmissionClient is expected."
-                     " See https://docs.ray.io/en/latest/cluster/jobs-package-ref.html#job-submission-sdk for more info"
-                 )
-             return JobSubmissionClient(f"http://{job_submission_netloc}")
-
-         # TODO: Add address as a potential CLI argument after writing ray.status() or passing in config file
-         def _run_opts(self) -> runopts:
-             opts = runopts()
-             opts.add(
-                 "cluster_config_file",
-                 type_=str,
-                 required=False,
-                 help="Use CLUSTER_CONFIG_FILE to access or create the Ray cluster.",
-             )
-             opts.add(
-                 "cluster_name",
-                 type_=str,
-                 help="Override the configured cluster name.",
-             )
-             opts.add(
-                 "dashboard_address",
-                 type_=str,
-                 required=False,
-                 default="127.0.0.1:8265",
-                 help="Use ray status to get the dashboard address you will submit jobs against",
-             )
-             opts.add("requirements", type_=str, help="Path to requirements.txt")
-             return opts
-
-         def schedule(self, dryrun_info: AppDryRunInfo[RayJob]) -> str:
-             cfg: RayJob = dryrun_info.request
-
-             # Create serialized actors for ray_driver.py
-             actors = cfg.actors
-             dirpath = cfg.working_dir
-             serialize(actors, dirpath)
-
-             job_submission_addr: str = ""
-             if cfg.cluster_config_file:
-                 job_submission_addr = ray_autoscaler_sdk.get_head_node_ip(
-                     cfg.cluster_config_file
-                 )  # pragma: no cover
-             elif cfg.dashboard_address:
-                 job_submission_addr = cfg.dashboard_address
-             else:
-                 raise RuntimeError(
-                     "Either `dashboard_address` or `cluster_config_file` must be specified"
-                 )
-
-             # 0. Create Job Client
-             client = self._get_ray_client(job_submission_netloc=job_submission_addr)
-
-             # 1. Copy Ray driver utilities
-             current_directory = os.path.dirname(os.path.abspath(__file__))
-             copy2(os.path.join(current_directory, "ray", "ray_driver.py"), dirpath)
-             copy2(os.path.join(current_directory, "ray", "ray_common.py"), dirpath)
-             runtime_env = {"working_dir": dirpath}
-             if cfg.requirements:
-                 runtime_env["pip"] = cfg.requirements
-
-             # 2. Submit Job via the Ray Job Submission API
-             try:
-                 job_id: str = client.submit_job(
-                     submission_id=cfg.app_id,
-                     # we will pack, hash, zip, upload, register working_dir in GCS of ray cluster
-                     # and use it to configure your job execution.
-                     entrypoint="python3 ray_driver.py",
-                     runtime_env=runtime_env,
-                 )
-
-             finally:
-                 if dirpath.startswith(tempfile.gettempdir()):
-                     rmtree(dirpath)
-
-             # Encode job submission client in job_id
-             return f"{job_submission_addr}-{job_id}"
-
-         def _submit_dryrun(self, app: AppDef, cfg: RayOpts) -> AppDryRunInfo[RayJob]:
-             app_id = make_unique(app.name)
-
-             working_dir = app.roles[0].image
-             if not os.path.exists(working_dir):
-                 raise RuntimeError(
-                     f"Role image must be a valid directory, got: {working_dir} "
-                 )
-
-             requirements: Optional[str] = cfg.get("requirements")
-             if requirements is None:
-                 workspace_reqs = os.path.join(working_dir, "requirements.txt")
-                 if os.path.exists(workspace_reqs):
-                     requirements = workspace_reqs
-
-             cluster_cfg = cfg.get("cluster_config_file")
-             if cluster_cfg:
-                 if not isinstance(cluster_cfg, str) or not os.path.isfile(cluster_cfg):
-                     raise ValueError(
-                         "The cluster configuration file must be a YAML file."
-                     )
-
-                 job: RayJob = RayJob(
-                     app_id,
-                     cluster_config_file=cluster_cfg,
-                     requirements=requirements,
-                     working_dir=working_dir,
-                 )
-
-             else:  # pragma: no cover
-                 dashboard_address = cfg.get("dashboard_address")
-                 job: RayJob = RayJob(
-                     app_id=app_id,
-                     dashboard_address=dashboard_address,
-                     requirements=requirements,
-                     working_dir=working_dir,
-                 )
-             job.cluster_name = cfg.get("cluster_name")
-
-             for role in app.roles:
-                 for replica_id in range(role.num_replicas):
-                     # Replace the ${img_root}, ${app_id}, and ${replica_id} placeholders
-                     # in arguments and environment variables.
-                     replica_role = macros.Values(
-                         img_root=role.image,
-                         app_id=app_id,
-                         replica_id=str(replica_id),
-                         rank0_env=TORCHX_RANK0_HOST,
-                     ).apply(role)
-
-                     actor = RayActor(
-                         name=role.name,
-                         min_replicas=role.min_replicas,
-                         command=[replica_role.entrypoint] + replica_role.args,
-                         env=replica_role.env,
-                         num_cpus=max(1, replica_role.resource.cpu),
-                         num_gpus=max(0, replica_role.resource.gpu),
-                     )
-
-                     job.actors.append(actor)
-
-             if len(app.roles) > 1 and app.roles[0].min_replicas is not None:
-                 raise ValueError("min_replicas is only supported with single role jobs")
-
-             return AppDryRunInfo(job, repr)
-
-         def _validate(self, app: AppDef, scheduler: str) -> None:
-             if scheduler != "ray":
-                 raise ValueError(
-                     f"An unknown scheduler backend '{scheduler}' has been passed to the Ray scheduler."
-                 )
-
-             if app.metadata:
-                 _logger.warning("The Ray scheduler does not use metadata information.")
-
-             for role in app.roles:
-                 if role.resource.capabilities:
-                     _logger.warning(
-                         "The Ray scheduler does not support custom resource capabilities."
-                     )
-                     break
-
-             for role in app.roles:
-                 if role.port_map:
-                     _logger.warning("The Ray scheduler does not support port mapping.")
-                     break
-
-         def wait_until_finish(self, app_id: str, timeout: int = 30) -> None:
-             """
-             ``wait_until_finish`` waits until the specified job has finished
-             with a given timeout. This is intended for testing. Programmatic
-             usage should use the runner wait method instead.
-             """
-
-             start = time.time()
-             while time.time() - start <= timeout:
-                 status_info = self._get_job_status(app_id)
-                 status = status_info
-                 if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
-                     break
-                 time.sleep(1)
-
-         def _parse_app_id(self, app_id: str) -> Tuple[str, str]:
-             # find index of '-' in the first :\d+-
-             m = re.search(r":\d+-", app_id)
-             if m:
-                 sep = m.span()[1]
-                 addr = app_id[: sep - 1]
-                 app_id = app_id[sep:]
-                 return addr, app_id
-
-             addr, _, app_id = app_id.partition("-")
-             return addr, app_id
-
-         def _cancel_existing(self, app_id: str) -> None:  # pragma: no cover
-             addr, app_id = self._parse_app_id(app_id)
-             client = self._get_ray_client(job_submission_netloc=addr)
-             client.stop_job(app_id)
-
-         def _get_job_status(self, app_id: str) -> JobStatus:
-             addr, app_id = self._parse_app_id(app_id)
-             client = self._get_ray_client(job_submission_netloc=addr)
-             status = client.get_job_status(app_id)
-             if isinstance(status, str):
-                 return cast(JobStatus, status)
-             return status.status
-
-         def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
-             job_status_info = self._get_job_status(app_id)
-             state = _ray_status_to_torchx_appstate[job_status_info]
-             roles = [Role(name="ray", num_replicas=1, image="<N/A>")]
-
-             # get ip_address and put it in hostname
-
-             roles_statuses = [
-                 RoleStatus(
-                     role="ray",
-                     replicas=[
-                         ReplicaStatus(
-                             id=0,
-                             role="ray",
-                             hostname=NONE,
-                             state=state,
-                         )
-                     ],
-                 )
-             ]
-             return DescribeAppResponse(
-                 app_id=app_id,
-                 state=state,
-                 msg=job_status_info,
-                 roles_statuses=roles_statuses,
-                 roles=roles,
-             )
-
-         def log_iter(
-             self,
-             app_id: str,
-             role_name: Optional[str] = None,
-             k: int = 0,
-             regex: Optional[str] = None,
-             since: Optional[datetime] = None,
-             until: Optional[datetime] = None,
-             should_tail: bool = False,
-             streams: Optional[Stream] = None,
-         ) -> Iterable[str]:
-             # TODO: support tailing, streams etc.
-             addr, app_id = self._parse_app_id(app_id)
-             client: JobSubmissionClient = self._get_ray_client(
-                 job_submission_netloc=addr
-             )
-             logs: str = client.get_job_logs(app_id)
-             iterator = split_lines(logs)
-             if regex:
-                 return filter_regex(regex, iterator)
-             return iterator
-
-         def list(self) -> List[ListAppResponse]:
-             client = self._get_ray_client()
-             jobs = client.list_jobs()
-             netloc = urllib3.util.parse_url(client.get_address()).netloc
-             return [
-                 ListAppResponse(
-                     app_id=f"{netloc}-{details.submission_id}",
-                     state=_ray_status_to_torchx_appstate[details.status],
-                 )
-                 for details in jobs
-             ]
-
-
- def create_scheduler(
-     session_name: str, ray_client: Optional[JobSubmissionClient] = None, **kwargs: Any
- ) -> "RayScheduler":
-     if not has_ray():  # pragma: no cover
-         raise ModuleNotFoundError(
-             "Ray is not installed in the current Python environment."
-         )
-
-     return RayScheduler(session_name=session_name, ray_client=ray_client)
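
One detail worth calling out in the removed scheduler is its app handle scheme: `schedule()` returned `f"{job_submission_addr}-{job_id}"`, and `_parse_app_id()` recovered the dashboard netloc by splitting right after the first `:<port>-` so that `host:port` addresses survive the embedded `-` separator. A standalone sketch of that round trip (the function names and the sample job ID are illustrative; only the regex and format string come from the code above):

```python
import re
from typing import Tuple


def encode_app_id(netloc: str, job_id: str) -> str:
    # schedule() encoded the job submission address into the returned handle.
    return f"{netloc}-{job_id}"


def parse_app_id(app_id: str) -> Tuple[str, str]:
    # Same logic as the removed _parse_app_id: split right after the first
    # ":<digits>-"; fall back to the first "-" for port-less addresses.
    m = re.search(r":\d+-", app_id)
    if m:
        sep = m.span()[1]
        return app_id[: sep - 1], app_id[sep:]
    addr, _, job_id = app_id.partition("-")
    return addr, job_id


assert parse_app_id(encode_app_id("127.0.0.1:8265", "raysubmit_J5YF2Q")) == (
    "127.0.0.1:8265",
    "raysubmit_J5YF2Q",
)
```
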
torchx_nightly-2024.1.6.dist-info/METADATA
@@ -1,176 +0,0 @@
- Metadata-Version: 2.1
- Name: torchx-nightly
- Version: 2024.1.6
- Summary: TorchX SDK and Components
- Home-page: https://github.com/pytorch/torchx
- Author: TorchX Devs
- Author-email: torchx@fb.com
- License: BSD-3
- Keywords: pytorch,machine learning
- Platform: UNKNOWN
- Classifier: Development Status :: 4 - Beta
- Classifier: Intended Audience :: Developers
- Classifier: Intended Audience :: Science/Research
- Classifier: License :: OSI Approved :: BSD License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
- Requires-Python: >=3.7
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: pyre-extensions
- Requires-Dist: docstring-parser >=0.8.1
- Requires-Dist: importlib-metadata
- Requires-Dist: pyyaml
- Requires-Dist: docker
- Requires-Dist: filelock
- Requires-Dist: fsspec ==2023.10.0
- Requires-Dist: urllib3 <1.27,>=1.21.1
- Requires-Dist: tabulate
- Provides-Extra: aws_batch
- Requires-Dist: boto3 ; extra == 'aws_batch'
- Provides-Extra: dev
- Requires-Dist: aiobotocore ; extra == 'dev'
- Requires-Dist: ax-platform[mysql] ==0.2.3 ; extra == 'dev'
- Requires-Dist: black ==23.3.0 ; extra == 'dev'
- Requires-Dist: boto3 ; extra == 'dev'
- Requires-Dist: captum >=0.4.0 ; extra == 'dev'
- Requires-Dist: docker ; extra == 'dev'
- Requires-Dist: flake8 ==3.9.0 ; extra == 'dev'
- Requires-Dist: fsspec[s3] ==2023.10.0 ; extra == 'dev'
- Requires-Dist: google-api-core ; extra == 'dev'
- Requires-Dist: google-cloud-batch >=0.5.0 ; extra == 'dev'
- Requires-Dist: google-cloud-logging >=3.0.0 ; extra == 'dev'
- Requires-Dist: google-cloud-runtimeconfig >=0.33.2 ; extra == 'dev'
- Requires-Dist: hydra-core ; extra == 'dev'
- Requires-Dist: ipython ; extra == 'dev'
- Requires-Dist: kfp ==1.8.22 ; extra == 'dev'
- Requires-Dist: mlflow-skinny ; extra == 'dev'
- Requires-Dist: moto ==4.1.6 ; extra == 'dev'
- Requires-Dist: protobuf ==3.20.3 ; extra == 'dev'
- Requires-Dist: pyre-extensions ; extra == 'dev'
- Requires-Dist: pyre-check ; extra == 'dev'
- Requires-Dist: pytest ; extra == 'dev'
- Requires-Dist: pytorch-lightning ==1.5.10 ; extra == 'dev'
- Requires-Dist: torch-model-archiver >=0.4.2 ; extra == 'dev'
- Requires-Dist: torch >=1.10.0 ; extra == 'dev'
- Requires-Dist: torchmetrics <0.11.0 ; extra == 'dev'
- Requires-Dist: torchserve >=0.4.2 ; extra == 'dev'
- Requires-Dist: torchtext >=0.11.0 ; extra == 'dev'
- Requires-Dist: torchvision >=0.11.1 ; extra == 'dev'
- Requires-Dist: ts ==0.5.1 ; extra == 'dev'
- Requires-Dist: usort ==1.0.2 ; extra == 'dev'
- Requires-Dist: ray[default] ; extra == 'dev'
- Provides-Extra: gcp_batch
- Requires-Dist: google-cloud-batch >=0.5.0 ; extra == 'gcp_batch'
- Requires-Dist: google-cloud-logging >=3.0.0 ; extra == 'gcp_batch'
- Requires-Dist: google-cloud-runtimeconfig >=0.33.2 ; extra == 'gcp_batch'
- Provides-Extra: kfp
- Requires-Dist: kfp ==1.6.2 ; extra == 'kfp'
- Provides-Extra: kubernetes
- Requires-Dist: kubernetes >=11 ; extra == 'kubernetes'
- Provides-Extra: ray
- Requires-Dist: ray >=1.12.1 ; extra == 'ray'
-
- [![PyPI](https://img.shields.io/pypi/v/torchx)](https://pypi.org/project/torchx/)
- [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](LICENSE)
- ![Tests](https://github.com/pytorch/torchx/actions/workflows/python-unittests.yaml/badge.svg)
- ![Lint](https://github.com/pytorch/torchx/actions/workflows/lint.yaml/badge.svg)
- [![codecov](https://codecov.io/gh/pytorch/torchx/branch/main/graph/badge.svg?token=ceHHIm0hXy)](https://codecov.io/gh/pytorch/torchx)
-
-
- # TorchX
-
-
- TorchX is a universal job launcher for PyTorch applications.
- TorchX is designed to have fast iteration time for training/research and support
- for E2E production ML pipelines when you're ready.
-
- TorchX currently supports:
-
- * Kubernetes (EKS, GKE, AKS, etc)
- * Slurm
- * AWS Batch
- * Docker
- * Local
- * Ray (prototype)
- * GCP Batch (prototype)
-
- Need a scheduler not listed? [Let us know!](https://github.com/pytorch/torchx/issues?q=is%3Aopen+is%3Aissue+label%3Ascheduler-request)
-
- ## Quickstart
-
- See the [quickstart guide](https://pytorch.org/torchx/latest/quickstart.html).
-
- ## Documentation
-
- * [Stable Documentation](https://pytorch.org/torchx/latest/)
- * [Nightly Documentation](https://pytorch.org/torchx/main/)
-
- ## Requirements
-
- torchx:
-
- * python3 (3.8+)
- * [PyTorch](https://pytorch.org/get-started/locally/)
- * optional: [Docker](https://docs.docker.com/get-docker/) (needed for docker based schedulers)
-
- Certain schedulers may require scheduler-specific requirements. See installation
- for info.
-
- ## Installation
-
- ### Stable
-
- ```bash
- # install torchx sdk and CLI -- minimum dependencies
- pip install torchx
-
- # install torchx sdk and CLI -- all dependencies
- pip install "torchx[dev]"
-
- # install torchx kubeflow pipelines (kfp) support
- pip install "torchx[kfp]"
-
- # install torchx Kubernetes / Volcano support
- pip install "torchx[kubernetes]"
-
- # install torchx Ray support
- pip install "torchx[ray]"
-
- # install torchx GCP Batch support
- pip install "torchx[gcp_batch]"
- ```
-
- ### Nightly
-
- ```bash
- # install torchx sdk and CLI
- pip install "torchx-nightly[dev]"
- ```
-
- ### Source
-
- ```bash
- # install torchx sdk and CLI from source
- $ pip install -e git+https://github.com/pytorch/torchx.git#egg=torchx
-
- # install extra dependencies
- $ pip install -e "git+https://github.com/pytorch/torchx.git#egg=torchx[dev]"
- ```
-
- ### Docker
-
- TorchX provides a docker container for use as part of a TorchX role.
-
- See: https://github.com/pytorch/torchx/pkgs/container/torchx
-
- ## Contributing
-
- We welcome PRs! See the [CONTRIBUTING](CONTRIBUTING.md) file.
-
- ## License
-
- TorchX is BSD licensed, as found in the [LICENSE](LICENSE) file.
-
-
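
Since the README above pitches TorchX as a programmable job launcher, here is a minimal sketch of submitting a single-role app through the runner API. The `AppDef`/`Role`/`Resource` names come from `torchx.specs` as imported by the scheduler code above; treat the exact `run()` signature, the `local_cwd` scheduler name, and the image path as assumptions that may differ between nightlies:

```python
from torchx.runner import get_runner
from torchx.specs import AppDef, Resource, Role

# A single-role app; for the local schedulers the "image" is a directory
# (the path below is hypothetical).
app = AppDef(
    name="hello-torchx",
    roles=[
        Role(
            name="worker",
            image="/tmp/hello",
            entrypoint="python",
            args=["-c", "print('hello from torchx')"],
            resource=Resource(cpu=1, gpu=0, memMB=512),
        )
    ],
)

runner = get_runner()
handle = runner.run(app, scheduler="local_cwd")  # any registered scheduler name
print(runner.wait(handle))  # blocks until the app reaches a terminal state
```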