torchx-nightly 2025.7.9__py3-none-any.whl → 2025.11.12__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (51)
  1. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  2. torchx/cli/cmd_list.py +1 -2
  3. torchx/cli/cmd_run.py +202 -28
  4. torchx/cli/cmd_tracker.py +1 -1
  5. torchx/components/__init__.py +1 -8
  6. torchx/components/dist.py +9 -3
  7. torchx/components/integration_tests/component_provider.py +2 -2
  8. torchx/components/utils.py +1 -1
  9. torchx/distributed/__init__.py +1 -1
  10. torchx/runner/api.py +92 -81
  11. torchx/runner/config.py +11 -9
  12. torchx/runner/events/__init__.py +20 -10
  13. torchx/runner/events/api.py +1 -1
  14. torchx/schedulers/__init__.py +7 -10
  15. torchx/schedulers/api.py +20 -15
  16. torchx/schedulers/aws_batch_scheduler.py +45 -2
  17. torchx/schedulers/docker_scheduler.py +3 -0
  18. torchx/schedulers/kubernetes_scheduler.py +200 -17
  19. torchx/schedulers/local_scheduler.py +1 -0
  20. torchx/schedulers/slurm_scheduler.py +160 -26
  21. torchx/specs/__init__.py +23 -6
  22. torchx/specs/api.py +279 -33
  23. torchx/specs/builders.py +109 -28
  24. torchx/specs/file_linter.py +117 -53
  25. torchx/specs/finder.py +25 -37
  26. torchx/specs/named_resources_aws.py +13 -2
  27. torchx/tracker/__init__.py +2 -2
  28. torchx/tracker/api.py +1 -1
  29. torchx/util/entrypoints.py +1 -6
  30. torchx/util/strings.py +1 -1
  31. torchx/util/types.py +12 -1
  32. torchx/version.py +2 -2
  33. torchx/workspace/api.py +102 -5
  34. {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/METADATA +34 -48
  35. {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/RECORD +39 -51
  36. {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/WHEEL +1 -1
  37. torchx/examples/pipelines/__init__.py +0 -0
  38. torchx/examples/pipelines/kfp/__init__.py +0 -0
  39. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
  40. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
  41. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
  42. torchx/pipelines/kfp/__init__.py +0 -30
  43. torchx/pipelines/kfp/adapter.py +0 -274
  44. torchx/pipelines/kfp/version.py +0 -19
  45. torchx/schedulers/gcp_batch_scheduler.py +0 -497
  46. torchx/schedulers/ray/ray_common.py +0 -22
  47. torchx/schedulers/ray/ray_driver.py +0 -307
  48. torchx/schedulers/ray_scheduler.py +0 -454
  49. {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/entry_points.txt +0 -0
  50. {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info/licenses}/LICENSE +0 -0
  51. {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/top_level.txt +0 -0
torchx/schedulers/ray_scheduler.py (deleted)
@@ -1,454 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import dataclasses
-import json
-import logging
-import os
-import re
-import tempfile
-import time
-from dataclasses import dataclass, field
-from datetime import datetime
-from shutil import copy2, rmtree
-from typing import (  # noqa
-    Any,
-    cast,
-    Dict,
-    Final,
-    Iterable,
-    List,
-    Optional,
-    Tuple,
-    TypedDict,
-)
-
-import urllib3
-
-from ray.autoscaler import sdk as ray_autoscaler_sdk
-from ray.dashboard.modules.job.common import JobStatus
-from ray.dashboard.modules.job.sdk import JobSubmissionClient
-
-from torchx.schedulers.api import (
-    AppState,
-    DescribeAppResponse,
-    filter_regex,
-    ListAppResponse,
-    Scheduler,
-    split_lines,
-    Stream,
-)
-from torchx.schedulers.ids import make_unique
-from torchx.schedulers.ray.ray_common import RayActor, TORCHX_RANK0_HOST
-from torchx.specs import (
-    AppDef,
-    AppDryRunInfo,
-    macros,
-    NONE,
-    ReplicaStatus,
-    Role,
-    RoleStatus,
-    runopts,
-)
-from torchx.workspace.dir_workspace import TmpDirWorkspaceMixin
-
-
-class RayOpts(TypedDict, total=False):
-    cluster_config_file: Optional[str]
-    cluster_name: Optional[str]
-    dashboard_address: Optional[str]
-    working_dir: Optional[str]
-    requirements: Optional[str]
-
-
-_logger: logging.Logger = logging.getLogger(__name__)
-
-_ray_status_to_torchx_appstate: Dict[JobStatus, AppState] = {
-    JobStatus.PENDING: AppState.PENDING,
-    JobStatus.RUNNING: AppState.RUNNING,
-    JobStatus.SUCCEEDED: AppState.SUCCEEDED,
-    JobStatus.FAILED: AppState.FAILED,
-    JobStatus.STOPPED: AppState.CANCELLED,
-}
-
-
-class _EnhancedJSONEncoder(json.JSONEncoder):
-    def default(self, o: RayActor):  # pyre-ignore[3]
-        if dataclasses.is_dataclass(o):
-            return dataclasses.asdict(o)
-        return super().default(o)
-
-
-def serialize(
-    actors: List[RayActor], dirpath: str, output_filename: str = "actors.json"
-) -> None:
-    actors_json = json.dumps(actors, cls=_EnhancedJSONEncoder)
-    with open(os.path.join(dirpath, output_filename), "w") as tmp:
-        json.dump(actors_json, tmp)
-
-
-@dataclass
-class RayJob:
-    """Represents a job that should be run on a Ray cluster.
-
-    Attributes:
-        app_id:
-            The unique ID of the application (a.k.a. job).
-        cluster_config_file:
-            The Ray cluster configuration file.
-        cluster_name:
-            The cluster name to use.
-        dashboard_address:
-            The existing dashboard IP address to connect to
-        working_dir:
-            The working directory to copy to the cluster
-        requirements:
-            The libraries to install on the cluster per requirements.txt
-        actors:
-            The Ray actors which represent the job to be run. This attribute is
-            dumped to a JSON file and copied to the cluster where `ray_main.py`
-            uses it to initiate the job.
-    """
-
-    app_id: str
-    working_dir: str
-    cluster_config_file: Optional[str] = None
-    cluster_name: Optional[str] = None
-    dashboard_address: Optional[str] = None
-    requirements: Optional[str] = None
-    actors: List[RayActor] = field(default_factory=list)
-
-
-class RayScheduler(
-    TmpDirWorkspaceMixin, Scheduler[RayOpts, AppDef, AppDryRunInfo[RayJob]]
-):
-    """
-    RayScheduler is a TorchX scheduling interface to Ray. The job def
-    workers will be launched as Ray actors
-
-    The job environment is specified by the TorchX workspace. Any files in
-    the workspace will be present in the Ray job unless specified in
-    ``.torchxignore``. Python dependencies will be read from the
-    ``requirements.txt`` file located at the root of the workspace unless
-    it's overridden via ``-c ...,requirements=foo/requirements.txt``.
-
-    **Config Options**
-
-    .. runopts::
-        class: torchx.schedulers.ray_scheduler.create_scheduler
-
-    **Compatibility**
-
-    .. compatibility::
-        type: scheduler
-        features:
-            cancel: true
-            logs: |
-                Partial support. Ray only supports a single log stream so
-                only a dummy "ray/0" combined log role is supported.
-                Tailing and time seeking are not supported.
-            distributed: true
-            describe: |
-                Partial support. RayScheduler will return job status but
-                does not provide the complete original AppSpec.
-            workspaces: true
-            mounts: false
-            elasticity: Partial support. Multi role jobs are not supported.

-    """
-
-    def __init__(
-        self, session_name: str, ray_client: Optional[JobSubmissionClient] = None
-    ) -> None:
-        # NOTE: make sure any new init options are supported in create_scheduler(...)
-        super().__init__("ray", session_name)
-
-        # w/o Final None check in _get_ray_client does not work as it pyre assumes mutability
-        self._ray_client: Final[Optional[JobSubmissionClient]] = ray_client
-
-    def _get_ray_client(
-        self, job_submission_netloc: Optional[str] = None
-    ) -> JobSubmissionClient:
-        if self._ray_client is not None:
-            client_netloc = urllib3.util.parse_url(
-                self._ray_client.get_address()
-            ).netloc
-            if job_submission_netloc and job_submission_netloc != client_netloc:
-                raise ValueError(
-                    f"client netloc ({client_netloc}) does not match job netloc ({job_submission_netloc})"
-                )
-            return self._ray_client
-        elif os.getenv("RAY_ADDRESS"):
-            return JobSubmissionClient(os.getenv("RAY_ADDRESS"))
-        elif not job_submission_netloc:
-            raise Exception(
-                "RAY_ADDRESS env variable or a scheduler with an attached Ray JobSubmissionClient is expected."
-                " See https://docs.ray.io/en/latest/cluster/jobs-package-ref.html#job-submission-sdk for more info"
-            )
-        return JobSubmissionClient(f"http://{job_submission_netloc}")
-
-    # TODO: Add address as a potential CLI argument after writing ray.status() or passing in config file
-    def _run_opts(self) -> runopts:
-        opts = runopts()
-        opts.add(
-            "cluster_config_file",
-            type_=str,
-            required=False,
-            help="Use CLUSTER_CONFIG_FILE to access or create the Ray cluster.",
-        )
-        opts.add(
-            "cluster_name",
-            type_=str,
-            help="Override the configured cluster name.",
-        )
-        opts.add(
-            "dashboard_address",
-            type_=str,
-            required=False,
-            default="127.0.0.1:8265",
-            help="Use ray status to get the dashboard address you will submit jobs against",
-        )
-        opts.add("requirements", type_=str, help="Path to requirements.txt")
-        return opts
-
-    def schedule(self, dryrun_info: AppDryRunInfo[RayJob]) -> str:
-        cfg: RayJob = dryrun_info.request
-
-        # Create serialized actors for ray_driver.py
-        actors = cfg.actors
-        dirpath = cfg.working_dir
-        serialize(actors, dirpath)
-
-        job_submission_addr: str = ""
-        if cfg.cluster_config_file:
-            job_submission_addr = ray_autoscaler_sdk.get_head_node_ip(
-                cfg.cluster_config_file
-            )  # pragma: no cover
-        elif cfg.dashboard_address:
-            job_submission_addr = cfg.dashboard_address
-        else:
-            raise RuntimeError(
-                "Either `dashboard_address` or `cluster_config_file` must be specified"
-            )
-
-        # 0. Create Job Client
-        client = self._get_ray_client(job_submission_netloc=job_submission_addr)
-
-        # 1. Copy Ray driver utilities
-        current_directory = os.path.dirname(os.path.abspath(__file__))
-        copy2(os.path.join(current_directory, "ray", "ray_driver.py"), dirpath)
-        copy2(os.path.join(current_directory, "ray", "ray_common.py"), dirpath)
-        runtime_env = {"working_dir": dirpath}
-        if cfg.requirements:
-            runtime_env["pip"] = cfg.requirements
-
-        # 1. Submit Job via the Ray Job Submission API
-        try:
-            job_id: str = client.submit_job(
-                submission_id=cfg.app_id,
-                # we will pack, hash, zip, upload, register working_dir in GCS of ray cluster
-                # and use it to configure your job execution.
-                entrypoint="python3 ray_driver.py",
-                runtime_env=runtime_env,
-            )
-
-        finally:
-            if dirpath.startswith(tempfile.gettempdir()):
-                rmtree(dirpath)
-
-        # Encode job submission client in job_id
-        return f"{job_submission_addr}-{job_id}"
-
-    def _submit_dryrun(self, app: AppDef, cfg: RayOpts) -> AppDryRunInfo[RayJob]:
-        app_id = make_unique(app.name)
-
-        working_dir = app.roles[0].image
-        if not os.path.exists(working_dir):
-            raise RuntimeError(
-                f"Role image must be a valid directory, got: {working_dir} "
-            )
-
-        requirements: Optional[str] = cfg.get("requirements")
-        if requirements is None:
-            workspace_reqs = os.path.join(working_dir, "requirements.txt")
-            if os.path.exists(workspace_reqs):
-                requirements = workspace_reqs
-
-        cluster_cfg = cfg.get("cluster_config_file")
-        if cluster_cfg:
-            if not isinstance(cluster_cfg, str) or not os.path.isfile(cluster_cfg):
-                raise ValueError("The cluster configuration file must be a YAML file.")
-
-            job: RayJob = RayJob(
-                app_id,
-                cluster_config_file=cluster_cfg,
-                requirements=requirements,
-                working_dir=working_dir,
-            )
-
-        else:  # pragma: no cover
-            dashboard_address = cfg.get("dashboard_address")
-            job: RayJob = RayJob(
-                app_id=app_id,
-                dashboard_address=dashboard_address,
-                requirements=requirements,
-                working_dir=working_dir,
-            )
-        job.cluster_name = cfg.get("cluster_name")
-
-        for role in app.roles:
-            for replica_id in range(role.num_replicas):
-                # Replace the ${img_root}, ${app_id}, and ${replica_id} placeholders
-                # in arguments and environment variables.
-                replica_role = macros.Values(
-                    img_root=role.image,
-                    app_id=app_id,
-                    replica_id=str(replica_id),
-                    rank0_env=TORCHX_RANK0_HOST,
-                ).apply(role)
-
-                actor = RayActor(
-                    name=role.name,
-                    min_replicas=role.min_replicas,
-                    command=[replica_role.entrypoint] + replica_role.args,
-                    env=replica_role.env,
-                    num_cpus=max(1, replica_role.resource.cpu),
-                    num_gpus=max(0, replica_role.resource.gpu),
-                )
-
-                job.actors.append(actor)
-
-        if len(app.roles) > 1 and app.roles[0].min_replicas is not None:
-            raise ValueError("min_replicas is only supported with single role jobs")
-
-        return AppDryRunInfo(job, repr)
-
-    def _validate(self, app: AppDef, scheduler: str, cfg: RayOpts) -> None:
-        if scheduler != "ray":
-            raise ValueError(
-                f"An unknown scheduler backend '{scheduler}' has been passed to the Ray scheduler."
-            )
-
-        if app.metadata:
-            _logger.warning("The Ray scheduler does not use metadata information.")
-
-        for role in app.roles:
-            if role.resource.capabilities:
-                _logger.warning(
-                    "The Ray scheduler does not support custom resource capabilities."
-                )
-                break
-
-        for role in app.roles:
-            if role.port_map:
-                _logger.warning("The Ray scheduler does not support port mapping.")
-                break
-
-    def wait_until_finish(self, app_id: str, timeout: int = 30) -> None:
-        """
-        ``wait_until_finish`` waits until the specified job has finished
-        with a given timeout. This is intended for testing. Programmatic
-        usage should use the runner wait method instead.
-        """
-
-        start = time.time()
-        while time.time() - start <= timeout:
-            status_info = self._get_job_status(app_id)
-            status = status_info
-            if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
-                break
-            time.sleep(1)
-
-    def _parse_app_id(self, app_id: str) -> Tuple[str, str]:
-        # find index of '-' in the first :\d+-
-        m = re.search(r":\d+-", app_id)
-        if m:
-            sep = m.span()[1]
-            addr = app_id[: sep - 1]
-            app_id = app_id[sep:]
-            return addr, app_id
-
-        addr, _, app_id = app_id.partition("-")
-        return addr, app_id
-
-    def _cancel_existing(self, app_id: str) -> None:  # pragma: no cover
-        addr, app_id = self._parse_app_id(app_id)
-        client = self._get_ray_client(job_submission_netloc=addr)
-        client.stop_job(app_id)
-
-    def _get_job_status(self, app_id: str) -> JobStatus:
-        addr, app_id = self._parse_app_id(app_id)
-        client = self._get_ray_client(job_submission_netloc=addr)
-        status = client.get_job_status(app_id)
-        if isinstance(status, str):
-            return cast(JobStatus, status)
-        return status.status
-
-    def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
-        job_status_info = self._get_job_status(app_id)
-        state = _ray_status_to_torchx_appstate[job_status_info]
-        roles = [Role(name="ray", num_replicas=1, image="<N/A>")]
-
-        # get ip_address and put it in hostname
-
-        roles_statuses = [
-            RoleStatus(
-                role="ray",
-                replicas=[
-                    ReplicaStatus(
-                        id=0,
-                        role="ray",
-                        hostname=NONE,
-                        state=state,
-                    )
-                ],
-            )
-        ]
-        return DescribeAppResponse(
-            app_id=app_id,
-            state=state,
-            msg=job_status_info,
-            roles_statuses=roles_statuses,
-            roles=roles,
-        )
-
-    def log_iter(
-        self,
-        app_id: str,
-        role_name: Optional[str] = None,
-        k: int = 0,
-        regex: Optional[str] = None,
-        since: Optional[datetime] = None,
-        until: Optional[datetime] = None,
-        should_tail: bool = False,
-        streams: Optional[Stream] = None,
-    ) -> Iterable[str]:
-        # TODO: support tailing, streams etc..
-        addr, app_id = self._parse_app_id(app_id)
-        client: JobSubmissionClient = self._get_ray_client(job_submission_netloc=addr)
-        logs: str = client.get_job_logs(app_id)
-        iterator = split_lines(logs)
-        if regex:
-            return filter_regex(regex, iterator)
-        return iterator
-
-    def list(self) -> List[ListAppResponse]:
-        client = self._get_ray_client()
-        jobs = client.list_jobs()
-        netloc = urllib3.util.parse_url(client.get_address()).netloc
-        return [
-            ListAppResponse(
-                app_id=f"{netloc}-{details.submission_id}",
-                state=_ray_status_to_torchx_appstate[details.status],
-            )
-            for details in jobs
-        ]
-
-
-def create_scheduler(
-    session_name: str, ray_client: Optional[JobSubmissionClient] = None, **kwargs: Any
-) -> "RayScheduler":
-    return RayScheduler(session_name=session_name, ray_client=ray_client)
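
Note on the removed ray_scheduler.py above: schedule() encoded the Ray dashboard address into the app handle it returned (f"{job_submission_addr}-{job_id}") and _parse_app_id recovered both parts later. The standalone sketch below mirrors that round trip for illustration only; parse_ray_app_id and the sample id are hypothetical names, not part of the torchx API.

    import re
    from typing import Tuple

    def parse_ray_app_id(app_id: str) -> Tuple[str, str]:
        # Mirrors the removed RayScheduler._parse_app_id: the first "-" that
        # follows a ":<port>" sequence separates the dashboard netloc from the
        # Ray submission id; if no port is present, split on the first "-".
        m = re.search(r":\d+-", app_id)
        if m:
            sep = m.span()[1]
            return app_id[: sep - 1], app_id[sep:]
        addr, _, job_id = app_id.partition("-")
        return addr, job_id

    # schedule() returned ids of the form f"{dashboard_address}-{job_id}"
    assert parse_ray_app_id("127.0.0.1:8265-raysubmit_abc123") == (
        "127.0.0.1:8265",
        "raysubmit_abc123",
    )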