torchx-nightly 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- torchx/__init__.py +2 -0
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/apps/serve/serve.py +2 -0
- torchx/apps/utils/booth_main.py +2 -0
- torchx/apps/utils/copy_main.py +2 -0
- torchx/apps/utils/process_monitor.py +2 -0
- torchx/cli/__init__.py +2 -0
- torchx/cli/argparse_util.py +38 -3
- torchx/cli/cmd_base.py +2 -0
- torchx/cli/cmd_cancel.py +2 -0
- torchx/cli/cmd_configure.py +2 -0
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_describe.py +2 -0
- torchx/cli/cmd_list.py +8 -4
- torchx/cli/cmd_log.py +6 -24
- torchx/cli/cmd_run.py +269 -45
- torchx/cli/cmd_runopts.py +2 -0
- torchx/cli/cmd_status.py +12 -1
- torchx/cli/cmd_tracker.py +3 -1
- torchx/cli/colors.py +2 -0
- torchx/cli/main.py +4 -0
- torchx/components/__init__.py +3 -8
- torchx/components/component_test_base.py +2 -0
- torchx/components/dist.py +18 -7
- torchx/components/integration_tests/component_provider.py +4 -2
- torchx/components/integration_tests/integ_tests.py +2 -0
- torchx/components/serve.py +2 -0
- torchx/components/structured_arg.py +4 -3
- torchx/components/utils.py +15 -4
- torchx/distributed/__init__.py +2 -4
- torchx/examples/apps/datapreproc/datapreproc.py +2 -0
- torchx/examples/apps/lightning/data.py +5 -3
- torchx/examples/apps/lightning/model.py +7 -6
- torchx/examples/apps/lightning/profiler.py +7 -4
- torchx/examples/apps/lightning/train.py +11 -2
- torchx/examples/torchx_out_of_sync_training.py +11 -0
- torchx/notebook.py +2 -0
- torchx/runner/__init__.py +2 -0
- torchx/runner/api.py +167 -60
- torchx/runner/config.py +43 -10
- torchx/runner/events/__init__.py +57 -13
- torchx/runner/events/api.py +14 -3
- torchx/runner/events/handlers.py +2 -0
- torchx/runtime/tracking/__init__.py +2 -0
- torchx/runtime/tracking/api.py +2 -0
- torchx/schedulers/__init__.py +16 -15
- torchx/schedulers/api.py +70 -14
- torchx/schedulers/aws_batch_scheduler.py +75 -6
- torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
- torchx/schedulers/devices.py +17 -4
- torchx/schedulers/docker_scheduler.py +43 -11
- torchx/schedulers/ids.py +29 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +9 -7
- torchx/schedulers/kubernetes_scheduler.py +383 -38
- torchx/schedulers/local_scheduler.py +100 -27
- torchx/schedulers/lsf_scheduler.py +5 -4
- torchx/schedulers/slurm_scheduler.py +336 -20
- torchx/schedulers/streams.py +2 -0
- torchx/specs/__init__.py +89 -12
- torchx/specs/api.py +418 -30
- torchx/specs/builders.py +176 -38
- torchx/specs/file_linter.py +143 -57
- torchx/specs/finder.py +68 -28
- torchx/specs/named_resources_aws.py +181 -4
- torchx/specs/named_resources_generic.py +2 -0
- torchx/specs/overlays.py +106 -0
- torchx/specs/test/components/__init__.py +2 -0
- torchx/specs/test/components/a/__init__.py +2 -0
- torchx/specs/test/components/a/b/__init__.py +2 -0
- torchx/specs/test/components/a/b/c.py +2 -0
- torchx/specs/test/components/c/__init__.py +2 -0
- torchx/specs/test/components/c/d.py +2 -0
- torchx/tracker/__init__.py +12 -6
- torchx/tracker/api.py +15 -18
- torchx/tracker/backend/fsspec.py +2 -0
- torchx/util/cuda.py +2 -0
- torchx/util/datetime.py +2 -0
- torchx/util/entrypoints.py +39 -15
- torchx/util/io.py +2 -0
- torchx/util/log_tee_helpers.py +210 -0
- torchx/util/modules.py +65 -0
- torchx/util/session.py +42 -0
- torchx/util/shlex.py +2 -0
- torchx/util/strings.py +3 -1
- torchx/util/types.py +90 -29
- torchx/version.py +4 -2
- torchx/workspace/__init__.py +2 -0
- torchx/workspace/api.py +136 -6
- torchx/workspace/dir_workspace.py +2 -0
- torchx/workspace/docker_workspace.py +30 -2
- torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
- torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
- torchx/pipelines/kfp/__init__.py +0 -28
- torchx/pipelines/kfp/adapter.py +0 -271
- torchx/pipelines/kfp/version.py +0 -17
- torchx/schedulers/gcp_batch_scheduler.py +0 -487
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -453
- torchx_nightly-2024.1.6.dist-info/METADATA +0 -176
- torchx_nightly-2024.1.6.dist-info/RECORD +0 -118
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
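
The biggest structural change in the list above is the scheduler lineup: the `ray` and `gcp_batch` backends and the KFP pipeline adapters are removed, and a new `aws_sagemaker` backend is added. A minimal sketch to confirm which backends a given install actually registers (assuming a standard install; `get_scheduler_factories` is the registry accessor in `torchx.schedulers`):

```python
# Sketch: enumerate the scheduler backends the installed torchx registers.
# Assumes torchx-nightly is installed; exact names depend on the version --
# per this diff, 2025.12.24 should no longer list "ray" or "gcp_batch"
# but should list "aws_sagemaker".
from torchx.schedulers import get_scheduler_factories

for name in sorted(get_scheduler_factories()):
    print(name)
```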

--- torchx/schedulers/ray_scheduler.py
+++ /dev/null
@@ -1,453 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import dataclasses
-import json
-import logging
-import os
-import re
-import tempfile
-import time
-from dataclasses import dataclass, field
-from datetime import datetime
-from shutil import copy2, rmtree
-from typing import Any, cast, Dict, Final, Iterable, List, Optional, Tuple  # noqa
-
-import urllib3
-
-from torchx.schedulers.api import (
-    AppDryRunInfo,
-    AppState,
-    DescribeAppResponse,
-    filter_regex,
-    ListAppResponse,
-    Scheduler,
-    split_lines,
-    Stream,
-)
-from torchx.schedulers.ids import make_unique
-from torchx.schedulers.ray.ray_common import RayActor, TORCHX_RANK0_HOST
-from torchx.specs import AppDef, macros, NONE, ReplicaStatus, Role, RoleStatus, runopts
-from torchx.workspace.dir_workspace import TmpDirWorkspaceMixin
-from typing_extensions import TypedDict
-
-
-try:
-    from ray.autoscaler import sdk as ray_autoscaler_sdk
-    from ray.dashboard.modules.job.common import JobStatus
-    from ray.dashboard.modules.job.sdk import JobSubmissionClient
-
-    _has_ray = True
-
-except ImportError:
-    _has_ray = False
-
-
-def has_ray() -> bool:
-    """Indicates whether Ray is installed in the current Python environment."""
-    return _has_ray
-
-
-class RayOpts(TypedDict, total=False):
-    cluster_config_file: Optional[str]
-    cluster_name: Optional[str]
-    dashboard_address: Optional[str]
-    working_dir: Optional[str]
-    requirements: Optional[str]
-
-
-if _has_ray:
-    _logger: logging.Logger = logging.getLogger(__name__)
-
-    _ray_status_to_torchx_appstate: Dict[JobStatus, AppState] = {
-        JobStatus.PENDING: AppState.PENDING,
-        JobStatus.RUNNING: AppState.RUNNING,
-        JobStatus.SUCCEEDED: AppState.SUCCEEDED,
-        JobStatus.FAILED: AppState.FAILED,
-        JobStatus.STOPPED: AppState.CANCELLED,
-    }
-
-    class _EnhancedJSONEncoder(json.JSONEncoder):
-        def default(self, o: RayActor):  # pyre-ignore[3]
-            if dataclasses.is_dataclass(o):
-                return dataclasses.asdict(o)
-            return super().default(o)
-
-    def serialize(
-        actors: List[RayActor], dirpath: str, output_filename: str = "actors.json"
-    ) -> None:
-        actors_json = json.dumps(actors, cls=_EnhancedJSONEncoder)
-        with open(os.path.join(dirpath, output_filename), "w") as tmp:
-            json.dump(actors_json, tmp)
-
-    @dataclass
-    class RayJob:
-        """Represents a job that should be run on a Ray cluster.
-
-        Attributes:
-            app_id:
-                The unique ID of the application (a.k.a. job).
-            cluster_config_file:
-                The Ray cluster configuration file.
-            cluster_name:
-                The cluster name to use.
-            dashboard_address:
-                The existing dashboard IP address to connect to
-            working_dir:
-                The working directory to copy to the cluster
-            requirements:
-                The libraries to install on the cluster per requirements.txt
-            actors:
-                The Ray actors which represent the job to be run. This attribute is
-                dumped to a JSON file and copied to the cluster where `ray_main.py`
-                uses it to initiate the job.
-        """
-
-        app_id: str
-        working_dir: str
-        cluster_config_file: Optional[str] = None
-        cluster_name: Optional[str] = None
-        dashboard_address: Optional[str] = None
-        requirements: Optional[str] = None
-        actors: List[RayActor] = field(default_factory=list)
-
-    class RayScheduler(TmpDirWorkspaceMixin, Scheduler[RayOpts]):
-        """
-        RayScheduler is a TorchX scheduling interface to Ray. The job def
-        workers will be launched as Ray actors
-
-        The job environment is specified by the TorchX workspace. Any files in
-        the workspace will be present in the Ray job unless specified in
-        ``.torchxignore``. Python dependencies will be read from the
-        ``requirements.txt`` file located at the root of the workspace unless
-        it's overridden via ``-c ...,requirements=foo/requirements.txt``.
-
-        **Config Options**
-
-        .. runopts::
-            class: torchx.schedulers.ray_scheduler.create_scheduler
-
-        **Compatibility**
-
-        .. compatibility::
-            type: scheduler
-            features:
-                cancel: true
-                logs: |
-                    Partial support. Ray only supports a single log stream so
-                    only a dummy "ray/0" combined log role is supported.
-                    Tailing and time seeking are not supported.
-                distributed: true
-                describe: |
-                    Partial support. RayScheduler will return job status but
-                    does not provide the complete original AppSpec.
-                workspaces: true
-                mounts: false
-                elasticity: Partial support. Multi role jobs are not supported.
-
-        """
-
-        def __init__(
-            self, session_name: str, ray_client: Optional[JobSubmissionClient] = None
-        ) -> None:
-            # NOTE: make sure any new init options are supported in create_scheduler(...)
-            super().__init__("ray", session_name)
-
-            # w/o Final None check in _get_ray_client does not work as it pyre assumes mutability
-            self._ray_client: Final[Optional[JobSubmissionClient]] = ray_client
-
-        def _get_ray_client(
-            self, job_submission_netloc: Optional[str] = None
-        ) -> JobSubmissionClient:
-            if self._ray_client is not None:
-                client_netloc = urllib3.util.parse_url(
-                    self._ray_client.get_address()
-                ).netloc
-                if job_submission_netloc and job_submission_netloc != client_netloc:
-                    raise ValueError(
-                        f"client netloc ({client_netloc}) does not match job netloc ({job_submission_netloc})"
-                    )
-                return self._ray_client
-            elif os.getenv("RAY_ADDRESS"):
-                return JobSubmissionClient(os.getenv("RAY_ADDRESS"))
-            elif not job_submission_netloc:
-                raise Exception(
-                    "RAY_ADDRESS env variable or a scheduler with an attached Ray JobSubmissionClient is expected."
-                    " See https://docs.ray.io/en/latest/cluster/jobs-package-ref.html#job-submission-sdk for more info"
-                )
-            return JobSubmissionClient(f"http://{job_submission_netloc}")
-
-        # TODO: Add address as a potential CLI argument after writing ray.status() or passing in config file
-        def _run_opts(self) -> runopts:
-            opts = runopts()
-            opts.add(
-                "cluster_config_file",
-                type_=str,
-                required=False,
-                help="Use CLUSTER_CONFIG_FILE to access or create the Ray cluster.",
-            )
-            opts.add(
-                "cluster_name",
-                type_=str,
-                help="Override the configured cluster name.",
-            )
-            opts.add(
-                "dashboard_address",
-                type_=str,
-                required=False,
-                default="127.0.0.1:8265",
-                help="Use ray status to get the dashboard address you will submit jobs against",
-            )
-            opts.add("requirements", type_=str, help="Path to requirements.txt")
-            return opts
-
-        def schedule(self, dryrun_info: AppDryRunInfo[RayJob]) -> str:
-            cfg: RayJob = dryrun_info.request
-
-            # Create serialized actors for ray_driver.py
-            actors = cfg.actors
-            dirpath = cfg.working_dir
-            serialize(actors, dirpath)
-
-            job_submission_addr: str = ""
-            if cfg.cluster_config_file:
-                job_submission_addr = ray_autoscaler_sdk.get_head_node_ip(
-                    cfg.cluster_config_file
-                )  # pragma: no cover
-            elif cfg.dashboard_address:
-                job_submission_addr = cfg.dashboard_address
-            else:
-                raise RuntimeError(
-                    "Either `dashboard_address` or `cluster_config_file` must be specified"
-                )
-
-            # 0. Create Job Client
-            client = self._get_ray_client(job_submission_netloc=job_submission_addr)
-
-            # 1. Copy Ray driver utilities
-            current_directory = os.path.dirname(os.path.abspath(__file__))
-            copy2(os.path.join(current_directory, "ray", "ray_driver.py"), dirpath)
-            copy2(os.path.join(current_directory, "ray", "ray_common.py"), dirpath)
-            runtime_env = {"working_dir": dirpath}
-            if cfg.requirements:
-                runtime_env["pip"] = cfg.requirements
-
-            # 1. Submit Job via the Ray Job Submission API
-            try:
-                job_id: str = client.submit_job(
-                    submission_id=cfg.app_id,
-                    # we will pack, hash, zip, upload, register working_dir in GCS of ray cluster
-                    # and use it to configure your job execution.
-                    entrypoint="python3 ray_driver.py",
-                    runtime_env=runtime_env,
-                )
-
-            finally:
-                if dirpath.startswith(tempfile.gettempdir()):
-                    rmtree(dirpath)
-
-            # Encode job submission client in job_id
-            return f"{job_submission_addr}-{job_id}"
-
-        def _submit_dryrun(self, app: AppDef, cfg: RayOpts) -> AppDryRunInfo[RayJob]:
-            app_id = make_unique(app.name)
-
-            working_dir = app.roles[0].image
-            if not os.path.exists(working_dir):
-                raise RuntimeError(
-                    f"Role image must be a valid directory, got: {working_dir} "
-                )
-
-            requirements: Optional[str] = cfg.get("requirements")
-            if requirements is None:
-                workspace_reqs = os.path.join(working_dir, "requirements.txt")
-                if os.path.exists(workspace_reqs):
-                    requirements = workspace_reqs
-
-            cluster_cfg = cfg.get("cluster_config_file")
-            if cluster_cfg:
-                if not isinstance(cluster_cfg, str) or not os.path.isfile(cluster_cfg):
-                    raise ValueError(
-                        "The cluster configuration file must be a YAML file."
-                    )
-
-                job: RayJob = RayJob(
-                    app_id,
-                    cluster_config_file=cluster_cfg,
-                    requirements=requirements,
-                    working_dir=working_dir,
-                )
-
-            else:  # pragma: no cover
-                dashboard_address = cfg.get("dashboard_address")
-                job: RayJob = RayJob(
-                    app_id=app_id,
-                    dashboard_address=dashboard_address,
-                    requirements=requirements,
-                    working_dir=working_dir,
-                )
-            job.cluster_name = cfg.get("cluster_name")
-
-            for role in app.roles:
-                for replica_id in range(role.num_replicas):
-                    # Replace the ${img_root}, ${app_id}, and ${replica_id} placeholders
-                    # in arguments and environment variables.
-                    replica_role = macros.Values(
-                        img_root=role.image,
-                        app_id=app_id,
-                        replica_id=str(replica_id),
-                        rank0_env=TORCHX_RANK0_HOST,
-                    ).apply(role)
-
-                    actor = RayActor(
-                        name=role.name,
-                        min_replicas=role.min_replicas,
-                        command=[replica_role.entrypoint] + replica_role.args,
-                        env=replica_role.env,
-                        num_cpus=max(1, replica_role.resource.cpu),
-                        num_gpus=max(0, replica_role.resource.gpu),
-                    )
-
-                    job.actors.append(actor)
-
-            if len(app.roles) > 1 and app.roles[0].min_replicas is not None:
-                raise ValueError("min_replicas is only supported with single role jobs")
-
-            return AppDryRunInfo(job, repr)
-
-        def _validate(self, app: AppDef, scheduler: str) -> None:
-            if scheduler != "ray":
-                raise ValueError(
-                    f"An unknown scheduler backend '{scheduler}' has been passed to the Ray scheduler."
-                )
-
-            if app.metadata:
-                _logger.warning("The Ray scheduler does not use metadata information.")
-
-            for role in app.roles:
-                if role.resource.capabilities:
-                    _logger.warning(
-                        "The Ray scheduler does not support custom resource capabilities."
-                    )
-                    break
-
-            for role in app.roles:
-                if role.port_map:
-                    _logger.warning("The Ray scheduler does not support port mapping.")
-                    break
-
-        def wait_until_finish(self, app_id: str, timeout: int = 30) -> None:
-            """
-            ``wait_until_finish`` waits until the specified job has finished
-            with a given timeout. This is intended for testing. Programmatic
-            usage should use the runner wait method instead.
-            """
-
-            start = time.time()
-            while time.time() - start <= timeout:
-                status_info = self._get_job_status(app_id)
-                status = status_info
-                if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
-                    break
-                time.sleep(1)
-
-        def _parse_app_id(self, app_id: str) -> Tuple[str, str]:
-            # find index of '-' in the first :\d+-
-            m = re.search(r":\d+-", app_id)
-            if m:
-                sep = m.span()[1]
-                addr = app_id[: sep - 1]
-                app_id = app_id[sep:]
-                return addr, app_id
-
-            addr, _, app_id = app_id.partition("-")
-            return addr, app_id
-
-        def _cancel_existing(self, app_id: str) -> None:  # pragma: no cover
-            addr, app_id = self._parse_app_id(app_id)
-            client = self._get_ray_client(job_submission_netloc=addr)
-            client.stop_job(app_id)
-
-        def _get_job_status(self, app_id: str) -> JobStatus:
-            addr, app_id = self._parse_app_id(app_id)
-            client = self._get_ray_client(job_submission_netloc=addr)
-            status = client.get_job_status(app_id)
-            if isinstance(status, str):
-                return cast(JobStatus, status)
-            return status.status
-
-        def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
-            job_status_info = self._get_job_status(app_id)
-            state = _ray_status_to_torchx_appstate[job_status_info]
-            roles = [Role(name="ray", num_replicas=1, image="<N/A>")]
-
-            # get ip_address and put it in hostname
-
-            roles_statuses = [
-                RoleStatus(
-                    role="ray",
-                    replicas=[
-                        ReplicaStatus(
-                            id=0,
-                            role="ray",
-                            hostname=NONE,
-                            state=state,
-                        )
-                    ],
-                )
-            ]
-            return DescribeAppResponse(
-                app_id=app_id,
-                state=state,
-                msg=job_status_info,
-                roles_statuses=roles_statuses,
-                roles=roles,
-            )
-
-        def log_iter(
-            self,
-            app_id: str,
-            role_name: Optional[str] = None,
-            k: int = 0,
-            regex: Optional[str] = None,
-            since: Optional[datetime] = None,
-            until: Optional[datetime] = None,
-            should_tail: bool = False,
-            streams: Optional[Stream] = None,
-        ) -> Iterable[str]:
-            # TODO: support tailing, streams etc..
-            addr, app_id = self._parse_app_id(app_id)
-            client: JobSubmissionClient = self._get_ray_client(
-                job_submission_netloc=addr
-            )
-            logs: str = client.get_job_logs(app_id)
-            iterator = split_lines(logs)
-            if regex:
-                return filter_regex(regex, iterator)
-            return iterator
-
-        def list(self) -> List[ListAppResponse]:
-            client = self._get_ray_client()
-            jobs = client.list_jobs()
-            netloc = urllib3.util.parse_url(client.get_address()).netloc
-            return [
-                ListAppResponse(
-                    app_id=f"{netloc}-{details.submission_id}",
-                    state=_ray_status_to_torchx_appstate[details.status],
-                )
-                for details in jobs
-            ]
-
-
-    def create_scheduler(
-        session_name: str, ray_client: Optional[JobSubmissionClient] = None, **kwargs: Any
-    ) -> "RayScheduler":
-        if not has_ray():  # pragma: no cover
-            raise ModuleNotFoundError(
-                "Ray is not installed in the current Python environment."
-            )
-
-        return RayScheduler(session_name=session_name, ray_client=ray_client)
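
One detail worth noting in the removed code: `schedule()` returns `f"{job_submission_addr}-{job_id}"`, so the Ray dashboard netloc is encoded into the TorchX app ID, and `_parse_app_id` recovers it by finding the first `:<port>-` separator. A standalone sketch of that round-trip, with hypothetical example values:

```python
import re
from typing import Tuple


def parse_ray_app_id(app_id: str) -> Tuple[str, str]:
    """Split "<netloc>-<job_id>" the way the removed _parse_app_id did."""
    m = re.search(r":\d+-", app_id)  # the first ":<port>-" marks the end of the netloc
    if m:
        sep = m.span()[1]
        return app_id[: sep - 1], app_id[sep:]
    # no ":<port>-" present: fall back to splitting on the first "-"
    addr, _, job_id = app_id.partition("-")
    return addr, job_id


# Hypothetical app ID in the encoded "<netloc>-<job_id>" form:
assert parse_ray_app_id("127.0.0.1:8265-trainer-abc123") == (
    "127.0.0.1:8265",
    "trainer-abc123",
)
```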
--- torchx_nightly-2024.1.6.dist-info/METADATA
+++ /dev/null
@@ -1,176 +0,0 @@
-Metadata-Version: 2.1
-Name: torchx-nightly
-Version: 2024.1.6
-Summary: TorchX SDK and Components
-Home-page: https://github.com/pytorch/torchx
-Author: TorchX Devs
-Author-email: torchx@fb.com
-License: BSD-3
-Keywords: pytorch,machine learning
-Platform: UNKNOWN
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: BSD License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Requires-Python: >=3.7
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: pyre-extensions
-Requires-Dist: docstring-parser >=0.8.1
-Requires-Dist: importlib-metadata
-Requires-Dist: pyyaml
-Requires-Dist: docker
-Requires-Dist: filelock
-Requires-Dist: fsspec ==2023.10.0
-Requires-Dist: urllib3 <1.27,>=1.21.1
-Requires-Dist: tabulate
-Provides-Extra: aws_batch
-Requires-Dist: boto3 ; extra == 'aws_batch'
-Provides-Extra: dev
-Requires-Dist: aiobotocore ; extra == 'dev'
-Requires-Dist: ax-platform[mysql] ==0.2.3 ; extra == 'dev'
-Requires-Dist: black ==23.3.0 ; extra == 'dev'
-Requires-Dist: boto3 ; extra == 'dev'
-Requires-Dist: captum >=0.4.0 ; extra == 'dev'
-Requires-Dist: docker ; extra == 'dev'
-Requires-Dist: flake8 ==3.9.0 ; extra == 'dev'
-Requires-Dist: fsspec[s3] ==2023.10.0 ; extra == 'dev'
-Requires-Dist: google-api-core ; extra == 'dev'
-Requires-Dist: google-cloud-batch >=0.5.0 ; extra == 'dev'
-Requires-Dist: google-cloud-logging >=3.0.0 ; extra == 'dev'
-Requires-Dist: google-cloud-runtimeconfig >=0.33.2 ; extra == 'dev'
-Requires-Dist: hydra-core ; extra == 'dev'
-Requires-Dist: ipython ; extra == 'dev'
-Requires-Dist: kfp ==1.8.22 ; extra == 'dev'
-Requires-Dist: mlflow-skinny ; extra == 'dev'
-Requires-Dist: moto ==4.1.6 ; extra == 'dev'
-Requires-Dist: protobuf ==3.20.3 ; extra == 'dev'
-Requires-Dist: pyre-extensions ; extra == 'dev'
-Requires-Dist: pyre-check ; extra == 'dev'
-Requires-Dist: pytest ; extra == 'dev'
-Requires-Dist: pytorch-lightning ==1.5.10 ; extra == 'dev'
-Requires-Dist: torch-model-archiver >=0.4.2 ; extra == 'dev'
-Requires-Dist: torch >=1.10.0 ; extra == 'dev'
-Requires-Dist: torchmetrics <0.11.0 ; extra == 'dev'
-Requires-Dist: torchserve >=0.4.2 ; extra == 'dev'
-Requires-Dist: torchtext >=0.11.0 ; extra == 'dev'
-Requires-Dist: torchvision >=0.11.1 ; extra == 'dev'
-Requires-Dist: ts ==0.5.1 ; extra == 'dev'
-Requires-Dist: usort ==1.0.2 ; extra == 'dev'
-Requires-Dist: ray[default] ; extra == 'dev'
-Provides-Extra: gcp_batch
-Requires-Dist: google-cloud-batch >=0.5.0 ; extra == 'gcp_batch'
-Requires-Dist: google-cloud-logging >=3.0.0 ; extra == 'gcp_batch'
-Requires-Dist: google-cloud-runtimeconfig >=0.33.2 ; extra == 'gcp_batch'
-Provides-Extra: kfp
-Requires-Dist: kfp ==1.6.2 ; extra == 'kfp'
-Provides-Extra: kubernetes
-Requires-Dist: kubernetes >=11 ; extra == 'kubernetes'
-Provides-Extra: ray
-Requires-Dist: ray >=1.12.1 ; extra == 'ray'
-
-[](https://pypi.org/project/torchx/)
-[](LICENSE)
-
-
-[](https://codecov.io/gh/pytorch/torchx)
-
-
-# TorchX
-
-
-TorchX is a universal job launcher for PyTorch applications.
-TorchX is designed to have fast iteration time for training/research and support
-for E2E production ML pipelines when you're ready.
-
-TorchX currently supports:
-
-* Kubernetes (EKS, GKE, AKS, etc)
-* Slurm
-* AWS Batch
-* Docker
-* Local
-* Ray (prototype)
-* GCP Batch (prototype)
-
-Need a scheduler not listed? [Let us know!](https://github.com/pytorch/torchx/issues?q=is%3Aopen+is%3Aissue+label%3Ascheduler-request)
-
-## Quickstart
-
-See the [quickstart guide](https://pytorch.org/torchx/latest/quickstart.html).
-
-## Documentation
-
-* [Stable Documentation](https://pytorch.org/torchx/latest/)
-* [Nightly Documentation](https://pytorch.org/torchx/main/)
-
-## Requirements
-
-torchx:
-
-* python3 (3.8+)
-* [PyTorch](https://pytorch.org/get-started/locally/)
-* optional: [Docker](https://docs.docker.com/get-docker/) (needed for docker based schedulers)
-
-Certain schedulers may require scheduler specific requirements. See installation
-for info.
-
-## Installation
-
-### Stable
-
-```bash
-# install torchx sdk and CLI -- minimum dependencies
-pip install torchx
-
-# install torchx sdk and CLI -- all dependencies
-pip install "torchx[dev]"
-
-# install torchx kubeflow pipelines (kfp) support
-pip install "torchx[kfp]"
-
-# install torchx Kubernetes / Volcano support
-pip install "torchx[kubernetes]"
-
-# install torchx Ray support
-pip install "torchx[ray]"
-
-# install torchx GCP Batch support
-pip install "torchx[gcp_batch]"
-```
-
-### Nightly
-
-```bash
-# install torchx sdk and CLI
-pip install torchx-nightly[dev]
-```
-
-### Source
-
-```bash
-# install torchx sdk and CLI from source
-$ pip install -e git+https://github.com/pytorch/torchx.git#egg=torchx
-
-# install extra dependencies
-$ pip install -e git+https://github.com/pytorch/torchx.git#egg=torchx[dev]
-```
-
-### Docker
-
-TorchX provides a docker container for using as as part of a TorchX role.
-
-See: https://github.com/pytorch/torchx/pkgs/container/torchx
-
-## Contributing
-
-We welcome PRs! See the [CONTRIBUTING](CONTRIBUTING.md) file.
-
-## License
-
-TorchX is BSD licensed, as found in the [LICENSE](LICENSE) file.
-
-
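
The `Requires-Dist` entries in the removed METADATA use PEP 508 markers to gate optional dependencies on extras (e.g. `boto3 ; extra == 'aws_batch'`). A small sketch of how such a marker evaluates, assuming the third-party `packaging` library is available:

```python
# Sketch: evaluate a PEP 508 extra marker like the ones in METADATA above.
# Assumes the "packaging" library is installed (pip install packaging).
from packaging.requirements import Requirement

req = Requirement("boto3; extra == 'aws_batch'")
print(req.name)                                     # boto3
print(req.marker.evaluate({"extra": "aws_batch"}))  # True: pulled in by torchx[aws_batch]
print(req.marker.evaluate({"extra": ""}))           # False: skipped for a plain install
```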