torchx-nightly 2025.4.28__py3-none-any.whl → 2025.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of torchx-nightly might be problematic.
- torchx/schedulers/ray_scheduler.py +361 -379
- torchx/specs/finder.py +4 -1
- {torchx_nightly-2025.4.28.dist-info → torchx_nightly-2025.5.1.dist-info}/METADATA +1 -1
- {torchx_nightly-2025.4.28.dist-info → torchx_nightly-2025.5.1.dist-info}/RECORD +8 -8
- {torchx_nightly-2025.4.28.dist-info → torchx_nightly-2025.5.1.dist-info}/LICENSE +0 -0
- {torchx_nightly-2025.4.28.dist-info → torchx_nightly-2025.5.1.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.4.28.dist-info → torchx_nightly-2025.5.1.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.4.28.dist-info → torchx_nightly-2025.5.1.dist-info}/top_level.txt +0 -0
torchx/schedulers/ray_scheduler.py
CHANGED

@@ -18,6 +18,10 @@ from typing import Any, cast, Dict, Final, Iterable, List, Optional, Tuple # no
 
 import urllib3
 
+from ray.autoscaler import sdk as ray_autoscaler_sdk
+from ray.dashboard.modules.job.common import JobStatus
+from ray.dashboard.modules.job.sdk import JobSubmissionClient
+
 from torchx.schedulers.api import (
     AppDryRunInfo,
     AppState,
@@ -35,22 +39,6 @@ from torchx.workspace.dir_workspace import TmpDirWorkspaceMixin
 from typing_extensions import TypedDict
 
 
-try:
-    from ray.autoscaler import sdk as ray_autoscaler_sdk
-    from ray.dashboard.modules.job.common import JobStatus
-    from ray.dashboard.modules.job.sdk import JobSubmissionClient
-
-    _has_ray = True
-
-except ImportError:
-    _has_ray = False
-
-
-def has_ray() -> bool:
-    """Indicates whether Ray is installed in the current Python environment."""
-    return _has_ray
-
-
 class RayOpts(TypedDict, total=False):
     cluster_config_file: Optional[str]
     cluster_name: Optional[str]
@@ -59,397 +47,391 @@ class RayOpts(TypedDict, total=False):
     requirements: Optional[str]
 
 
-[…old lines 62-80 not shown in the source view…]
+_logger: logging.Logger = logging.getLogger(__name__)
+
+_ray_status_to_torchx_appstate: Dict[JobStatus, AppState] = {
+    JobStatus.PENDING: AppState.PENDING,
+    JobStatus.RUNNING: AppState.RUNNING,
+    JobStatus.SUCCEEDED: AppState.SUCCEEDED,
+    JobStatus.FAILED: AppState.FAILED,
+    JobStatus.STOPPED: AppState.CANCELLED,
+}
+
+
+class _EnhancedJSONEncoder(json.JSONEncoder):
+    def default(self, o: RayActor):  # pyre-ignore[3]
+        if dataclasses.is_dataclass(o):
+            return dataclasses.asdict(o)
+        return super().default(o)
+
+
+def serialize(
+    actors: List[RayActor], dirpath: str, output_filename: str = "actors.json"
+) -> None:
+    actors_json = json.dumps(actors, cls=_EnhancedJSONEncoder)
+    with open(os.path.join(dirpath, output_filename), "w") as tmp:
+        json.dump(actors_json, tmp)
+
+
+@dataclass
+class RayJob:
+    """Represents a job that should be run on a Ray cluster.
+
+    Attributes:
+        app_id:
+            The unique ID of the application (a.k.a. job).
+        cluster_config_file:
+            The Ray cluster configuration file.
+        cluster_name:
+            The cluster name to use.
+        dashboard_address:
+            The existing dashboard IP address to connect to
+        working_dir:
+            The working directory to copy to the cluster
+        requirements:
+            The libraries to install on the cluster per requirements.txt
+        actors:
+            The Ray actors which represent the job to be run. This attribute is
+            dumped to a JSON file and copied to the cluster where `ray_main.py`
+            uses it to initiate the job.
+    """
+
+    app_id: str
+    working_dir: str
+    cluster_config_file: Optional[str] = None
+    cluster_name: Optional[str] = None
+    dashboard_address: Optional[str] = None
+    requirements: Optional[str] = None
+    actors: List[RayActor] = field(default_factory=list)
+
+
+class RayScheduler(
+    TmpDirWorkspaceMixin, Scheduler[RayOpts, AppDef, AppDryRunInfo[RayJob]]
+):
+    """
+    RayScheduler is a TorchX scheduling interface to Ray. The job def
+    workers will be launched as Ray actors
+
+    The job environment is specified by the TorchX workspace. Any files in
+    the workspace will be present in the Ray job unless specified in
+    ``.torchxignore``. Python dependencies will be read from the
+    ``requirements.txt`` file located at the root of the workspace unless
+    it's overridden via ``-c ...,requirements=foo/requirements.txt``.
+
+    **Config Options**
+
+    .. runopts::
+        class: torchx.schedulers.ray_scheduler.create_scheduler
+
+    **Compatibility**
+
+    .. compatibility::
+        type: scheduler
+        features:
+            cancel: true
+            logs: |
+                Partial support. Ray only supports a single log stream so
+                only a dummy "ray/0" combined log role is supported.
+                Tailing and time seeking are not supported.
+            distributed: true
+            describe: |
+                Partial support. RayScheduler will return job status but
+                does not provide the complete original AppSpec.
+            workspaces: true
+            mounts: false
+            elasticity: Partial support. Multi role jobs are not supported.
+
+    """
+
+    def __init__(
+        self, session_name: str, ray_client: Optional[JobSubmissionClient] = None
     ) -> None:
-[…old lines 82-97 not shown in the source view…]
-            The existing dashboard IP address to connect to
-        working_dir:
-            The working directory to copy to the cluster
-        requirements:
-            The libraries to install on the cluster per requirements.txt
-        actors:
-            The Ray actors which represent the job to be run. This attribute is
-            dumped to a JSON file and copied to the cluster where `ray_main.py`
-            uses it to initiate the job.
-    """
-
-    app_id: str
-    working_dir: str
-    cluster_config_file: Optional[str] = None
-    cluster_name: Optional[str] = None
-    dashboard_address: Optional[str] = None
-    requirements: Optional[str] = None
-    actors: List[RayActor] = field(default_factory=list)
-
-class RayScheduler(
-    TmpDirWorkspaceMixin, Scheduler[RayOpts, AppDef, AppDryRunInfo[RayJob]]
-):
-    """
-    RayScheduler is a TorchX scheduling interface to Ray. The job def
-    workers will be launched as Ray actors
-
-    The job environment is specified by the TorchX workspace. Any files in
-    the workspace will be present in the Ray job unless specified in
-    ``.torchxignore``. Python dependencies will be read from the
-    ``requirements.txt`` file located at the root of the workspace unless
-    it's overridden via ``-c ...,requirements=foo/requirements.txt``.
-
-    **Config Options**
-
-    .. runopts::
-        class: torchx.schedulers.ray_scheduler.create_scheduler
-
-    **Compatibility**
-
-    .. compatibility::
-        type: scheduler
-        features:
-            cancel: true
-            logs: |
-                Partial support. Ray only supports a single log stream so
-                only a dummy "ray/0" combined log role is supported.
-                Tailing and time seeking are not supported.
-            distributed: true
-            describe: |
-                Partial support. RayScheduler will return job status but
-                does not provide the complete original AppSpec.
-            workspaces: true
-            mounts: false
-            elasticity: Partial support. Multi role jobs are not supported.
-
-    """
-
-    def __init__(
-        self, session_name: str, ray_client: Optional[JobSubmissionClient] = None
-    ) -> None:
-        # NOTE: make sure any new init options are supported in create_scheduler(...)
-        super().__init__("ray", session_name)
-
-        # w/o Final None check in _get_ray_client does not work as it pyre assumes mutability
-        self._ray_client: Final[Optional[JobSubmissionClient]] = ray_client
-
-    def _get_ray_client(
-        self, job_submission_netloc: Optional[str] = None
-    ) -> JobSubmissionClient:
-        if self._ray_client is not None:
-            client_netloc = urllib3.util.parse_url(
-                self._ray_client.get_address()
-            ).netloc
-            if job_submission_netloc and job_submission_netloc != client_netloc:
-                raise ValueError(
-                    f"client netloc ({client_netloc}) does not match job netloc ({job_submission_netloc})"
-                )
-            return self._ray_client
-        elif os.getenv("RAY_ADDRESS"):
-            return JobSubmissionClient(os.getenv("RAY_ADDRESS"))
-        elif not job_submission_netloc:
-            raise Exception(
-                "RAY_ADDRESS env variable or a scheduler with an attached Ray JobSubmissionClient is expected."
-                " See https://docs.ray.io/en/latest/cluster/jobs-package-ref.html#job-submission-sdk for more info"
+        # NOTE: make sure any new init options are supported in create_scheduler(...)
+        super().__init__("ray", session_name)
+
+        # w/o Final None check in _get_ray_client does not work as it pyre assumes mutability
+        self._ray_client: Final[Optional[JobSubmissionClient]] = ray_client
+
+    def _get_ray_client(
+        self, job_submission_netloc: Optional[str] = None
+    ) -> JobSubmissionClient:
+        if self._ray_client is not None:
+            client_netloc = urllib3.util.parse_url(
+                self._ray_client.get_address()
+            ).netloc
+            if job_submission_netloc and job_submission_netloc != client_netloc:
+                raise ValueError(
+                    f"client netloc ({client_netloc}) does not match job netloc ({job_submission_netloc})"
                 )
-        return JobSubmissionClient(f"http://{job_submission_netloc}")
-[…old lines 184-188 not shown in the source view…]
-            "cluster_config_file",
-            type_=str,
-            required=False,
-            help="Use CLUSTER_CONFIG_FILE to access or create the Ray cluster.",
-        )
-        opts.add(
-            "cluster_name",
-            type_=str,
-            help="Override the configured cluster name.",
+            return self._ray_client
+        elif os.getenv("RAY_ADDRESS"):
+            return JobSubmissionClient(os.getenv("RAY_ADDRESS"))
+        elif not job_submission_netloc:
+            raise Exception(
+                "RAY_ADDRESS env variable or a scheduler with an attached Ray JobSubmissionClient is expected."
+                " See https://docs.ray.io/en/latest/cluster/jobs-package-ref.html#job-submission-sdk for more info"
             )
-[…old lines 199-204 not shown in the source view…]
+        return JobSubmissionClient(f"http://{job_submission_netloc}")
+
+    # TODO: Add address as a potential CLI argument after writing ray.status() or passing in config file
+    def _run_opts(self) -> runopts:
+        opts = runopts()
+        opts.add(
+            "cluster_config_file",
+            type_=str,
+            required=False,
+            help="Use CLUSTER_CONFIG_FILE to access or create the Ray cluster.",
+        )
+        opts.add(
+            "cluster_name",
+            type_=str,
+            help="Override the configured cluster name.",
+        )
+        opts.add(
+            "dashboard_address",
+            type_=str,
+            required=False,
+            default="127.0.0.1:8265",
+            help="Use ray status to get the dashboard address you will submit jobs against",
+        )
+        opts.add("requirements", type_=str, help="Path to requirements.txt")
+        return opts
+
+    def schedule(self, dryrun_info: AppDryRunInfo[RayJob]) -> str:
+        cfg: RayJob = dryrun_info.request
+
+        # Create serialized actors for ray_driver.py
+        actors = cfg.actors
+        dirpath = cfg.working_dir
+        serialize(actors, dirpath)
+
+        job_submission_addr: str = ""
+        if cfg.cluster_config_file:
+            job_submission_addr = ray_autoscaler_sdk.get_head_node_ip(
+                cfg.cluster_config_file
+            )  # pragma: no cover
+        elif cfg.dashboard_address:
+            job_submission_addr = cfg.dashboard_address
+        else:
+            raise RuntimeError(
+                "Either `dashboard_address` or `cluster_config_file` must be specified"
             )
-        opts.add("requirements", type_=str, help="Path to requirements.txt")
-        return opts
-
-    def schedule(self, dryrun_info: AppDryRunInfo[RayJob]) -> str:
-        cfg: RayJob = dryrun_info.request
-
-        # Create serialized actors for ray_driver.py
-        actors = cfg.actors
-        dirpath = cfg.working_dir
-        serialize(actors, dirpath)
-
-        job_submission_addr: str = ""
-        if cfg.cluster_config_file:
-            job_submission_addr = ray_autoscaler_sdk.get_head_node_ip(
-                cfg.cluster_config_file
-            )  # pragma: no cover
-        elif cfg.dashboard_address:
-            job_submission_addr = cfg.dashboard_address
-        else:
-            raise RuntimeError(
-                "Either `dashboard_address` or `cluster_config_file` must be specified"
-            )
-
-        # 0. Create Job Client
-        client = self._get_ray_client(job_submission_netloc=job_submission_addr)
-
-        # 1. Copy Ray driver utilities
-        current_directory = os.path.dirname(os.path.abspath(__file__))
-        copy2(os.path.join(current_directory, "ray", "ray_driver.py"), dirpath)
-        copy2(os.path.join(current_directory, "ray", "ray_common.py"), dirpath)
-        runtime_env = {"working_dir": dirpath}
-        if cfg.requirements:
-            runtime_env["pip"] = cfg.requirements
-
-        # 1. Submit Job via the Ray Job Submission API
-        try:
-            job_id: str = client.submit_job(
-                submission_id=cfg.app_id,
-                # we will pack, hash, zip, upload, register working_dir in GCS of ray cluster
-                # and use it to configure your job execution.
-                entrypoint="python3 ray_driver.py",
-                runtime_env=runtime_env,
-            )
 
-[…old lines 250-252 not shown in the source view…]
+-[…old lines 206-248 not shown in the source view…]
+        # 0. Create Job Client
+        client = self._get_ray_client(job_submission_netloc=job_submission_addr)
+
+        # 1. Copy Ray driver utilities
+        current_directory = os.path.dirname(os.path.abspath(__file__))
+        copy2(os.path.join(current_directory, "ray", "ray_driver.py"), dirpath)
+        copy2(os.path.join(current_directory, "ray", "ray_common.py"), dirpath)
+        runtime_env = {"working_dir": dirpath}
+        if cfg.requirements:
+            runtime_env["pip"] = cfg.requirements
+
+        # 1. Submit Job via the Ray Job Submission API
+        try:
+            job_id: str = client.submit_job(
+                submission_id=cfg.app_id,
+                # we will pack, hash, zip, upload, register working_dir in GCS of ray cluster
+                # and use it to configure your job execution.
+                entrypoint="python3 ray_driver.py",
+                runtime_env=runtime_env,
+            )
 
-
-+        finally:
-+            if dirpath.startswith(tempfile.gettempdir()):
-+                rmtree(dirpath)
+-
+-
++        finally:
++            if dirpath.startswith(tempfile.gettempdir()):
++                rmtree(dirpath)
 
-
-
-+        # Encode job submission client in job_id
-+        return f"{job_submission_addr}-{job_id}"
+-
+-
++        # Encode job submission client in job_id
++        return f"{job_submission_addr}-{job_id}"
 
-
-
-            raise RuntimeError(
-                f"Role image must be a valid directory, got: {working_dir} "
-            )
-+    def _submit_dryrun(self, app: AppDef, cfg: RayOpts) -> AppDryRunInfo[RayJob]:
-+        app_id = make_unique(app.name)
+-
+-
+-            raise RuntimeError(
+-                f"Role image must be a valid directory, got: {working_dir} "
+-            )
++    def _submit_dryrun(self, app: AppDef, cfg: RayOpts) -> AppDryRunInfo[RayJob]:
++        app_id = make_unique(app.name)
 
-[…old lines 266-271 not shown in the source view…]
-        cluster_cfg = cfg.get("cluster_config_file")
-        if cluster_cfg:
-            if not isinstance(cluster_cfg, str) or not os.path.isfile(cluster_cfg):
-                raise ValueError(
-                    "The cluster configuration file must be a YAML file."
-                )
-+        working_dir = app.roles[0].image
-+        if not os.path.exists(working_dir):
-+            raise RuntimeError(
-+                f"Role image must be a valid directory, got: {working_dir} "
-+            )
+-[…old lines 266-271 not shown in the source view…]
+-        cluster_cfg = cfg.get("cluster_config_file")
+-        if cluster_cfg:
+-            if not isinstance(cluster_cfg, str) or not os.path.isfile(cluster_cfg):
+-                raise ValueError(
+-                    "The cluster configuration file must be a YAML file."
+-                )
++        working_dir = app.roles[0].image
++        if not os.path.exists(working_dir):
++            raise RuntimeError(
++                f"Role image must be a valid directory, got: {working_dir} "
++            )
 
-[…old lines 279-284 not shown in the source view…]
-+        requirements: Optional[str] = cfg.get("requirements")
-+        if requirements is None:
-+            workspace_reqs = os.path.join(working_dir, "requirements.txt")
-+            if os.path.exists(workspace_reqs):
-+                requirements = workspace_reqs
-+
-+        cluster_cfg = cfg.get("cluster_config_file")
-+        if cluster_cfg:
-+            if not isinstance(cluster_cfg, str) or not os.path.isfile(cluster_cfg):
-+                raise ValueError("The cluster configuration file must be a YAML file.")
-+
-+            job: RayJob = RayJob(
-+                app_id,
-+                cluster_config_file=cluster_cfg,
-+                requirements=requirements,
-+                working_dir=working_dir,
-+            )
+-[…old lines 279-284 not shown in the source view…]
++        requirements: Optional[str] = cfg.get("requirements")
++        if requirements is None:
++            workspace_reqs = os.path.join(working_dir, "requirements.txt")
++            if os.path.exists(workspace_reqs):
++                requirements = workspace_reqs
++
++        cluster_cfg = cfg.get("cluster_config_file")
++        if cluster_cfg:
++            if not isinstance(cluster_cfg, str) or not os.path.isfile(cluster_cfg):
++                raise ValueError("The cluster configuration file must be a YAML file.")
++
++            job: RayJob = RayJob(
++                app_id,
++                cluster_config_file=cluster_cfg,
++                requirements=requirements,
++                working_dir=working_dir,
++            )
 
-[…old lines 286-288 not shown in the source view…]
-+        else:  # pragma: no cover
-+            dashboard_address = cfg.get("dashboard_address")
-+            job: RayJob = RayJob(
-+                app_id=app_id,
-+                dashboard_address=dashboard_address,
-+                requirements=requirements,
-+                working_dir=working_dir,
-+            )
-+        job.cluster_name = cfg.get("cluster_name")
-+
-+        for role in app.roles:
-+            for replica_id in range(role.num_replicas):
-+                # Replace the ${img_root}, ${app_id}, and ${replica_id} placeholders
-+                # in arguments and environment variables.
-+                replica_role = macros.Values(
-+                    img_root=role.image,
-                    app_id=app_id,
-[…old lines 290-292 not shown in the source view…]
-+                    replica_id=str(replica_id),
-+                    rank0_env=TORCHX_RANK0_HOST,
-+                ).apply(role)
-+
-+                actor = RayActor(
-+                    name=role.name,
-+                    min_replicas=role.min_replicas,
-+                    command=[replica_role.entrypoint] + replica_role.args,
-+                    env=replica_role.env,
-+                    num_cpus=max(1, replica_role.resource.cpu),
-+                    num_gpus=max(0, replica_role.resource.gpu),
-                )
-        job.cluster_name = cfg.get("cluster_name")
-
-        for role in app.roles:
-            for replica_id in range(role.num_replicas):
-                # Replace the ${img_root}, ${app_id}, and ${replica_id} placeholders
-                # in arguments and environment variables.
-                replica_role = macros.Values(
-                    img_root=role.image,
-                    app_id=app_id,
-                    replica_id=str(replica_id),
-                    rank0_env=TORCHX_RANK0_HOST,
-                ).apply(role)
-
-                actor = RayActor(
-                    name=role.name,
-                    min_replicas=role.min_replicas,
-                    command=[replica_role.entrypoint] + replica_role.args,
-                    env=replica_role.env,
-                    num_cpus=max(1, replica_role.resource.cpu),
-                    num_gpus=max(0, replica_role.resource.gpu),
-                )
+-[…old lines 286-288 not shown in the source view…]
++        else:  # pragma: no cover
++            dashboard_address = cfg.get("dashboard_address")
++            job: RayJob = RayJob(
++                app_id=app_id,
++                dashboard_address=dashboard_address,
++                requirements=requirements,
++                working_dir=working_dir,
++            )
++        job.cluster_name = cfg.get("cluster_name")
++
++        for role in app.roles:
++            for replica_id in range(role.num_replicas):
++                # Replace the ${img_root}, ${app_id}, and ${replica_id} placeholders
++                # in arguments and environment variables.
++                replica_role = macros.Values(
++                    img_root=role.image,
+                    app_id=app_id,
+-[…old lines 290-292 not shown in the source view…]
++                    replica_id=str(replica_id),
++                    rank0_env=TORCHX_RANK0_HOST,
++                ).apply(role)
++
++                actor = RayActor(
++                    name=role.name,
++                    min_replicas=role.min_replicas,
++                    command=[replica_role.entrypoint] + replica_role.args,
++                    env=replica_role.env,
++                    num_cpus=max(1, replica_role.resource.cpu),
++                    num_gpus=max(0, replica_role.resource.gpu),
+                )
+-        job.cluster_name = cfg.get("cluster_name")
+-
+-        for role in app.roles:
+-            for replica_id in range(role.num_replicas):
+-                # Replace the ${img_root}, ${app_id}, and ${replica_id} placeholders
+-                # in arguments and environment variables.
+-                replica_role = macros.Values(
+-                    img_root=role.image,
+-                    app_id=app_id,
+-                    replica_id=str(replica_id),
+-                    rank0_env=TORCHX_RANK0_HOST,
+-                ).apply(role)
+-
+-                actor = RayActor(
+-                    name=role.name,
+-                    min_replicas=role.min_replicas,
+-                    command=[replica_role.entrypoint] + replica_role.args,
+-                    env=replica_role.env,
+-                    num_cpus=max(1, replica_role.resource.cpu),
+-                    num_gpus=max(0, replica_role.resource.gpu),
+-                )
 
-
-+                job.actors.append(actor)
+-
++                job.actors.append(actor)
 
-
-
-+        if len(app.roles) > 1 and app.roles[0].min_replicas is not None:
-+            raise ValueError("min_replicas is only supported with single role jobs")
+-
+-
++        if len(app.roles) > 1 and app.roles[0].min_replicas is not None:
++            raise ValueError("min_replicas is only supported with single role jobs")
 
-
-+        return AppDryRunInfo(job, repr)
+-
++        return AppDryRunInfo(job, repr)
 
-[…old lines 323-326 not shown in the source view…]
-+    def _validate(self, app: AppDef, scheduler: str, cfg: RayOpts) -> None:
-+        if scheduler != "ray":
-+            raise ValueError(
-+                f"An unknown scheduler backend '{scheduler}' has been passed to the Ray scheduler."
-+            )
-+
-+        if app.metadata:
-+            _logger.warning("The Ray scheduler does not use metadata information.")
-+
-+        for role in app.roles:
-+            if role.resource.capabilities:
-+                _logger.warning(
-+                    "The Ray scheduler does not support custom resource capabilities."
-                )
-+                break
+-[…old lines 323-326 not shown in the source view…]
++    def _validate(self, app: AppDef, scheduler: str, cfg: RayOpts) -> None:
++        if scheduler != "ray":
++            raise ValueError(
++                f"An unknown scheduler backend '{scheduler}' has been passed to the Ray scheduler."
++            )
++
++        if app.metadata:
++            _logger.warning("The Ray scheduler does not use metadata information.")
++
++        for role in app.roles:
++            if role.resource.capabilities:
++                _logger.warning(
++                    "The Ray scheduler does not support custom resource capabilities."
+                )
++                break
 
-
-
-+        for role in app.roles:
-+            if role.port_map:
-+                _logger.warning("The Ray scheduler does not support port mapping.")
-+                break
+-
+-
++        for role in app.roles:
++            if role.port_map:
++                _logger.warning("The Ray scheduler does not support port mapping.")
++                break
 
-[…old lines 332-353 not shown in the source view…]
-            status = status_info
-            if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
-                break
-            time.sleep(1)
-
-    def _parse_app_id(self, app_id: str) -> Tuple[str, str]:
-        # find index of '-' in the first :\d+-
-        m = re.search(r":\d+-", app_id)
-        if m:
-            sep = m.span()[1]
-            addr = app_id[: sep - 1]
-            app_id = app_id[sep:]
-            return addr, app_id
-
-        addr, _, app_id = app_id.partition("-")
-+    def wait_until_finish(self, app_id: str, timeout: int = 30) -> None:
-+        """
-+        ``wait_until_finish`` waits until the specified job has finished
-+        with a given timeout. This is intended for testing. Programmatic
-+        usage should use the runner wait method instead.
-+        """
-+
-+        start = time.time()
-+        while time.time() - start <= timeout:
-+            status_info = self._get_job_status(app_id)
-+            status = status_info
-+            if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
-+                break
-+            time.sleep(1)
-+
-+    def _parse_app_id(self, app_id: str) -> Tuple[str, str]:
-+        # find index of '-' in the first :\d+-
-+        m = re.search(r":\d+-", app_id)
-+        if m:
-+            sep = m.span()[1]
-+            addr = app_id[: sep - 1]
-+            app_id = app_id[sep:]
-            return addr, app_id
+-[…old lines 332-353 not shown in the source view…]
+-            status = status_info
+-            if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
+-                break
+-            time.sleep(1)
+-
+-    def _parse_app_id(self, app_id: str) -> Tuple[str, str]:
+-        # find index of '-' in the first :\d+-
+-        m = re.search(r":\d+-", app_id)
+-        if m:
+-            sep = m.span()[1]
+-            addr = app_id[: sep - 1]
+-            app_id = app_id[sep:]
+-            return addr, app_id
+-
+-        addr, _, app_id = app_id.partition("-")
++    def wait_until_finish(self, app_id: str, timeout: int = 30) -> None:
++        """
++        ``wait_until_finish`` waits until the specified job has finished
++        with a given timeout. This is intended for testing. Programmatic
++        usage should use the runner wait method instead.
++        """
++
++        start = time.time()
++        while time.time() - start <= timeout:
++            status_info = self._get_job_status(app_id)
++            status = status_info
++            if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
++                break
++            time.sleep(1)
++
++    def _parse_app_id(self, app_id: str) -> Tuple[str, str]:
++        # find index of '-' in the first :\d+-
++        m = re.search(r":\d+-", app_id)
++        if m:
++            sep = m.span()[1]
++            addr = app_id[: sep - 1]
++            app_id = app_id[sep:]
+            return addr, app_id
 
-[…old lines 371-404 not shown in the source view…]
-            app_id=app_id,
-            state=state,
-            msg=job_status_info,
-            roles_statuses=roles_statuses,
-            roles=roles,
-+        addr, _, app_id = app_id.partition("-")
-+        return addr, app_id
-+
-+    def _cancel_existing(self, app_id: str) -> None:  # pragma: no cover
-+        addr, app_id = self._parse_app_id(app_id)
-+        client = self._get_ray_client(job_submission_netloc=addr)
-+        client.stop_job(app_id)
-+
-+    def _get_job_status(self, app_id: str) -> JobStatus:
-+        addr, app_id = self._parse_app_id(app_id)
-+        client = self._get_ray_client(job_submission_netloc=addr)
-+        status = client.get_job_status(app_id)
-+        if isinstance(status, str):
-+            return cast(JobStatus, status)
-+        return status.status
-+
-+    def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
-+        job_status_info = self._get_job_status(app_id)
-+        state = _ray_status_to_torchx_appstate[job_status_info]
-+        roles = [Role(name="ray", num_replicas=1, image="<N/A>")]
-+
-+        # get ip_address and put it in hostname
-+
-+        roles_statuses = [
-+            RoleStatus(
-+                role="ray",
-+                replicas=[
-+                    ReplicaStatus(
-+                        id=0,
-+                        role="ray",
-+                        hostname=NONE,
-+                        state=state,
-+                    )
-+                ],
-            )
-+        ]
-+        return DescribeAppResponse(
-+            app_id=app_id,
-+            state=state,
-+            msg=job_status_info,
-+            roles_statuses=roles_statuses,
-+            roles=roles,
-+        )
+-[…old lines 371-404 not shown in the source view…]
+-            app_id=app_id,
+-            state=state,
+-            msg=job_status_info,
+-            roles_statuses=roles_statuses,
+-            roles=roles,
++        addr, _, app_id = app_id.partition("-")
++        return addr, app_id
++
++    def _cancel_existing(self, app_id: str) -> None:  # pragma: no cover
++        addr, app_id = self._parse_app_id(app_id)
++        client = self._get_ray_client(job_submission_netloc=addr)
++        client.stop_job(app_id)
++
++    def _get_job_status(self, app_id: str) -> JobStatus:
++        addr, app_id = self._parse_app_id(app_id)
++        client = self._get_ray_client(job_submission_netloc=addr)
++        status = client.get_job_status(app_id)
++        if isinstance(status, str):
++            return cast(JobStatus, status)
++        return status.status
++
++    def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
++        job_status_info = self._get_job_status(app_id)
++        state = _ray_status_to_torchx_appstate[job_status_info]
++        roles = [Role(name="ray", num_replicas=1, image="<N/A>")]
++
++        # get ip_address and put it in hostname
++
++        roles_statuses = [
++            RoleStatus(
++                role="ray",
++                replicas=[
++                    ReplicaStatus(
++                        id=0,
++                        role="ray",
++                        hostname=NONE,
++                        state=state,
++                    )
++                ],
+            )
++        ]
++        return DescribeAppResponse(
++            app_id=app_id,
++            state=state,
++            msg=job_status_info,
++            roles_statuses=roles_statuses,
++            roles=roles,
++        )
 
-[…old lines 412-426 not shown in the source view…]
-+    def log_iter(
-+        self,
-+        app_id: str,
-+        role_name: Optional[str] = None,
-+        k: int = 0,
-+        regex: Optional[str] = None,
-+        since: Optional[datetime] = None,
-+        until: Optional[datetime] = None,
-+        should_tail: bool = False,
-+        streams: Optional[Stream] = None,
-+    ) -> Iterable[str]:
-+        # TODO: support tailing, streams etc..
-+        addr, app_id = self._parse_app_id(app_id)
-+        client: JobSubmissionClient = self._get_ray_client(job_submission_netloc=addr)
-+        logs: str = client.get_job_logs(app_id)
-+        iterator = split_lines(logs)
-+        if regex:
-+            return filter_regex(regex, iterator)
-+        return iterator
-+
-+    def list(self) -> List[ListAppResponse]:
-+        client = self._get_ray_client()
-+        jobs = client.list_jobs()
-+        netloc = urllib3.util.parse_url(client.get_address()).netloc
-+        return [
-+            ListAppResponse(
-+                app_id=f"{netloc}-{details.submission_id}",
-+                state=_ray_status_to_torchx_appstate[details.status],
-            )
-
-
-        if regex:
-            return filter_regex(regex, iterator)
-        return iterator
-
-    def list(self) -> List[ListAppResponse]:
-        client = self._get_ray_client()
-        jobs = client.list_jobs()
-        netloc = urllib3.util.parse_url(client.get_address()).netloc
-        return [
-            ListAppResponse(
-                app_id=f"{netloc}-{details.submission_id}",
-                state=_ray_status_to_torchx_appstate[details.status],
-            )
-            for details in jobs
-        ]
-+            for details in jobs
-+        ]
+-[…old lines 412-426 not shown in the source view…]
++    def log_iter(
++        self,
++        app_id: str,
++        role_name: Optional[str] = None,
++        k: int = 0,
++        regex: Optional[str] = None,
++        since: Optional[datetime] = None,
++        until: Optional[datetime] = None,
++        should_tail: bool = False,
++        streams: Optional[Stream] = None,
++    ) -> Iterable[str]:
++        # TODO: support tailing, streams etc..
++        addr, app_id = self._parse_app_id(app_id)
++        client: JobSubmissionClient = self._get_ray_client(job_submission_netloc=addr)
++        logs: str = client.get_job_logs(app_id)
++        iterator = split_lines(logs)
++        if regex:
++            return filter_regex(regex, iterator)
++        return iterator
++
++    def list(self) -> List[ListAppResponse]:
++        client = self._get_ray_client()
++        jobs = client.list_jobs()
++        netloc = urllib3.util.parse_url(client.get_address()).netloc
++        return [
++            ListAppResponse(
++                app_id=f"{netloc}-{details.submission_id}",
++                state=_ray_status_to_torchx_appstate[details.status],
+            )
+-
+-
+-        if regex:
+-            return filter_regex(regex, iterator)
+-        return iterator
+-
+-    def list(self) -> List[ListAppResponse]:
+-        client = self._get_ray_client()
+-        jobs = client.list_jobs()
+-        netloc = urllib3.util.parse_url(client.get_address()).netloc
+-        return [
+-            ListAppResponse(
+-                app_id=f"{netloc}-{details.submission_id}",
+-                state=_ray_status_to_torchx_appstate[details.status],
+-            )
+-            for details in jobs
+-        ]
++            for details in jobs
++        ]
 
 
 def create_scheduler(
     session_name: str, ray_client: Optional[JobSubmissionClient] = None, **kwargs: Any
 ) -> "RayScheduler":
-    if not has_ray():  # pragma: no cover
-        raise ModuleNotFoundError(
-            "Ray is not installed in the current Python environment."
-        )
-
     return RayScheduler(session_name=session_name, ray_client=ray_client)
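Net effect of the ray_scheduler.py change: the Ray imports (`ray_autoscaler_sdk`, `JobStatus`, `JobSubmissionClient`) are now unconditional module-level imports, and the `try`/`except ImportError` guard, the `has_ray()` helper, and the `has_ray()` check inside `create_scheduler` are gone. Importing `torchx.schedulers.ray_scheduler` therefore fails immediately when Ray is not installed. A minimal sketch of how calling code could reproduce the old probing behavior itself; the `make_ray_scheduler_if_available` wrapper below is illustrative and not part of torchx:

```python
# Sketch only: guard the import at the call site now that has_ray() is removed.
from typing import Any, Optional


def make_ray_scheduler_if_available(session_name: str) -> Optional[Any]:
    try:
        # As of 2025.5.1 this import requires ray to be installed, because the
        # ray imports in torchx/schedulers/ray_scheduler.py are unconditional.
        from torchx.schedulers.ray_scheduler import create_scheduler
    except ImportError:
        return None
    # create_scheduler(session_name, ray_client=None, **kwargs) per this diff.
    return create_scheduler(session_name)
```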
torchx/specs/finder.py
CHANGED
@@ -7,6 +7,7 @@
 # pyre-strict
 
 import abc
+import copy
 import importlib
 import inspect
 import logging
@@ -281,7 +282,9 @@ class CustomComponentsFinder(ComponentsFinder):
             )
 
         file_source = read_conf_file(self._filepath)
-        namespace = globals()
+        namespace = copy.copy(globals())
+        # so that __file__ used inside the component points to the correct file
+        namespace["__file__"] = os.path.abspath(self._filepath)
         exec(file_source, namespace)  # noqa: P204
         if self._function_name not in namespace:
             raise ComponentNotFoundException(
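The finder.py change copies `globals()` and overrides `__file__` before exec'ing a custom component file, so `__file__` inside the component resolves to the component's own path instead of to finder.py, and the finder module's real globals are not mutated by the exec. A standalone sketch of the same pattern; the `load_component_namespace` helper is illustrative and not torchx API:

```python
# Standalone sketch of the exec-with-copied-globals pattern used above.
import copy
import os


def load_component_namespace(filepath: str) -> dict:
    with open(filepath) as f:
        source = f.read()
    # Copy globals so the exec'd component cannot mutate this module's namespace.
    namespace = copy.copy(globals())
    # Point __file__ at the component file so paths relative to it resolve correctly.
    namespace["__file__"] = os.path.abspath(filepath)
    exec(source, namespace)  # runs the component file's top-level code
    return namespace
```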
{torchx_nightly-2025.4.28.dist-info → torchx_nightly-2025.5.1.dist-info}/RECORD
CHANGED

@@ -76,7 +76,7 @@ torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=-NHxKAW9bGnQ-4hpFhciZTlFJr
 torchx/schedulers/kubernetes_scheduler.py,sha256=7AR3ccfta0NXqahxz9LVrv-vkdZnYTAHzw-sh_aLNDs,28242
 torchx/schedulers/local_scheduler.py,sha256=JMSGAO9RXeUiEz8BOTA_EnHDOd065oJ_tyV1E__m3OQ,41882
 torchx/schedulers/lsf_scheduler.py,sha256=e6BmJC6dNNNzzwATgJu5Sq4HxAPw_hI3EJFRojzAMlE,17690
-torchx/schedulers/ray_scheduler.py,sha256=…
+torchx/schedulers/ray_scheduler.py,sha256=9Sqesw3aOw_Z0gua2TY3aYE3OJ9MCi75hqVl_RUQwQY,15750
 torchx/schedulers/slurm_scheduler.py,sha256=RC1ze2w0oaoQDLgercW7yHz1rGv5FVB6em4HYbLmQRg,19434
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
 torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
@@ -86,7 +86,7 @@ torchx/specs/__init__.py,sha256=c2ALDbqHIhNBhrYxwXXURRwu1Rg5jcwukWF8emEO1Bk,6347
 torchx/specs/api.py,sha256=jtasrQUy_6-AmZxsfZ_6J-kfUGKYsO5cVsrCP3imZ-I,38844
 torchx/specs/builders.py,sha256=f5Yy8KoL2OgPUiqJRkZ4E6lboq5Srkh5mD17F0EBdeg,10506
 torchx/specs/file_linter.py,sha256=QCwob5STTBuy8RsxaevTI-Dk6R8siDJn81LyaOwazes,12333
-torchx/specs/finder.py,sha256=…
+torchx/specs/finder.py,sha256=GseAruZBuTdQHWhnxqjE0SsyfCDxzg00qK73k-b47NA,17447
 torchx/specs/named_resources_aws.py,sha256=ISjHtifRJqB8u7PeAMiyLyO_S0WCaZiK-CFF3qe6JDU,11415
 torchx/specs/named_resources_generic.py,sha256=Sg4tAdqiiWDrDz2Lj_pnfsjzGIXKTou73wPseh6j55w,2646
 torchx/specs/test/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
@@ -115,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=PtDkGTC5lX03pRoYpuMz2KCmM1ZOycRP1UknqvNb97Y,6341
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.…
-torchx_nightly-2025.…
-torchx_nightly-2025.…
-torchx_nightly-2025.…
-torchx_nightly-2025.…
-torchx_nightly-2025.…
+torchx_nightly-2025.5.1.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.5.1.dist-info/METADATA,sha256=WiS59n2Mm_YFbAEEWQ30PEiGEieyKsq0NuvwJZ5ghl8,6166
+torchx_nightly-2025.5.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+torchx_nightly-2025.5.1.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.5.1.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.5.1.dist-info/RECORD,,