torchx-nightly 2025.4.29__py3-none-any.whl → 2025.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchx-nightly might be problematic.

torchx/schedulers/ray_scheduler.py CHANGED
@@ -18,6 +18,10 @@ from typing import Any, cast, Dict, Final, Iterable, List, Optional, Tuple # no
 
 import urllib3
 
+from ray.autoscaler import sdk as ray_autoscaler_sdk
+from ray.dashboard.modules.job.common import JobStatus
+from ray.dashboard.modules.job.sdk import JobSubmissionClient
+
 from torchx.schedulers.api import (
     AppDryRunInfo,
     AppState,
@@ -35,22 +39,6 @@ from torchx.workspace.dir_workspace import TmpDirWorkspaceMixin
 from typing_extensions import TypedDict
 
 
-try:
-    from ray.autoscaler import sdk as ray_autoscaler_sdk
-    from ray.dashboard.modules.job.common import JobStatus
-    from ray.dashboard.modules.job.sdk import JobSubmissionClient
-
-    _has_ray = True
-
-except ImportError:
-    _has_ray = False
-
-
-def has_ray() -> bool:
-    """Indicates whether Ray is installed in the current Python environment."""
-    return _has_ray
-
-
 class RayOpts(TypedDict, total=False):
     cluster_config_file: Optional[str]
     cluster_name: Optional[str]
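
Taken together, these two hunks make Ray a hard dependency of this module: the imports move to module scope, `has_ray()` is removed, and importing `torchx.schedulers.ray_scheduler` now fails immediately when Ray is absent instead of deferring the check. A minimal sketch of how a caller that previously relied on `has_ray()` might adapt (the `_RAY_AVAILABLE` flag and try/except wrapper are illustrative, not part of torchx):

    # Sketch: guard the import yourself now that has_ray() is gone.
    # _RAY_AVAILABLE is a hypothetical local flag, not a torchx API.
    try:
        from torchx.schedulers.ray_scheduler import RayScheduler

        _RAY_AVAILABLE = True
    except ImportError:  # ray (or one of its dashboard submodules) is not installed
        RayScheduler = None
        _RAY_AVAILABLE = False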
@@ -59,397 +47,391 @@ class RayOpts(TypedDict, total=False):
     requirements: Optional[str]
 
 
-if _has_ray:
-    _logger: logging.Logger = logging.getLogger(__name__)
-
-    _ray_status_to_torchx_appstate: Dict[JobStatus, AppState] = {
-        JobStatus.PENDING: AppState.PENDING,
-        JobStatus.RUNNING: AppState.RUNNING,
-        JobStatus.SUCCEEDED: AppState.SUCCEEDED,
-        JobStatus.FAILED: AppState.FAILED,
-        JobStatus.STOPPED: AppState.CANCELLED,
-    }
-
-    class _EnhancedJSONEncoder(json.JSONEncoder):
-        def default(self, o: RayActor):  # pyre-ignore[3]
-            if dataclasses.is_dataclass(o):
-                return dataclasses.asdict(o)
-            return super().default(o)
-
-    def serialize(
-        actors: List[RayActor], dirpath: str, output_filename: str = "actors.json"
+_logger: logging.Logger = logging.getLogger(__name__)
+
+_ray_status_to_torchx_appstate: Dict[JobStatus, AppState] = {
+    JobStatus.PENDING: AppState.PENDING,
+    JobStatus.RUNNING: AppState.RUNNING,
+    JobStatus.SUCCEEDED: AppState.SUCCEEDED,
+    JobStatus.FAILED: AppState.FAILED,
+    JobStatus.STOPPED: AppState.CANCELLED,
+}
+
+
+class _EnhancedJSONEncoder(json.JSONEncoder):
+    def default(self, o: RayActor):  # pyre-ignore[3]
+        if dataclasses.is_dataclass(o):
+            return dataclasses.asdict(o)
+        return super().default(o)
+
+
+def serialize(
+    actors: List[RayActor], dirpath: str, output_filename: str = "actors.json"
+) -> None:
+    actors_json = json.dumps(actors, cls=_EnhancedJSONEncoder)
+    with open(os.path.join(dirpath, output_filename), "w") as tmp:
+        json.dump(actors_json, tmp)
+
+
+@dataclass
+class RayJob:
+    """Represents a job that should be run on a Ray cluster.
+
+    Attributes:
+        app_id:
+            The unique ID of the application (a.k.a. job).
+        cluster_config_file:
+            The Ray cluster configuration file.
+        cluster_name:
+            The cluster name to use.
+        dashboard_address:
+            The existing dashboard IP address to connect to
+        working_dir:
+            The working directory to copy to the cluster
+        requirements:
+            The libraries to install on the cluster per requirements.txt
+        actors:
+            The Ray actors which represent the job to be run. This attribute is
+            dumped to a JSON file and copied to the cluster where `ray_main.py`
+            uses it to initiate the job.
+    """
+
+    app_id: str
+    working_dir: str
+    cluster_config_file: Optional[str] = None
+    cluster_name: Optional[str] = None
+    dashboard_address: Optional[str] = None
+    requirements: Optional[str] = None
+    actors: List[RayActor] = field(default_factory=list)
+
+
+class RayScheduler(
+    TmpDirWorkspaceMixin, Scheduler[RayOpts, AppDef, AppDryRunInfo[RayJob]]
+):
+    """
+    RayScheduler is a TorchX scheduling interface to Ray. The job def
+    workers will be launched as Ray actors
+
+    The job environment is specified by the TorchX workspace. Any files in
+    the workspace will be present in the Ray job unless specified in
+    ``.torchxignore``. Python dependencies will be read from the
+    ``requirements.txt`` file located at the root of the workspace unless
+    it's overridden via ``-c ...,requirements=foo/requirements.txt``.
+
+    **Config Options**
+
+    .. runopts::
+        class: torchx.schedulers.ray_scheduler.create_scheduler
+
+    **Compatibility**
+
+    .. compatibility::
+        type: scheduler
+        features:
+            cancel: true
+            logs: |
+                Partial support. Ray only supports a single log stream so
+                only a dummy "ray/0" combined log role is supported.
+                Tailing and time seeking are not supported.
+            distributed: true
+            describe: |
+                Partial support. RayScheduler will return job status but
+                does not provide the complete original AppSpec.
+            workspaces: true
+            mounts: false
+            elasticity: Partial support. Multi role jobs are not supported.
+
+    """
+
+    def __init__(
+        self, session_name: str, ray_client: Optional[JobSubmissionClient] = None
     ) -> None:
-        actors_json = json.dumps(actors, cls=_EnhancedJSONEncoder)
-        with open(os.path.join(dirpath, output_filename), "w") as tmp:
-            json.dump(actors_json, tmp)
-
-    @dataclass
-    class RayJob:
-        """Represents a job that should be run on a Ray cluster.
-
-        Attributes:
-            app_id:
-                The unique ID of the application (a.k.a. job).
-            cluster_config_file:
-                The Ray cluster configuration file.
-            cluster_name:
-                The cluster name to use.
-            dashboard_address:
-                The existing dashboard IP address to connect to
-            working_dir:
-                The working directory to copy to the cluster
-            requirements:
-                The libraries to install on the cluster per requirements.txt
-            actors:
-                The Ray actors which represent the job to be run. This attribute is
-                dumped to a JSON file and copied to the cluster where `ray_main.py`
-                uses it to initiate the job.
-        """
-
-        app_id: str
-        working_dir: str
-        cluster_config_file: Optional[str] = None
-        cluster_name: Optional[str] = None
-        dashboard_address: Optional[str] = None
-        requirements: Optional[str] = None
-        actors: List[RayActor] = field(default_factory=list)
-
-    class RayScheduler(
-        TmpDirWorkspaceMixin, Scheduler[RayOpts, AppDef, AppDryRunInfo[RayJob]]
-    ):
-        """
-        RayScheduler is a TorchX scheduling interface to Ray. The job def
-        workers will be launched as Ray actors
-
-        The job environment is specified by the TorchX workspace. Any files in
-        the workspace will be present in the Ray job unless specified in
-        ``.torchxignore``. Python dependencies will be read from the
-        ``requirements.txt`` file located at the root of the workspace unless
-        it's overridden via ``-c ...,requirements=foo/requirements.txt``.
-
-        **Config Options**
-
-        .. runopts::
-            class: torchx.schedulers.ray_scheduler.create_scheduler
-
-        **Compatibility**
-
-        .. compatibility::
-            type: scheduler
-            features:
-                cancel: true
-                logs: |
-                    Partial support. Ray only supports a single log stream so
-                    only a dummy "ray/0" combined log role is supported.
-                    Tailing and time seeking are not supported.
-                distributed: true
-                describe: |
-                    Partial support. RayScheduler will return job status but
-                    does not provide the complete original AppSpec.
-                workspaces: true
-                mounts: false
-                elasticity: Partial support. Multi role jobs are not supported.
-
-        """
-
-        def __init__(
-            self, session_name: str, ray_client: Optional[JobSubmissionClient] = None
-        ) -> None:
-            # NOTE: make sure any new init options are supported in create_scheduler(...)
-            super().__init__("ray", session_name)
-
-            # w/o Final None check in _get_ray_client does not work as it pyre assumes mutability
-            self._ray_client: Final[Optional[JobSubmissionClient]] = ray_client
-
-        def _get_ray_client(
-            self, job_submission_netloc: Optional[str] = None
-        ) -> JobSubmissionClient:
-            if self._ray_client is not None:
-                client_netloc = urllib3.util.parse_url(
-                    self._ray_client.get_address()
-                ).netloc
-                if job_submission_netloc and job_submission_netloc != client_netloc:
-                    raise ValueError(
-                        f"client netloc ({client_netloc}) does not match job netloc ({job_submission_netloc})"
-                    )
-                return self._ray_client
-            elif os.getenv("RAY_ADDRESS"):
-                return JobSubmissionClient(os.getenv("RAY_ADDRESS"))
-            elif not job_submission_netloc:
-                raise Exception(
-                    "RAY_ADDRESS env variable or a scheduler with an attached Ray JobSubmissionClient is expected."
-                    " See https://docs.ray.io/en/latest/cluster/jobs-package-ref.html#job-submission-sdk for more info"
+        # NOTE: make sure any new init options are supported in create_scheduler(...)
+        super().__init__("ray", session_name)
+
+        # w/o Final None check in _get_ray_client does not work as it pyre assumes mutability
+        self._ray_client: Final[Optional[JobSubmissionClient]] = ray_client
+
+    def _get_ray_client(
+        self, job_submission_netloc: Optional[str] = None
+    ) -> JobSubmissionClient:
+        if self._ray_client is not None:
+            client_netloc = urllib3.util.parse_url(
+                self._ray_client.get_address()
+            ).netloc
+            if job_submission_netloc and job_submission_netloc != client_netloc:
+                raise ValueError(
+                    f"client netloc ({client_netloc}) does not match job netloc ({job_submission_netloc})"
                 )
-            return JobSubmissionClient(f"http://{job_submission_netloc}")
-
-        # TODO: Add address as a potential CLI argument after writing ray.status() or passing in config file
-        def _run_opts(self) -> runopts:
-            opts = runopts()
-            opts.add(
-                "cluster_config_file",
-                type_=str,
-                required=False,
-                help="Use CLUSTER_CONFIG_FILE to access or create the Ray cluster.",
-            )
-            opts.add(
-                "cluster_name",
-                type_=str,
-                help="Override the configured cluster name.",
+            return self._ray_client
+        elif os.getenv("RAY_ADDRESS"):
+            return JobSubmissionClient(os.getenv("RAY_ADDRESS"))
+        elif not job_submission_netloc:
+            raise Exception(
+                "RAY_ADDRESS env variable or a scheduler with an attached Ray JobSubmissionClient is expected."
+                " See https://docs.ray.io/en/latest/cluster/jobs-package-ref.html#job-submission-sdk for more info"
             )
-            opts.add(
-                "dashboard_address",
-                type_=str,
-                required=False,
-                default="127.0.0.1:8265",
-                help="Use ray status to get the dashboard address you will submit jobs against",
+        return JobSubmissionClient(f"http://{job_submission_netloc}")
+
+    # TODO: Add address as a potential CLI argument after writing ray.status() or passing in config file
+    def _run_opts(self) -> runopts:
+        opts = runopts()
+        opts.add(
+            "cluster_config_file",
+            type_=str,
+            required=False,
+            help="Use CLUSTER_CONFIG_FILE to access or create the Ray cluster.",
+        )
+        opts.add(
+            "cluster_name",
+            type_=str,
+            help="Override the configured cluster name.",
+        )
+        opts.add(
+            "dashboard_address",
+            type_=str,
+            required=False,
+            default="127.0.0.1:8265",
+            help="Use ray status to get the dashboard address you will submit jobs against",
+        )
+        opts.add("requirements", type_=str, help="Path to requirements.txt")
+        return opts
+
+    def schedule(self, dryrun_info: AppDryRunInfo[RayJob]) -> str:
+        cfg: RayJob = dryrun_info.request
+
+        # Create serialized actors for ray_driver.py
+        actors = cfg.actors
+        dirpath = cfg.working_dir
+        serialize(actors, dirpath)
+
+        job_submission_addr: str = ""
+        if cfg.cluster_config_file:
+            job_submission_addr = ray_autoscaler_sdk.get_head_node_ip(
+                cfg.cluster_config_file
+            )  # pragma: no cover
+        elif cfg.dashboard_address:
+            job_submission_addr = cfg.dashboard_address
+        else:
+            raise RuntimeError(
+                "Either `dashboard_address` or `cluster_config_file` must be specified"
             )
-            opts.add("requirements", type_=str, help="Path to requirements.txt")
-            return opts
-
-        def schedule(self, dryrun_info: AppDryRunInfo[RayJob]) -> str:
-            cfg: RayJob = dryrun_info.request
-
-            # Create serialized actors for ray_driver.py
-            actors = cfg.actors
-            dirpath = cfg.working_dir
-            serialize(actors, dirpath)
-
-            job_submission_addr: str = ""
-            if cfg.cluster_config_file:
-                job_submission_addr = ray_autoscaler_sdk.get_head_node_ip(
-                    cfg.cluster_config_file
-                )  # pragma: no cover
-            elif cfg.dashboard_address:
-                job_submission_addr = cfg.dashboard_address
-            else:
-                raise RuntimeError(
-                    "Either `dashboard_address` or `cluster_config_file` must be specified"
-                )
-
-            # 0. Create Job Client
-            client = self._get_ray_client(job_submission_netloc=job_submission_addr)
-
-            # 1. Copy Ray driver utilities
-            current_directory = os.path.dirname(os.path.abspath(__file__))
-            copy2(os.path.join(current_directory, "ray", "ray_driver.py"), dirpath)
-            copy2(os.path.join(current_directory, "ray", "ray_common.py"), dirpath)
-            runtime_env = {"working_dir": dirpath}
-            if cfg.requirements:
-                runtime_env["pip"] = cfg.requirements
-
-            # 1. Submit Job via the Ray Job Submission API
-            try:
-                job_id: str = client.submit_job(
-                    submission_id=cfg.app_id,
-                    # we will pack, hash, zip, upload, register working_dir in GCS of ray cluster
-                    # and use it to configure your job execution.
-                    entrypoint="python3 ray_driver.py",
-                    runtime_env=runtime_env,
-                )
 
-            finally:
-                if dirpath.startswith(tempfile.gettempdir()):
-                    rmtree(dirpath)
+        # 0. Create Job Client
+        client = self._get_ray_client(job_submission_netloc=job_submission_addr)
+
+        # 1. Copy Ray driver utilities
+        current_directory = os.path.dirname(os.path.abspath(__file__))
+        copy2(os.path.join(current_directory, "ray", "ray_driver.py"), dirpath)
+        copy2(os.path.join(current_directory, "ray", "ray_common.py"), dirpath)
+        runtime_env = {"working_dir": dirpath}
+        if cfg.requirements:
+            runtime_env["pip"] = cfg.requirements
+
+        # 1. Submit Job via the Ray Job Submission API
+        try:
+            job_id: str = client.submit_job(
+                submission_id=cfg.app_id,
+                # we will pack, hash, zip, upload, register working_dir in GCS of ray cluster
+                # and use it to configure your job execution.
+                entrypoint="python3 ray_driver.py",
+                runtime_env=runtime_env,
+            )
 
-            # Encode job submission client in job_id
-            return f"{job_submission_addr}-{job_id}"
+        finally:
+            if dirpath.startswith(tempfile.gettempdir()):
+                rmtree(dirpath)
 
-        def _submit_dryrun(self, app: AppDef, cfg: RayOpts) -> AppDryRunInfo[RayJob]:
-            app_id = make_unique(app.name)
+        # Encode job submission client in job_id
+        return f"{job_submission_addr}-{job_id}"
 
-            working_dir = app.roles[0].image
-            if not os.path.exists(working_dir):
-                raise RuntimeError(
-                    f"Role image must be a valid directory, got: {working_dir} "
-                )
+    def _submit_dryrun(self, app: AppDef, cfg: RayOpts) -> AppDryRunInfo[RayJob]:
+        app_id = make_unique(app.name)
 
-            requirements: Optional[str] = cfg.get("requirements")
-            if requirements is None:
-                workspace_reqs = os.path.join(working_dir, "requirements.txt")
-                if os.path.exists(workspace_reqs):
-                    requirements = workspace_reqs
-
-            cluster_cfg = cfg.get("cluster_config_file")
-            if cluster_cfg:
-                if not isinstance(cluster_cfg, str) or not os.path.isfile(cluster_cfg):
-                    raise ValueError(
-                        "The cluster configuration file must be a YAML file."
-                    )
+        working_dir = app.roles[0].image
+        if not os.path.exists(working_dir):
+            raise RuntimeError(
+                f"Role image must be a valid directory, got: {working_dir} "
+            )
 
-                job: RayJob = RayJob(
-                    app_id,
-                    cluster_config_file=cluster_cfg,
-                    requirements=requirements,
-                    working_dir=working_dir,
-                )
+        requirements: Optional[str] = cfg.get("requirements")
+        if requirements is None:
+            workspace_reqs = os.path.join(working_dir, "requirements.txt")
+            if os.path.exists(workspace_reqs):
+                requirements = workspace_reqs
+
+        cluster_cfg = cfg.get("cluster_config_file")
+        if cluster_cfg:
+            if not isinstance(cluster_cfg, str) or not os.path.isfile(cluster_cfg):
+                raise ValueError("The cluster configuration file must be a YAML file.")
+
+            job: RayJob = RayJob(
+                app_id,
+                cluster_config_file=cluster_cfg,
+                requirements=requirements,
+                working_dir=working_dir,
+            )
 
-            else:  # pragma: no cover
-                dashboard_address = cfg.get("dashboard_address")
-                job: RayJob = RayJob(
+        else:  # pragma: no cover
+            dashboard_address = cfg.get("dashboard_address")
+            job: RayJob = RayJob(
+                app_id=app_id,
+                dashboard_address=dashboard_address,
+                requirements=requirements,
+                working_dir=working_dir,
+            )
+        job.cluster_name = cfg.get("cluster_name")
+
+        for role in app.roles:
+            for replica_id in range(role.num_replicas):
+                # Replace the ${img_root}, ${app_id}, and ${replica_id} placeholders
+                # in arguments and environment variables.
+                replica_role = macros.Values(
+                    img_root=role.image,
                     app_id=app_id,
-                    dashboard_address=dashboard_address,
-                    requirements=requirements,
-                    working_dir=working_dir,
+                    replica_id=str(replica_id),
+                    rank0_env=TORCHX_RANK0_HOST,
+                ).apply(role)
+
+                actor = RayActor(
+                    name=role.name,
+                    min_replicas=role.min_replicas,
+                    command=[replica_role.entrypoint] + replica_role.args,
+                    env=replica_role.env,
+                    num_cpus=max(1, replica_role.resource.cpu),
+                    num_gpus=max(0, replica_role.resource.gpu),
                 )
-            job.cluster_name = cfg.get("cluster_name")
-
-            for role in app.roles:
-                for replica_id in range(role.num_replicas):
-                    # Replace the ${img_root}, ${app_id}, and ${replica_id} placeholders
-                    # in arguments and environment variables.
-                    replica_role = macros.Values(
-                        img_root=role.image,
-                        app_id=app_id,
-                        replica_id=str(replica_id),
-                        rank0_env=TORCHX_RANK0_HOST,
-                    ).apply(role)
-
-                    actor = RayActor(
-                        name=role.name,
-                        min_replicas=role.min_replicas,
-                        command=[replica_role.entrypoint] + replica_role.args,
-                        env=replica_role.env,
-                        num_cpus=max(1, replica_role.resource.cpu),
-                        num_gpus=max(0, replica_role.resource.gpu),
-                    )
 
-                    job.actors.append(actor)
+                job.actors.append(actor)
 
-            if len(app.roles) > 1 and app.roles[0].min_replicas is not None:
-                raise ValueError("min_replicas is only supported with single role jobs")
+        if len(app.roles) > 1 and app.roles[0].min_replicas is not None:
+            raise ValueError("min_replicas is only supported with single role jobs")
 
-            return AppDryRunInfo(job, repr)
+        return AppDryRunInfo(job, repr)
 
-        def _validate(self, app: AppDef, scheduler: str, cfg: RayOpts) -> None:
-            if scheduler != "ray":
-                raise ValueError(
-                    f"An unknown scheduler backend '{scheduler}' has been passed to the Ray scheduler."
+    def _validate(self, app: AppDef, scheduler: str, cfg: RayOpts) -> None:
+        if scheduler != "ray":
+            raise ValueError(
+                f"An unknown scheduler backend '{scheduler}' has been passed to the Ray scheduler."
+            )
+
+        if app.metadata:
+            _logger.warning("The Ray scheduler does not use metadata information.")
+
+        for role in app.roles:
+            if role.resource.capabilities:
+                _logger.warning(
+                    "The Ray scheduler does not support custom resource capabilities."
                 )
+                break
 
-            if app.metadata:
-                _logger.warning("The Ray scheduler does not use metadata information.")
+        for role in app.roles:
+            if role.port_map:
+                _logger.warning("The Ray scheduler does not support port mapping.")
+                break
 
-            for role in app.roles:
-                if role.resource.capabilities:
-                    _logger.warning(
-                        "The Ray scheduler does not support custom resource capabilities."
-                    )
-                    break
-
-            for role in app.roles:
-                if role.port_map:
-                    _logger.warning("The Ray scheduler does not support port mapping.")
-                    break
-
-        def wait_until_finish(self, app_id: str, timeout: int = 30) -> None:
-            """
-            ``wait_until_finish`` waits until the specified job has finished
-            with a given timeout. This is intended for testing. Programmatic
-            usage should use the runner wait method instead.
-            """
-
-            start = time.time()
-            while time.time() - start <= timeout:
-                status_info = self._get_job_status(app_id)
-                status = status_info
-                if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
-                    break
-                time.sleep(1)
-
-        def _parse_app_id(self, app_id: str) -> Tuple[str, str]:
-            # find index of '-' in the first :\d+-
-            m = re.search(r":\d+-", app_id)
-            if m:
-                sep = m.span()[1]
-                addr = app_id[: sep - 1]
-                app_id = app_id[sep:]
-                return addr, app_id
-
-            addr, _, app_id = app_id.partition("-")
+    def wait_until_finish(self, app_id: str, timeout: int = 30) -> None:
+        """
+        ``wait_until_finish`` waits until the specified job has finished
+        with a given timeout. This is intended for testing. Programmatic
+        usage should use the runner wait method instead.
+        """
+
+        start = time.time()
+        while time.time() - start <= timeout:
+            status_info = self._get_job_status(app_id)
+            status = status_info
+            if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
+                break
+            time.sleep(1)
+
+    def _parse_app_id(self, app_id: str) -> Tuple[str, str]:
+        # find index of '-' in the first :\d+-
+        m = re.search(r":\d+-", app_id)
+        if m:
+            sep = m.span()[1]
+            addr = app_id[: sep - 1]
+            app_id = app_id[sep:]
             return addr, app_id
 
-        def _cancel_existing(self, app_id: str) -> None:  # pragma: no cover
-            addr, app_id = self._parse_app_id(app_id)
-            client = self._get_ray_client(job_submission_netloc=addr)
-            client.stop_job(app_id)
-
-        def _get_job_status(self, app_id: str) -> JobStatus:
-            addr, app_id = self._parse_app_id(app_id)
-            client = self._get_ray_client(job_submission_netloc=addr)
-            status = client.get_job_status(app_id)
-            if isinstance(status, str):
-                return cast(JobStatus, status)
-            return status.status
-
-        def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
-            job_status_info = self._get_job_status(app_id)
-            state = _ray_status_to_torchx_appstate[job_status_info]
-            roles = [Role(name="ray", num_replicas=1, image="<N/A>")]
-
-            # get ip_address and put it in hostname
-
-            roles_statuses = [
-                RoleStatus(
-                    role="ray",
-                    replicas=[
-                        ReplicaStatus(
-                            id=0,
-                            role="ray",
-                            hostname=NONE,
-                            state=state,
-                        )
-                    ],
-                )
-            ]
-            return DescribeAppResponse(
-                app_id=app_id,
-                state=state,
-                msg=job_status_info,
-                roles_statuses=roles_statuses,
-                roles=roles,
+    def _cancel_existing(self, app_id: str) -> None:  # pragma: no cover
+        addr, app_id = self._parse_app_id(app_id)
+        client = self._get_ray_client(job_submission_netloc=addr)
+        client.stop_job(app_id)
+
+    def _get_job_status(self, app_id: str) -> JobStatus:
+        addr, app_id = self._parse_app_id(app_id)
+        client = self._get_ray_client(job_submission_netloc=addr)
+        status = client.get_job_status(app_id)
+        if isinstance(status, str):
+            return cast(JobStatus, status)
+        return status.status
+
+    def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        job_status_info = self._get_job_status(app_id)
+        state = _ray_status_to_torchx_appstate[job_status_info]
+        roles = [Role(name="ray", num_replicas=1, image="<N/A>")]
+
+        # get ip_address and put it in hostname
+
+        roles_statuses = [
+            RoleStatus(
+                role="ray",
+                replicas=[
+                    ReplicaStatus(
+                        id=0,
+                        role="ray",
+                        hostname=NONE,
+                        state=state,
+                    )
+                ],
             )
+        ]
+        return DescribeAppResponse(
+            app_id=app_id,
+            state=state,
+            msg=job_status_info,
+            roles_statuses=roles_statuses,
+            roles=roles,
+        )
 
-        def log_iter(
-            self,
-            app_id: str,
-            role_name: Optional[str] = None,
-            k: int = 0,
-            regex: Optional[str] = None,
-            since: Optional[datetime] = None,
-            until: Optional[datetime] = None,
-            should_tail: bool = False,
-            streams: Optional[Stream] = None,
-        ) -> Iterable[str]:
-            # TODO: support tailing, streams etc..
-            addr, app_id = self._parse_app_id(app_id)
-            client: JobSubmissionClient = self._get_ray_client(
-                job_submission_netloc=addr
+    def log_iter(
+        self,
+        app_id: str,
+        role_name: Optional[str] = None,
+        k: int = 0,
+        regex: Optional[str] = None,
+        since: Optional[datetime] = None,
+        until: Optional[datetime] = None,
+        should_tail: bool = False,
+        streams: Optional[Stream] = None,
+    ) -> Iterable[str]:
+        # TODO: support tailing, streams etc..
+        addr, app_id = self._parse_app_id(app_id)
+        client: JobSubmissionClient = self._get_ray_client(job_submission_netloc=addr)
+        logs: str = client.get_job_logs(app_id)
+        iterator = split_lines(logs)
+        if regex:
+            return filter_regex(regex, iterator)
+        return iterator
+
+    def list(self) -> List[ListAppResponse]:
+        client = self._get_ray_client()
+        jobs = client.list_jobs()
+        netloc = urllib3.util.parse_url(client.get_address()).netloc
+        return [
+            ListAppResponse(
+                app_id=f"{netloc}-{details.submission_id}",
+                state=_ray_status_to_torchx_appstate[details.status],
             )
-            logs: str = client.get_job_logs(app_id)
-            iterator = split_lines(logs)
-            if regex:
-                return filter_regex(regex, iterator)
-            return iterator
-
-        def list(self) -> List[ListAppResponse]:
-            client = self._get_ray_client()
-            jobs = client.list_jobs()
-            netloc = urllib3.util.parse_url(client.get_address()).netloc
-            return [
-                ListAppResponse(
-                    app_id=f"{netloc}-{details.submission_id}",
-                    state=_ray_status_to_torchx_appstate[details.status],
-                )
-                for details in jobs
-            ]
+            for details in jobs
+        ]
 
 
 def create_scheduler(
     session_name: str, ray_client: Optional[JobSubmissionClient] = None, **kwargs: Any
 ) -> "RayScheduler":
-    if not has_ray():  # pragma: no cover
-        raise ModuleNotFoundError(
-            "Ray is not installed in the current Python environment."
-        )
-
     return RayScheduler(session_name=session_name, ray_client=ray_client)
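
One detail worth noting in the rewritten scheduler: `schedule()` returns `f"{job_submission_addr}-{job_id}"`, and `_parse_app_id()` reverses that encoding so later calls (`describe`, `log_iter`, `_cancel_existing`) can reconnect to the right dashboard. A standalone sketch of the round trip, reusing the regex from the diff (the sample address and job ID are made up):

    import re
    from typing import Tuple


    def parse_app_id(app_id: str) -> Tuple[str, str]:
        """Split "<netloc>-<job_id>" back into its parts, mirroring _parse_app_id."""
        # The first "-" that follows a ":<port>" separates netloc from job id.
        m = re.search(r":\d+-", app_id)
        if m:
            sep = m.span()[1]
            return app_id[: sep - 1], app_id[sep:]
        # No port in the address: fall back to the first "-".
        addr, _, job_id = app_id.partition("-")
        return addr, job_id


    assert parse_app_id("127.0.0.1:8265-app-name-42") == ("127.0.0.1:8265", "app-name-42")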
torchx/specs/finder.py CHANGED
@@ -7,6 +7,7 @@
 # pyre-strict
 
 import abc
+import copy
 import importlib
 import inspect
 import logging
@@ -281,7 +282,9 @@ class CustomComponentsFinder(ComponentsFinder):
         )
 
         file_source = read_conf_file(self._filepath)
-        namespace = globals()
+        namespace = copy.copy(globals())
+        # so that __file__ used inside the component points to the correct file
+        namespace["__file__"] = os.path.abspath(self._filepath)
         exec(file_source, namespace)  # noqa: P204
         if self._function_name not in namespace:
             raise ComponentNotFoundException(
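
The finder.py change is small but meaningful: executing a component file against a copy of `globals()` keeps the `exec` from mutating the finder's own module namespace, and rebinding `__file__` lets components resolve paths relative to their own file. A minimal illustration of the pattern (the `exec_component` helper is hypothetical, not a torchx API):

    import copy
    import os


    def exec_component(filepath: str) -> dict:
        # Hypothetical helper mirroring CustomComponentsFinder's approach.
        with open(filepath) as f:
            file_source = f.read()
        namespace = copy.copy(globals())  # shallow copy: exec can't clobber this module
        # so that __file__ used inside the component points to the correct file
        namespace["__file__"] = os.path.abspath(filepath)
        exec(file_source, namespace)  # noqa: P204
        return namespace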
torchx_nightly-2025.4.29.dist-info/METADATA → torchx_nightly-2025.5.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: torchx-nightly
-Version: 2025.4.29
+Version: 2025.5.1
 Summary: TorchX SDK and Components
 Home-page: https://github.com/pytorch/torchx
 Author: TorchX Devs
torchx_nightly-2025.4.29.dist-info/RECORD → torchx_nightly-2025.5.1.dist-info/RECORD CHANGED
@@ -76,7 +76,7 @@ torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=-NHxKAW9bGnQ-4hpFhciZTlFJr
 torchx/schedulers/kubernetes_scheduler.py,sha256=7AR3ccfta0NXqahxz9LVrv-vkdZnYTAHzw-sh_aLNDs,28242
 torchx/schedulers/local_scheduler.py,sha256=JMSGAO9RXeUiEz8BOTA_EnHDOd065oJ_tyV1E__m3OQ,41882
 torchx/schedulers/lsf_scheduler.py,sha256=e6BmJC6dNNNzzwATgJu5Sq4HxAPw_hI3EJFRojzAMlE,17690
-torchx/schedulers/ray_scheduler.py,sha256=0uEuIqsO0QyaDxTRxaVuXmsA3cEKeSXgUSfVzIPJKo0,17507
+torchx/schedulers/ray_scheduler.py,sha256=9Sqesw3aOw_Z0gua2TY3aYE3OJ9MCi75hqVl_RUQwQY,15750
 torchx/schedulers/slurm_scheduler.py,sha256=RC1ze2w0oaoQDLgercW7yHz1rGv5FVB6em4HYbLmQRg,19434
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
 torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
@@ -86,7 +86,7 @@ torchx/specs/__init__.py,sha256=c2ALDbqHIhNBhrYxwXXURRwu1Rg5jcwukWF8emEO1Bk,6347
 torchx/specs/api.py,sha256=jtasrQUy_6-AmZxsfZ_6J-kfUGKYsO5cVsrCP3imZ-I,38844
 torchx/specs/builders.py,sha256=f5Yy8KoL2OgPUiqJRkZ4E6lboq5Srkh5mD17F0EBdeg,10506
 torchx/specs/file_linter.py,sha256=QCwob5STTBuy8RsxaevTI-Dk6R8siDJn81LyaOwazes,12333
-torchx/specs/finder.py,sha256=OCgGknz9fpVQfV51l5FEsyCjGmMA0pd1otnZXPP4puw,17280
+torchx/specs/finder.py,sha256=GseAruZBuTdQHWhnxqjE0SsyfCDxzg00qK73k-b47NA,17447
 torchx/specs/named_resources_aws.py,sha256=ISjHtifRJqB8u7PeAMiyLyO_S0WCaZiK-CFF3qe6JDU,11415
 torchx/specs/named_resources_generic.py,sha256=Sg4tAdqiiWDrDz2Lj_pnfsjzGIXKTou73wPseh6j55w,2646
 torchx/specs/test/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
@@ -115,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=PtDkGTC5lX03pRoYpuMz2KCmM1ZOycRP1UknqvNb97Y,6341
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.4.29.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
-torchx_nightly-2025.4.29.dist-info/METADATA,sha256=3K05QW-mY6w5tV9AAqnL1wV5NPWrmpH6oScR8IOfadE,6167
-torchx_nightly-2025.4.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-torchx_nightly-2025.4.29.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
-torchx_nightly-2025.4.29.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
-torchx_nightly-2025.4.29.dist-info/RECORD,,
+torchx_nightly-2025.5.1.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.5.1.dist-info/METADATA,sha256=WiS59n2Mm_YFbAEEWQ30PEiGEieyKsq0NuvwJZ5ghl8,6166
+torchx_nightly-2025.5.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+torchx_nightly-2025.5.1.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.5.1.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.5.1.dist-info/RECORD,,