torchx-nightly 2025.10.16__py3-none-any.whl → 2025.11.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchx-nightly might be problematic.
- torchx/_version.py +8 -0
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/main.py +2 -0
- torchx/runner/api.py +35 -33
- torchx/schedulers/api.py +47 -2
- torchx/schedulers/kubernetes_scheduler.py +233 -16
- torchx/specs/__init__.py +17 -3
- torchx/specs/api.py +79 -40
- torchx/version.py +2 -2
- torchx/workspace/api.py +63 -42
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info}/METADATA +21 -8
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info}/RECORD +16 -14
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info}/WHEEL +1 -1
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info}/top_level.txt +0 -0
torchx/_version.py
ADDED
torchx/cli/cmd_delete.py
ADDED
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import argparse
+import logging
+
+from torchx.cli.cmd_base import SubCommand
+from torchx.runner import get_runner
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class CmdDelete(SubCommand):
+    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+        subparser.add_argument(
+            "app_handle",
+            type=str,
+            help="torchx app handle (e.g. local://session-name/app-id)",
+        )
+
+    def run(self, args: argparse.Namespace) -> None:
+        app_handle = args.app_handle
+        runner = get_runner()
+        runner.delete(app_handle)
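The new subcommand is a thin wrapper over the runner API; a minimal sketch of the programmatic equivalent (the handle below is an illustrative placeholder, not a real job):

    # Sketch: what `torchx delete <app_handle>` does under the hood.
    from torchx.runner import get_runner

    runner = get_runner()
    # handle format is <scheduler>://<session-name>/<app-id>; this value is made up
    runner.delete("kubernetes://torchx/default:trainer-abc123")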
torchx/cli/main.py
CHANGED
@@ -16,6 +16,7 @@ import torchx
 from torchx.cli.cmd_base import SubCommand
 from torchx.cli.cmd_cancel import CmdCancel
 from torchx.cli.cmd_configure import CmdConfigure
+from torchx.cli.cmd_delete import CmdDelete
 from torchx.cli.cmd_describe import CmdDescribe
 from torchx.cli.cmd_list import CmdList
 from torchx.cli.cmd_log import CmdLog
@@ -37,6 +38,7 @@ def get_default_sub_cmds() -> Dict[str, SubCommand]:
         "builtins": CmdBuiltins(),
         "cancel": CmdCancel(),
         "configure": CmdConfigure(),
+        "delete": CmdDelete(),
         "describe": CmdDescribe(),
         "list": CmdList(),
         "log": CmdLog(),
torchx/runner/api.py
CHANGED
@@ -420,52 +420,44 @@ class Runner:
             scheduler,
             runcfg=json.dumps(cfg) if cfg else None,
             workspace=str(workspace),
-        ):
+        ) as ctx:
             sched = self._scheduler(scheduler)
             resolved_cfg = sched.run_opts().resolve(cfg)

             sched._pre_build_validate(app, scheduler, resolved_cfg)

             if isinstance(sched, WorkspaceMixin):
-
-
-
-
-
-                # later, torchx added support for the workspace attr in Role
-                # for BC, give precedence to the workspace argument over the workspace attr for role[0]
-                if role_workspace:
-                    logger.info(
-                        f"Using workspace={workspace} over role[{i}].workspace={role_workspace} for role[{i}]={role.name}."
-                        " To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically."  # noqa: B950
-                    )
-                    role_workspace = workspace
-
-                if role_workspace:
-                    old_img = role.image
+                if workspace:
+                    # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
+                    # later, torchx added support for the workspace attr in Role
+                    # for BC, give precedence to the workspace argument over the workspace attr for role[0]
+                    if app.roles[0].workspace:
                         logger.info(
-
-
-
-
-
-                        role, role_workspace, resolved_cfg
+                            "Overriding role[%d] (%s) workspace to `%s`"
+                            "To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
+                            0,
+                            role.name,
+                            str(app.roles[0].workspace),
                         )
+                    app.roles[0].workspace = (
+                        Workspace.from_str(workspace)
+                        if isinstance(workspace, str)
+                        else workspace
+                    )

-
-                    logger.info(
-                        f"Built new image `{role.image}` based on original image `{old_img}`"
-                        f" and changes in workspace `{role_workspace}` for role[{i}]={role.name}."
-                    )
-                else:
-                    logger.info(
-                        f"Reusing original image `{old_img}` for role[{i}]={role.name}."
-                        " Either a patch was built or no changes to workspace was detected."
-                    )
+                sched.build_workspaces(app.roles, resolved_cfg)

             sched._validate(app, scheduler, resolved_cfg)
             dryrun_info = sched.submit_dryrun(app, resolved_cfg)
             dryrun_info._scheduler = scheduler
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.app_id = app.name
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
             return dryrun_info

     def scheduler_run_opts(self, scheduler: str) -> runopts:
@@ -595,6 +587,16 @@ class Runner:
         if status is not None and not status.is_terminal():
             scheduler.cancel(app_id)

+    def delete(self, app_handle: AppHandle) -> None:
+        """
+        Deletes the application from the scheduler.
+        """
+        scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
+        with log_event("delete", scheduler_backend, app_id):
+            status = self.status(app_handle)
+            if status is not None:
+                scheduler.delete(app_id)
+
     def stop(self, app_handle: AppHandle) -> None:
         """
         See method ``cancel``.
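The backwards-compatibility rule in the hunk above is easier to see with plain values; a minimal self-contained sketch (the helper name is invented for illustration):

    # Sketch: an explicit `workspace` argument overrides role[0].workspace;
    # passing --workspace='' (CLI) or workspace=None (API) keeps the role's own attr.
    def effective_workspace(workspace_arg, role0_workspace):
        if workspace_arg:
            return workspace_arg
        return role0_workspace

    assert effective_workspace("~/my_proj", "~/role_ws") == "~/my_proj"
    assert effective_workspace(None, "~/role_ws") == "~/role_ws"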
torchx/schedulers/api.py
CHANGED
@@ -131,7 +131,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         self,
         app: A,
         cfg: T,
-        workspace:
+        workspace: str | Workspace | None = None,
     ) -> str:
         """
         Submits the application to be run by the scheduler.
@@ -145,7 +145,12 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         resolved_cfg = self.run_opts().resolve(cfg)
         if workspace:
             assert isinstance(self, WorkspaceMixin)
-
+
+            if isinstance(workspace, str):
+                workspace = Workspace.from_str(workspace)
+
+            app.roles[0].workspace = workspace
+            self.build_workspaces(app.roles, resolved_cfg)

         # pyre-fixme: submit_dryrun takes Generic type for resolved_cfg
         dryrun_info = self.submit_dryrun(app, resolved_cfg)
@@ -259,6 +264,46 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
             # do nothing if the app does not exist
             return

+    def delete(self, app_id: str) -> None:
+        """
+        Deletes the job information for the specified ``app_id`` from the
+        scheduler's data-plane, effectively "deep-purging" the job.
+        Calling this API on a "live" job (e.g. one in a
+        non-terminal status such as PENDING or RUNNING) cancels the job.
+
+        Note that this API is only relevant for schedulers whose
+        data-plane persistently stores the "JobDefinition" (which is often
+        versioned). AWS Batch and Kubernetes are examples of such schedulers.
+        On these schedulers, a finished job may fall out of the data-plane
+        (e.g. really old finished jobs get deleted) but the JobDefinition is
+        typically permanently stored. In this case, calling
+        :py:meth:`~cancel` would not delete the job definition.
+
+        In schedulers with no such feature (e.g. SLURM)
+        :py:meth:`~delete` is the same as :py:meth:`~cancel`, which is the
+        default implementation. Hence implementors of such schedulers need not
+        override this method.
+
+        .. warning::
+            Calling :py:meth:`~delete` on an ``app_id`` that has fallen out of
+            the scheduler's data-plane does nothing. The user is responsible for
+            manually tracking down and cleaning up any dangling resources related
+            to the job.
+        """
+        if self.exists(app_id):
+            self._delete_existing(app_id)
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Deletes the job information for the specified ``app_id`` from the
+        scheduler's data-plane. This method will only be called on an
+        application that exists.
+
+        The default implementation calls :py:meth:`~_cancel_existing` which is
+        appropriate for schedulers without persistent job definitions.
+        """
+        self._cancel_existing(app_id)
+
     def log_iter(
         self,
         app_id: str,
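A hedged sketch of the contract described above, with a scheduler that has persistent job definitions overriding `_delete_existing` (all names below are invented stand-ins, not torchx classes):

    # Sketch: how a scheduler with a persistent data-plane hooks into delete().
    class FakeClient:
        def __init__(self) -> None:
            self.job_defs = {"job-1": {"revision": 1}}

        def deregister(self, app_id: str) -> None:
            self.job_defs.pop(app_id, None)

    class FakeBatchScheduler:
        def __init__(self, client: FakeClient) -> None:
            self.client = client

        def exists(self, app_id: str) -> bool:
            return app_id in self.client.job_defs

        def delete(self, app_id: str) -> None:
            # mirrors Scheduler.delete: no-op once the job fell out of the data-plane
            if self.exists(app_id):
                self._delete_existing(app_id)

        def _delete_existing(self, app_id: str) -> None:
            # unlike cancel(), this purges the stored job definition
            self.client.deregister(app_id)

    sched = FakeBatchScheduler(FakeClient())
    sched.delete("job-1")
    assert not sched.exists("job-1")
    sched.delete("job-1")  # idempotent: nothing left to do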
torchx/schedulers/kubernetes_scheduler.py
CHANGED
@@ -27,10 +27,81 @@ Install Volcano:
 See the
 `Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
 for more information.
+
+Pod Overlay
+===========
+
+You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+the ``kubernetes`` metadata on your role. The value can be:
+
+- A dict with the overlay structure
+- A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+Merge semantics:
+- **dict**: recursive merge (upsert)
+- **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+- **primitives**: replace
+
+.. code:: python
+
+    from torchx.specs import Role
+
+    # Dict overlay - lists append, tuples replace
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": {
+                "spec": {
+                    "nodeSelector": {"gpu": "true"},
+                    "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}],  # appends
+                    "volumes": ({"name": "my-volume", "emptyDir": {}},)  # replaces
+                }
+            }
+        }
+    )
+
+    # File URI overlay
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": "file:///path/to/pod_overlay.yaml"
+        }
+    )
+
+CLI usage with builtin components:
+
+.. code:: bash
+
+    $ torchx run --scheduler kubernetes dist.ddp \\
+        --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+        --script train.py
+
+Example ``pod_overlay.yaml``:
+
+.. code:: yaml
+
+    spec:
+      nodeSelector:
+        node.kubernetes.io/instance-type: p4d.24xlarge
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      volumes: !!python/tuple
+        - name: my-volume
+          emptyDir: {}
+
+The overlay is deep-merged with the generated pod, preserving existing fields
+and adding or overriding specified ones.
 """

 import json
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
@@ -45,6 +116,7 @@ from typing import (
     Tuple,
     TYPE_CHECKING,
     TypedDict,
+    Union,
 )

 import torchx
@@ -97,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
 RESERVED_MILLICPU = 100
 RESERVED_MEMMB = 1024

+
+def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+    """Apply overlay dict to V1Pod object, merging nested fields.
+
+    Merge semantics:
+    - dict: upsert (recursive merge)
+    - list: append by default, replace if tuple
+    - primitives: replace
+    """
+    from kubernetes import client
+
+    api = client.ApiClient()
+    pod_dict = api.sanitize_for_serialization(pod)
+
+    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+        for key, value in overlay.items():
+            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+                deep_merge(base[key], value)
+            elif isinstance(value, tuple):
+                base[key] = list(value)
+            elif (
+                isinstance(value, list) and key in base and isinstance(base[key], list)
+            ):
+                base[key].extend(value)
+            else:
+                base[key] = value
+
+    deep_merge(pod_dict, overlay)
+
+    merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+    pod.spec = merged_pod.spec
+    pod.metadata = merged_pod.metadata
+
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
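The merge rules can be exercised without Kubernetes objects; a self-contained sketch of the same semantics on plain dicts:

    # Sketch: the documented merge semantics (dict upsert, list append, tuple replace)
    # applied to plain dicts, independent of the kubernetes client.
    def deep_merge(base, overlay):
        for key, value in overlay.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_merge(base[key], value)      # dict: recursive upsert
            elif isinstance(value, tuple):
                base[key] = list(value)           # tuple: replace
            elif isinstance(value, list) and isinstance(base.get(key), list):
                base[key].extend(value)           # list: append
            else:
                base[key] = value                 # primitive: replace

    pod = {"spec": {"tolerations": [{"key": "a"}], "volumes": [{"name": "old"}]}}
    deep_merge(pod, {"spec": {"tolerations": [{"key": "b"}], "volumes": ({"name": "new"},)}})
    assert pod["spec"]["tolerations"] == [{"key": "a"}, {"key": "b"}]  # appended
    assert pod["spec"]["volumes"] == [{"name": "new"}]                 # replaced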
@@ -369,7 +475,7 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
-) -> Dict[str,
+) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
     the provided AppDef. The resource definition can be used to launch the
@@ -402,6 +508,17 @@ def app_to_resource(
         replica_role.env["TORCHX_IMAGE"] = replica_role.image

         pod = role_to_pod(name, replica_role, service_account)
+        if k8s_metadata := role.metadata.get("kubernetes"):
+            if isinstance(k8s_metadata, str):
+                import fsspec
+
+                with fsspec.open(k8s_metadata, "r") as f:
+                    k8s_metadata = yaml.unsafe_load(f)
+            elif not isinstance(k8s_metadata, dict):
+                raise ValueError(
+                    f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+                )
+            _apply_pod_overlay(pod, k8s_metadata)
         pod.metadata.labels.update(
             pod_labels(
                 app=app,
@@ -444,7 +561,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
     if priority_class is not None:
         job_spec["priorityClassName"] = priority_class

-    resource: Dict[str,
+    resource: Dict[str, Any] = {
         "apiVersion": "batch.volcano.sh/v1alpha1",
         "kind": "Job",
         "metadata": {"name": f"{unique_app_id}"},
@@ -456,7 +573,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
 @dataclass
 class KubernetesJob:
     images_to_push: Dict[str, Tuple[str, str]]
-    resource: Dict[str,
+    resource: Dict[str, Any]

     def __str__(self) -> str:
         return yaml.dump(sanitize_for_serialization(self.resource))
@@ -471,6 +588,7 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    validate_spec: Optional[bool]


 class KubernetesScheduler(
@@ -504,6 +622,16 @@ class KubernetesScheduler(
         $ torchx status kubernetes://torchx_user/1234
         ...

+    **Cancellation**
+
+    Canceling a job aborts it while preserving the job spec for inspection
+    and cloning via kubectl apply. Use the delete command to remove the job entirely:
+
+    .. code-block:: bash
+
+        $ torchx cancel kubernetes://namespace/jobname  # abort, preserves spec
+        $ torchx delete kubernetes://namespace/jobname  # delete completely
+
     **Config Options**

     .. runopts::
@@ -636,7 +764,7 @@ class KubernetesScheduler(
         else:
             raise

-        return f
+        return f"{namespace}:{resp['metadata']['name']}"

     def _submit_dryrun(
         self, app: AppDef, cfg: KubernetesOpts
@@ -659,6 +787,36 @@ class KubernetesScheduler(
         ), "priority_class must be a str"

         resource = app_to_resource(app, queue, service_account, priority_class)
+
+        if cfg.get("validate_spec"):
+            try:
+                self._custom_objects_api().create_namespaced_custom_object(
+                    group="batch.volcano.sh",
+                    version="v1alpha1",
+                    namespace=cfg.get("namespace") or "default",
+                    plural="jobs",
+                    body=resource,
+                    dry_run="All",
+                )
+            except Exception as e:
+                from kubernetes.client.rest import ApiException
+
+                if isinstance(e, ApiException):
+                    raise ValueError(f"Invalid job spec: {e.reason}") from e
+                raise
+
+        job_name = resource["metadata"]["name"]
+        for task in resource["spec"]["tasks"]:
+            task_name = task["name"]
+            replicas = task.get("replicas", 1)
+            max_index = replicas - 1
+            pod_name = f"{job_name}-{task_name}-{max_index}"
+            if len(pod_name) > 63:
+                raise ValueError(
+                    f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+                    f"Shorten app.name or role names"
+                )
+
         req = KubernetesJob(
             resource=resource,
             images_to_push=images_to_push,
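A quick sketch of the length arithmetic behind that check (all names below are made up):

    # Sketch: pre-checking the Kubernetes pod name length the same way _submit_dryrun does.
    job_name = "my-very-long-experiment-name-abc123"  # unique_app_id
    task_name = "parameter-server"                    # role name
    max_index = 31                                    # replicas - 1
    pod_name = f"{job_name}-{task_name}-{max_index}"
    print(len(pod_name))  # 55 here; anything over 63 is rejected before submission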
@@ -670,6 +828,31 @@ class KubernetesScheduler(
         pass

     def _cancel_existing(self, app_id: str) -> None:
+        """
+        Abort a Volcano job while preserving the spec for inspection.
+        """
+        namespace, name = app_id.split(":")
+        vcjob = self._custom_objects_api().get_namespaced_custom_object(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+        )
+        vcjob["status"]["state"]["phase"] = "Aborted"
+        self._custom_objects_api().replace_namespaced_custom_object_status(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+            body=vcjob,
+        )
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Delete a Volcano job completely from the cluster.
+        """
         namespace, name = app_id.split(":")
         self._custom_objects_api().delete_namespaced_custom_object(
             group="batch.volcano.sh",
@@ -703,19 +886,32 @@ class KubernetesScheduler(
             type_=str,
             help="The name of the PriorityClass to set on the job specs",
         )
+        opts.add(
+            "validate_spec",
+            type_=bool,
+            help="Validate job spec using Kubernetes API dry-run before submission",
+            default=True,
+        )
         return opts

     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes.client.rest import ApiException
+
         namespace, name = app_id.split(":")
         roles = {}
         roles_statuses = {}
-
-
-
-
-
-
-
+        try:
+            resp = self._custom_objects_api().get_namespaced_custom_object_status(
+                group="batch.volcano.sh",
+                version="v1alpha1",
+                namespace=namespace,
+                plural="jobs",
+                name=name,
+            )
+        except ApiException as e:
+            if e.status == 404:
+                return None
+            raise
         status = resp.get("status")
         if status:
             state_str = status["state"]["phase"]
@@ -824,13 +1020,34 @@ def create_scheduler(
 def pod_labels(
     app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
 ) -> Dict[str, str]:
+
+    def clean(label_value: str) -> str:
+        # cleans the provided `label_value` to make it compliant
+        # to pod label specs as described in
+        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+        #
+        # Valid label value:
+        # must be 63 characters or less (can be empty),
+        # unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+        # could contain dashes (-), underscores (_), dots (.), and alphanumerics between.

+        # Replace invalid characters (allow: alphanum, -, _, .) with "."
+        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+        # Replace leading non-alphanumeric with "."
+        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+        # Replace trailing non-alphanumeric with "."
+        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+        # Trim to 63 characters
+        return label_value[:63]
+
     return {
-        LABEL_VERSION: torchx.__version__,
-        LABEL_APP_NAME: app.name,
+        LABEL_VERSION: clean(torchx.__version__),
+        LABEL_APP_NAME: clean(app.name),
         LABEL_ROLE_INDEX: str(role_idx),
-        LABEL_ROLE_NAME: role.name,
+        LABEL_ROLE_NAME: clean(role.name),
         LABEL_REPLICA_ID: str(replica_id),
-        LABEL_KUBE_APP_NAME: app.name,
+        LABEL_KUBE_APP_NAME: clean(app.name),
         LABEL_ORGANIZATION: "torchx.pytorch.org",
-        LABEL_UNIQUE_NAME: app_id,
+        LABEL_UNIQUE_NAME: clean(app_id),
     }
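A standalone sketch of the `clean` helper on concrete inputs (sample values invented):

    import re

    def clean(label_value: str) -> str:
        # same three-step sanitization as pod_labels.clean above
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
        return label_value[:63]

    print(clean("2025.11.20+cu121"))  # "2025.11.20.cu121" ("+" is not a valid label char)
    print(clean("my app/v2"))         # "my.app.v2"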
torchx/specs/__init__.py
CHANGED
@@ -14,7 +14,7 @@ scheduler or pipeline adapter.
 import difflib

 import os
-from typing import Callable, Dict, Mapping, Optional
+from typing import Callable, Dict, Iterator, Mapping, Optional

 from torchx.specs.api import (
     ALL,
@@ -113,8 +113,22 @@ class _NamedResourcesLibrary:
     def __contains__(self, key: str) -> bool:
         return key in _named_resource_factories

-    def __iter__(self) ->
-
+    def __iter__(self) -> Iterator[str]:
+        """Iterates through the names of the registered named_resources.
+
+        Usage:
+
+        .. doctest::
+
+            from torchx import specs
+
+            for resource_name in specs.named_resources:
+                resource = specs.resource(h=resource_name)
+                assert isinstance(resource, specs.Resource)
+
+        """
+        for key in _named_resource_factories:
+            yield (key)


 named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
torchx/specs/api.py
CHANGED
@@ -14,10 +14,12 @@ import logging as logger
 import os
 import pathlib
 import re
+import shutil
 import typing
+import warnings
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
-from enum import Enum
+from enum import Enum, IntEnum
 from json import JSONDecodeError
 from string import Template
 from typing import (
@@ -380,6 +382,16 @@ class Workspace:
         """False if no projects mapping. Lets us use workspace object in an if-statement"""
         return bool(self.projects)

+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Workspace):
+            return False
+        return self.projects == other.projects
+
+    def __hash__(self) -> int:
+        # makes it possible to use Workspace as the key in the workspace build cache
+        # see WorkspaceMixin.caching_build_workspace_and_update_role
+        return hash(frozenset(self.projects.items()))
+
     def is_unmapped_single_project(self) -> bool:
         """
         Returns ``True`` if this workspace only has 1 project
@@ -387,6 +399,39 @@ class Workspace:
         """
         return len(self.projects) == 1 and not next(iter(self.projects.values()))

+    def merge_into(self, outdir: str | pathlib.Path) -> None:
+        """
+        Copies each project dir of this workspace into the specified ``outdir``.
+        Each project dir is copied into ``{outdir}/{target}`` where ``target`` is
+        the target mapping of the project dir.
+
+        For example:
+
+        .. code-block:: python
+            from os.path import expanduser
+
+            workspace = Workspace(
+                projects={
+                    expanduser("~/workspace/torch"): "torch",
+                    expanduser("~/workspace/my_project"): "",
+                }
+            )
+            workspace.merge_into(expanduser("~/tmp"))
+
+        Copies:
+
+        * ``~/workspace/torch/**`` into ``~/tmp/torch/**``
+        * ``~/workspace/my_project/**`` into ``~/tmp/**``
+
+        """
+
+        for src, dst in self.projects.items():
+            dst_path = pathlib.Path(outdir) / dst
+            if pathlib.Path(src).is_file():
+                shutil.copy2(src, dst_path)
+            else:  # src is dir
+                shutil.copytree(src, dst_path, dirs_exist_ok=True)
+
     @staticmethod
     def from_str(workspace: str | None) -> "Workspace":
         import yaml
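Workspace's new `__eq__`/`__hash__` make it usable as a cache key; a short sketch under the assumption that `Workspace` is constructed from a `projects` mapping as shown above (paths are placeholders):

    # Sketch: equal workspaces hash equal, so build results can be cached per workspace.
    from torchx.specs import Workspace

    ws_a = Workspace(projects={"/home/me/proj": ""})
    ws_b = Workspace(projects={"/home/me/proj": ""})

    assert ws_a == ws_b
    build_cache = {ws_a: "sha256:deadbeef"}        # pointer to a previously built image
    assert build_cache[ws_b] == "sha256:deadbeef"  # a second role reuses the cached build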
@@ -891,14 +936,12 @@ class runopt:
     Represents the metadata about the specific run option
     """

-    class alias(str):
-        pass
-
     default: CfgVal
     opt_type: Type[CfgVal]
     is_required: bool
     help: str
-    aliases: list[
+    aliases: list[str] | None = None
+    deprecated_aliases: list[str] | None = None

     @property
     def is_type_list_of_str(self) -> bool:
@@ -990,7 +1033,7 @@ class runopts:

     def __init__(self) -> None:
         self._opts: Dict[str, runopt] = {}
-        self._alias_to_key: dict[
+        self._alias_to_key: dict[str, str] = {}

     def __iter__(self) -> Iterator[Tuple[str, runopt]]:
         return self._opts.items().__iter__()
@@ -1044,12 +1087,24 @@ class runopts:
         val = resolved_cfg.get(cfg_key)
         resolved_name = None
         aliases = runopt.aliases or []
+        deprecated_aliases = runopt.deprecated_aliases or []
         if val is None:
             for alias in aliases:
                 val = resolved_cfg.get(alias)
                 if alias in cfg or val is not None:
                     resolved_name = alias
                     break
+            for alias in deprecated_aliases:
+                val = resolved_cfg.get(alias)
+                if val is not None:
+                    resolved_name = alias
+                    use_instead = self._alias_to_key.get(alias)
+                    warnings.warn(
+                        f"Run option `{alias}` is deprecated, use `{use_instead}` instead",
+                        UserWarning,
+                        stacklevel=2,
+                    )
+                    break
         else:
             resolved_name = cfg_key
             for alias in aliases:
@@ -1172,49 +1227,23 @@ class runopts:
             cfg[key] = val
         return cfg

-    def _get_primary_key_and_aliases(
-        self,
-        cfg_key: list[str] | str,
-    ) -> tuple[str, list[runopt.alias]]:
-        """
-        Returns the primary key and aliases for the given cfg_key.
-        """
-        if isinstance(cfg_key, str):
-            return cfg_key, []
-
-        if len(cfg_key) == 0:
-            raise ValueError("cfg_key must be a non-empty list")
-        primary_key = None
-        aliases = list[runopt.alias]()
-        for name in cfg_key:
-            if isinstance(name, runopt.alias):
-                aliases.append(name)
-            else:
-                if primary_key is not None:
-                    raise ValueError(
-                        f" Given more than one primary key: {primary_key}, {name}. Please use runopt.alias type for aliases. "
-                    )
-                primary_key = name
-        if primary_key is None or primary_key == "":
-            raise ValueError(
-                "Missing cfg_key. Please provide one other than the aliases."
-            )
-        return primary_key, aliases
-
     def add(
         self,
-        cfg_key: str
+        cfg_key: str,
         type_: Type[CfgVal],
         help: str,
         default: CfgVal = None,
         required: bool = False,
+        aliases: Optional[list[str]] = None,
+        deprecated_aliases: Optional[list[str]] = None,
     ) -> None:
         """
         Adds the ``config`` option with the given help string and ``default``
         value (if any). If the ``default`` is not specified then this option
         is a required option.
         """
-
+        aliases = aliases or []
+        deprecated_aliases = deprecated_aliases or []
         if required and default is not None:
             raise ValueError(
                 f"Required option: {cfg_key} must not specify default value. Given: {default}"
@@ -1225,10 +1254,20 @@ class runopts:
                 f"Option: {cfg_key}, must be of type: {type_}."
                 f" Given: {default} ({type(default).__name__})"
             )
-
+
+        opt = runopt(
+            default,
+            type_,
+            required,
+            help,
+            list(set(aliases)),
+            list(set(deprecated_aliases)),
+        )
         for alias in aliases:
-            self._alias_to_key[alias] =
-
+            self._alias_to_key[alias] = cfg_key
+        for deprecated_alias in deprecated_aliases:
+            self._alias_to_key[deprecated_alias] = cfg_key
+        self._opts[cfg_key] = opt

     def update(self, other: "runopts") -> None:
         self._opts.update(other._opts)
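A hedged sketch of the new alias options in use; the option names here are invented, not real torchx run options:

    # Sketch: registering a renamed run option. Resolving cfg via the deprecated
    # name still works but emits a UserWarning pointing at the primary key.
    from torchx.specs import runopts

    opts = runopts()
    opts.add(
        "job_priority",
        type_=str,
        help="priority class for the job",
        aliases=["priority"],
        deprecated_aliases=["priorityClass"],
    )

    cfg = opts.resolve({"priorityClass": "high"})  # warns: use `job_priority` instead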
torchx/version.py
CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -7,6 +6,7 @@

 # pyre-strict

+from torchx._version import BASE_VERSION
 from torchx.util.entrypoints import load

 # Follows PEP-0440 version scheme guidelines
@@ -18,7 +18,7 @@ from torchx.util.entrypoints import load
 # 0.1.0bN  # Beta release
 # 0.1.0rcN  # Release Candidate
 # 0.1.0  # Final release
-__version__ =
+__version__: str = BASE_VERSION

 # Use the github container registry images corresponding to the current package
torchx/workspace/api.py
CHANGED
@@ -8,26 +8,17 @@

 import abc
 import fnmatch
+import logging
 import posixpath
-import shutil
 import tempfile
 import warnings
 from dataclasses import dataclass
-from
-from typing import (
-    Any,
-    Dict,
-    Generic,
-    Iterable,
-    Mapping,
-    Tuple,
-    TYPE_CHECKING,
-    TypeVar,
-    Union,
-)
+from typing import Any, Dict, Generic, Iterable, Mapping, Tuple, TYPE_CHECKING, TypeVar

 from torchx.specs import AppDef, CfgVal, Role, runopts, Workspace

+logger: logging.Logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from fsspec import AbstractFileSystem

@@ -113,45 +104,72 @@ class WorkspaceMixin(abc.ABC, Generic[T]):
         """
         return runopts()

-    def
+    def build_workspaces(self, roles: list[Role], cfg: Mapping[str, CfgVal]) -> None:
+        """
+        NOTE: this method MUTATES the passed roles!
+
+        Builds the workspaces (if any) for each role and updates the role to reflect the built workspace.
+        Typically ``role.image`` is updated with the newly built image that reflects the local workspace.
+        Some workspace implementations may add extra environment variables to make it easier for other
+        parts of the program to access the workspace. For example a ``WORKSPACE_DIR`` env var may be added
+        to ``role.env`` that scripts can use to refer to the workspace directory in the container.
+        """
+
+        build_cache: dict[object, object] = {}
+
+        for i, role in enumerate(roles):
+            if role.workspace:
+                old_img = role.image
+                self.caching_build_workspace_and_update_role(role, cfg, build_cache)
+
+                if old_img != role.image:
+                    logger.info(
+                        "role[%d]=%s updated with new image to include workspace changes",
+                        i,
+                        role.name,
+                    )
+
+    def caching_build_workspace_and_update_role(
         self,
         role: Role,
-        workspace: Union[Workspace, str],
         cfg: Mapping[str, CfgVal],
+        build_cache: dict[object, object],
     ) -> None:
         """
-        Same as :py:meth:`build_workspace_and_update_role` but
-
-
+        Same as :py:meth:`build_workspace_and_update_role` but takes
+        a ``build_cache`` that can be used to cache pointers to build artifacts
+        between building the workspace for each role.

-
+        This is useful when an appdef has multiple roles where the image and workspace
+        of the roles are the same but other attributes such as entrypoint or args are different.
+
+        NOTE: ``build_cache``'s lifetime is within :py:meth:`build_workspaces`
+        NOTE: the workspace implementation decides what to cache
+
+        Workspace subclasses should prefer implementing this method over
         :py:meth:`build_workspace_and_update_role`.

-
-
-
+        The default implementation of this method simply calls the (deprecated) non-caching
+        :py:meth:`build_workspace_and_update_role` and deals with multi-dir workspaces by
+        merging them into a single tmpdir before passing it down.

-        Subclasses can override this method to customize multi-project
-        workspace building logic.
         """
-        if isinstance(workspace, Workspace):
-            if not workspace.is_unmapped_single_project():
-                with tempfile.TemporaryDirectory(suffix="torchx_workspace_") as outdir:
-                    for src, dst in workspace.projects.items():
-                        dst_path = Path(outdir) / dst
-                        if Path(src).is_file():
-                            shutil.copy2(src, dst_path)
-                        else:  # src is dir
-                            shutil.copytree(src, dst_path, dirs_exist_ok=True)
-
-                    self.build_workspace_and_update_role(role, outdir, cfg)
-                    return
-            else:  # single project workspace with no target mapping (treat like a str workspace)
-                workspace = str(workspace)
-
-        self.build_workspace_and_update_role(role, workspace, cfg)

-
+        workspace = role.workspace
+
+        if not workspace:
+            return
+
+        if workspace.is_unmapped_single_project():
+            # single-dir workspace with no target map; no need to copy to a tmp dir
+            self.build_workspace_and_update_role(role, str(workspace), cfg)
+        else:
+            # multi-dirs or single-dir with a target map;
+            # copy all dirs to a tmp dir and treat the tmp dir as a single-dir workspace
+            with tempfile.TemporaryDirectory(suffix="torchx_workspace_") as outdir:
+                workspace.merge_into(outdir)
+                self.build_workspace_and_update_role(role, outdir, cfg)

     def build_workspace_and_update_role(
         self,
         role: Role,
@@ -159,6 +177,9 @@ class WorkspaceMixin(abc.ABC, Generic[T]):
         cfg: Mapping[str, CfgVal],
     ) -> None:
         """
+        .. note:: DEPRECATED: Workspace subclasses should implement
+            :py:meth:`caching_build_workspace_and_update_role` over this method.
+
         Builds the specified ``workspace`` with respect to ``img``
         and updates the ``role`` to reflect the built workspace artifacts.
         In the simplest case, this method builds a new image and updates
@@ -167,7 +188,7 @@ class WorkspaceMixin(abc.ABC, Generic[T]):

         Note: this method mutates the passed ``role``.
         """
-
+        raise NotImplementedError("implement `caching_build_workspace_and_update_role`")

     def dryrun_push_images(self, app: AppDef, cfg: Mapping[str, CfgVal]) -> T:
         """
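A minimal sketch of how a workspace implementation might use the cache (the class and its `_build_image` helper are invented for illustration):

    # Sketch: a WorkspaceMixin-style subclass caching one build per (image, workspace).
    class FakeImageWorkspace:
        def caching_build_workspace_and_update_role(self, role, cfg, build_cache):
            key = (role.image, role.workspace)  # Workspace is hashable (see specs/api.py)
            if key not in build_cache:
                build_cache[key] = self._build_image(role.image, role.workspace)
            role.image = build_cache[key]  # roles sharing image+workspace reuse the build

        def _build_image(self, base_image, workspace):
            # stand-in for an actual container image build
            return f"{base_image}-ws"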
{torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: torchx-nightly
-Version: 2025.
+Version: 2025.11.20
 Summary: TorchX SDK and Components
 Home-page: https://github.com/meta-pytorch/torchx
 Author: TorchX Devs
@@ -23,8 +23,10 @@ Requires-Dist: docker
 Requires-Dist: filelock
 Requires-Dist: fsspec>=2023.10.0
 Requires-Dist: tabulate
-Provides-Extra:
+Provides-Extra: aws-batch
 Requires-Dist: boto3; extra == "aws-batch"
+Provides-Extra: kubernetes
+Requires-Dist: kubernetes>=11; extra == "kubernetes"
 Provides-Extra: dev
 Requires-Dist: aiobotocore==2.20.0; extra == "dev"
 Requires-Dist: ax-platform[mysql]==0.2.3; extra == "dev"
@@ -47,18 +49,29 @@ Requires-Dist: pytorch-lightning==2.5.0; extra == "dev"
 Requires-Dist: tensorboard==2.14.0; extra == "dev"
 Requires-Dist: sagemaker==2.230.0; extra == "dev"
 Requires-Dist: torch-model-archiver>=0.4.2; extra == "dev"
-Requires-Dist: torch
+Requires-Dist: torch; extra == "dev"
 Requires-Dist: torchmetrics==1.6.3; extra == "dev"
 Requires-Dist: torchserve>=0.10.0; extra == "dev"
-Requires-Dist: torchtext
-Requires-Dist: torchvision
+Requires-Dist: torchtext; extra == "dev"
+Requires-Dist: torchvision; extra == "dev"
 Requires-Dist: typing-extensions; extra == "dev"
 Requires-Dist: ts==0.5.1; extra == "dev"
 Requires-Dist: wheel; extra == "dev"
 Requires-Dist: lintrunner; extra == "dev"
 Requires-Dist: lintrunner-adapters; extra == "dev"
-
-
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary

 [](https://pypi.org/project/torchx/)
 [](https://github.com/meta-pytorch/torchx/blob/main/LICENSE)
{torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info}/RECORD
CHANGED
@@ -1,6 +1,7 @@
 torchx/__init__.py,sha256=QFDTdJacncWYWHL-2QyWdY5MUck3jVfSPRRGdvedcKc,355
+torchx/_version.py,sha256=TzDuXIviDldFbXAhGe33redQcoP33jIsVR_hMyqSgdc,250
 torchx/notebook.py,sha256=Rc6XUMzSq7NXtsYdtVluE6T89LpEhcba-3ANxuaLCCU,1008
-torchx/version.py,sha256=
+torchx/version.py,sha256=YcE66UkBxYHMQMtjVts4jF3l6Qeaj1gK_LzxU77l8Bo,975
 torchx/apps/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
 torchx/apps/serve/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
 torchx/apps/serve/serve.py,sha256=u_h8agld1TwIPq5GRosHL3uxhkljNfS65McLB77O0OE,4386
@@ -13,6 +14,7 @@ torchx/cli/argparse_util.py,sha256=kZb1ubEHDrBsmrxpySFRQCW7wmHuRHD8eAInuEZjlsI,3
 torchx/cli/cmd_base.py,sha256=SdqMtqi04CEqnzcgcS35DbDbsBeMxSgEhfynfpIkMGk,790
 torchx/cli/cmd_cancel.py,sha256=NKfOCu_44Lch9vliGSQ0Uv6BVqpUqj7Tob652TI-ua4,835
 torchx/cli/cmd_configure.py,sha256=1kTv0qbsbV44So74plAySwWu56pQrqjhfW_kbfdC3Rw,1722
+torchx/cli/cmd_delete.py,sha256=US1f6Jvyhz4R_0Q0a8GeNTDMrhzo8WE_ECcdOf0MjKE,835
 torchx/cli/cmd_describe.py,sha256=E5disbHoKTsqYKp2s3DaFW9GDLCCOgdOc3pQoHKoyCs,1283
 torchx/cli/cmd_list.py,sha256=alkS9aIaDI8lX3W8uj8Vtr3IU3G2VeCuokKSd3zOFug,1409
 torchx/cli/cmd_log.py,sha256=v-EZYUDOcG95rEgTnrsmPJMUyxM9Mk8YFAJtUxtgViE,5475
@@ -21,7 +23,7 @@ torchx/cli/cmd_runopts.py,sha256=NWZiP8XpQjfTDJgays2c6MgL_8wxFoeDge6NstaZdKk,130
 torchx/cli/cmd_status.py,sha256=22IAEmKs0qkG6kJi83u9dRX2Q-ntT7yehVx7FxtY-vQ,2114
 torchx/cli/cmd_tracker.py,sha256=9gmOmYi-89qQRGQfSrXCTto7ve54_JKFqs_wa7oRUA8,5223
 torchx/cli/colors.py,sha256=yLMes7e_UoLAfhxE0W6edhc58t83UHAlnCN2ANPeuXw,568
-torchx/cli/main.py,sha256=
+torchx/cli/main.py,sha256=1DJTmKdvPW_7hod8OUVT3Br2uwsZVEDU-2bTE0NJ0zY,3559
 torchx/components/__init__.py,sha256=JaVte0j9Gqi6IrjZKudJ2Kr3gkdHsvlCdRTo-zYpSRo,11815
 torchx/components/component_test_base.py,sha256=22iNSdVa_qTW3SMM30Pw5UEWlK4DZVw0C03EqYiaLOI,4150
 torchx/components/dist.py,sha256=6DNPEvHVqEifmM8g1L7HVY169cQv_7tSfSlh3o6lTp4,14930
@@ -48,7 +50,7 @@ torchx/examples/apps/lightning/profiler.py,sha256=SSSihnwjeUTkBoz0E3qn1b-wbkfUIo
 torchx/examples/apps/lightning/train.py,sha256=0wvvshGHvZowePB4LfclXwn40X7i9euM0ReETWBcPSo,6253
 torchx/pipelines/__init__.py,sha256=2MbRVk5xwRjg-d2qPemeXpEhDsocMQumPQ53lsesZAI,606
 torchx/runner/__init__.py,sha256=x8Sz7s_tLxPgJgvWIhK4ju9BNZU61uBFywGwDY6CqJs,315
-torchx/runner/api.py,sha256=
+torchx/runner/api.py,sha256=Qi12Kjkr_zpQBesbLuCtgKET8JhHnQk22MV7Czi4l1A,30832
 torchx/runner/config.py,sha256=SaKOB50d79WaMFPWK8CC4as6UaNFaRGhrBkfajq3KC4,18311
 torchx/runner/events/__init__.py,sha256=cMiNjnr4eUNQ2Nxxtu4nsvN5lu56b-a6nJ-ct3i7DQk,5536
 torchx/runner/events/api.py,sha256=bvxKBAYK8LzbrBNaNLgL1x0aivtfANmWo1EMGOrSR8k,2668
@@ -57,20 +59,20 @@ torchx/runtime/__init__.py,sha256=Wxje2BryzeQneFu5r6P9JJiEKG-_C9W1CcZ_JNrKT6g,59
 torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-wLQYBo,3055
 torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
 torchx/schedulers/__init__.py,sha256=FQN9boQM4mwOD3sK9LZ3GBgw-gJ7Vx4MFj6z6ATQIrc,2211
-torchx/schedulers/api.py,sha256=
+torchx/schedulers/api.py,sha256=PwXmqMDbwDlwpJsnaXcQSX6lf7YkyK6YsTSviMyflGY,16563
 torchx/schedulers/aws_batch_scheduler.py,sha256=-HpjNVhSFBDxZo3cebK-3YEguB49dxoaud2gz30cAVM,29437
 torchx/schedulers/aws_sagemaker_scheduler.py,sha256=flN8GumKE2Dz4X_foAt6Jnvt-ZVojWs6pcyrHwB0hz0,20921
 torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
 torchx/schedulers/docker_scheduler.py,sha256=x-XHCqYnrmiW0dHfVA7hz7Fp2Qgw7fvMgRm058YOngY,16880
 torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
 torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=1tuzq3OutCMdSPqg_dNmCHt_wyuSFKG0-ywLc3qITJo,42949
-torchx/schedulers/kubernetes_scheduler.py,sha256=
+torchx/schedulers/kubernetes_scheduler.py,sha256=PTCgDLshK5EUsZIGnTafjZ7LrO2YUjHmgR0mPL9VGFM,35672
 torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
 torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
 torchx/schedulers/slurm_scheduler.py,sha256=vypGaCZe61bkyNkqRlK4Iwmk_NaAUQi-DsspaWd6BZw,31873
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
-torchx/specs/__init__.py,sha256=
-torchx/specs/api.py,sha256=
+torchx/specs/__init__.py,sha256=TaC0AveTebkCMo5hmdY1wGpo09vFDqzWnsT166ionTw,7108
+torchx/specs/api.py,sha256=OrLX4gGa97qtjUbl3x_YnOKCdP0rQkVEruPIbNjo7fk,49230
 torchx/specs/builders.py,sha256=Ye3of4MupJ-da8vLaX6_-nzGo_FRw1BFpYsX6dAZCNk,13730
 torchx/specs/file_linter.py,sha256=z0c4mKJv47BWiPaWCdUM0A8kHwnj4b1s7oTmESuD9Tc,14407
 torchx/specs/finder.py,sha256=gWQNEFrLYqrZoI0gMMhQ70YAC4sxqS0ZFpoWAmcVi44,17438
@@ -99,12 +101,12 @@ torchx/util/shlex.py,sha256=eXEKu8KC3zIcd8tEy9_s8Ds5oma8BORr-0VGWNpG2dk,463
 torchx/util/strings.py,sha256=7Ef1loz2IYMrzeJ6Lewywi5cBIc3X3g7lSPbT1Tn_z4,664
 torchx/util/types.py,sha256=E9dxAWQnsJkIDuHtg-poeOJ4etucSI_xP_Z5kNJX8uI,9229
 torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,798
-torchx/workspace/api.py,sha256=
+torchx/workspace/api.py,sha256=UESQ4qgxXjsb6Y1wP9OGv2ixaFgaTs3SqghmNuOJIZM,10235
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.
-torchx_nightly-2025.
-torchx_nightly-2025.
-torchx_nightly-2025.
-torchx_nightly-2025.
-torchx_nightly-2025.
+torchx_nightly-2025.11.20.dist-info/licenses/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.11.20.dist-info/METADATA,sha256=yeYyvVFSNXDwzGTXtDktxEfyAHvepkZeM7uzQbSoqjk,5324
+torchx_nightly-2025.11.20.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+torchx_nightly-2025.11.20.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.11.20.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.11.20.dist-info/RECORD,,
{torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info}/entry_points.txt
RENAMED
File without changes
{torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info/licenses}/LICENSE
RENAMED
File without changes
{torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.11.20.dist-info}/top_level.txt
RENAMED
File without changes