torchx-nightly 2025.10.27__py3-none-any.whl → 2025.10.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/_version.py +8 -0
- torchx/schedulers/kubernetes_scheduler.py +157 -4
- torchx/version.py +2 -2
- {torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/METADATA +1 -1
- {torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/RECORD +9 -8
- {torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/LICENSE +0 -0
- {torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/top_level.txt +0 -0
torchx/_version.py
ADDED
torchx/schedulers/kubernetes_scheduler.py
CHANGED
@@ -27,6 +27,76 @@ Install Volcano:
 See the
 `Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
 for more information.
+
+Pod Overlay
+===========
+
+You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+the ``kubernetes`` metadata on your role. The value can be:
+
+- A dict with the overlay structure
+- A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+Merge semantics:
+- **dict**: recursive merge (upsert)
+- **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+- **primitives**: replace
+
+.. code:: python
+
+    from torchx.specs import Role
+
+    # Dict overlay - lists append, tuples replace
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": {
+                "spec": {
+                    "nodeSelector": {"gpu": "true"},
+                    "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}],  # appends
+                    "volumes": ({"name": "my-volume", "emptyDir": {}},)  # replaces
+                }
+            }
+        }
+    )
+
+    # File URI overlay
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": "file:///path/to/pod_overlay.yaml"
+        }
+    )
+
+CLI usage with builtin components:
+
+.. code:: bash
+
+    $ torchx run --scheduler kubernetes dist.ddp \\
+        --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+        --script train.py
+
+Example ``pod_overlay.yaml``:
+
+.. code:: yaml
+
+    spec:
+      nodeSelector:
+        node.kubernetes.io/instance-type: p4d.24xlarge
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      volumes: !!python/tuple
+      - name: my-volume
+        emptyDir: {}
+
+The overlay is deep-merged with the generated pod, preserving existing fields
+and adding or overriding specified ones.
 """
 
 import json
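The merge semantics documented in this new docstring can be sanity-checked with a standalone sketch (plain Python mirroring the deep_merge helper added further down in this file; the dict values here are made-up examples, not torchx defaults):

    from typing import Any, Dict

    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
        # dict -> recursive upsert, tuple -> replace (coerced to list),
        # list -> append, primitive -> replace
        for key, value in overlay.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_merge(base[key], value)
            elif isinstance(value, tuple):
                base[key] = list(value)
            elif isinstance(value, list) and isinstance(base.get(key), list):
                base[key].extend(value)
            else:
                base[key] = value

    pod = {"spec": {"tolerations": [{"key": "a"}], "nodeSelector": {"gpu": "true"}}}
    overlay = {
        "spec": {
            "tolerations": [{"key": "b"}],   # list: appended after {"key": "a"}
            "volumes": ({"name": "v"},),     # tuple: replaces any existing value
            "nodeSelector": {"zone": "us"},  # dict: merged key by key
        }
    }
    deep_merge(pod, overlay)
    assert pod["spec"]["tolerations"] == [{"key": "a"}, {"key": "b"}]
    assert pod["spec"]["volumes"] == [{"name": "v"}]
    assert pod["spec"]["nodeSelector"] == {"gpu": "true", "zone": "us"}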
@@ -45,6 +115,7 @@ from typing import (
     Tuple,
     TYPE_CHECKING,
     TypedDict,
+    Union,
 )
 
 import torchx
@@ -97,6 +168,40 @@ logger: logging.Logger = logging.getLogger(__name__)
 RESERVED_MILLICPU = 100
 RESERVED_MEMMB = 1024
 
+
+def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+    """Apply overlay dict to V1Pod object, merging nested fields.
+
+    Merge semantics:
+    - dict: upsert (recursive merge)
+    - list: append by default, replace if tuple
+    - primitives: replace
+    """
+    from kubernetes import client
+
+    api = client.ApiClient()
+    pod_dict = api.sanitize_for_serialization(pod)
+
+    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+        for key, value in overlay.items():
+            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+                deep_merge(base[key], value)
+            elif isinstance(value, tuple):
+                base[key] = list(value)
+            elif (
+                isinstance(value, list) and key in base and isinstance(base[key], list)
+            ):
+                base[key].extend(value)
+            else:
+                base[key] = value
+
+    deep_merge(pod_dict, overlay)
+
+    merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+    pod.spec = merged_pod.spec
+    pod.metadata = merged_pod.metadata
+
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
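A quick way to exercise this private helper (a sketch, assuming the kubernetes Python client is installed; the pod and overlay values are illustrative):

    from kubernetes import client

    from torchx.schedulers.kubernetes_scheduler import _apply_pod_overlay

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(name="trainer-0", labels={}),
        spec=client.V1PodSpec(
            containers=[client.V1Container(name="trainer", image="my-image:latest")]
        ),
    )
    # The helper round-trips through ApiClient serialization, so overlay keys
    # use the Kubernetes wire format (camelCase "nodeSelector"), not the
    # client's snake_case attribute names.
    _apply_pod_overlay(pod, {"spec": {"nodeSelector": {"gpu": "true"}}})
    assert pod.spec.node_selector == {"gpu": "true"}

Note that deserialization goes through ApiClient's name-mangled private __deserialize method, so this helper leans on a non-public API of the kubernetes client.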
@@ -369,7 +474,7 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
-) -> Dict[str,
+) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
     the provided AppDef. The resource definition can be used to launch the
@@ -402,6 +507,17 @@ def app_to_resource(
         replica_role.env["TORCHX_IMAGE"] = replica_role.image
 
         pod = role_to_pod(name, replica_role, service_account)
+        if k8s_metadata := role.metadata.get("kubernetes"):
+            if isinstance(k8s_metadata, str):
+                import fsspec
+
+                with fsspec.open(k8s_metadata, "r") as f:
+                    k8s_metadata = yaml.unsafe_load(f)
+            elif not isinstance(k8s_metadata, dict):
+                raise ValueError(
+                    f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+                )
+            _apply_pod_overlay(pod, k8s_metadata)
         pod.metadata.labels.update(
             pod_labels(
                 app=app,
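File overlays are parsed with yaml.unsafe_load rather than yaml.safe_load, which is what lets the ``!!python/tuple`` tag from the docstring example come back as an actual tuple (a small demonstration; the document contents are illustrative):

    import yaml

    doc = (
        "spec:\n"
        "  volumes: !!python/tuple\n"
        "  - name: my-volume\n"
        "    emptyDir: {}\n"
    )
    data = yaml.unsafe_load(doc)
    # The tag constructs a real Python tuple, which _apply_pod_overlay
    # treats as "replace" rather than "append".
    assert isinstance(data["spec"]["volumes"], tuple)
    # yaml.safe_load(doc) would raise yaml.constructor.ConstructorError here.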
@@ -444,7 +560,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
     if priority_class is not None:
         job_spec["priorityClassName"] = priority_class
 
-    resource: Dict[str,
+    resource: Dict[str, Any] = {
         "apiVersion": "batch.volcano.sh/v1alpha1",
         "kind": "Job",
         "metadata": {"name": f"{unique_app_id}"},
@@ -456,7 +572,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
 @dataclass
 class KubernetesJob:
     images_to_push: Dict[str, Tuple[str, str]]
-    resource: Dict[str,
+    resource: Dict[str, Any]
 
     def __str__(self) -> str:
         return yaml.dump(sanitize_for_serialization(self.resource))
@@ -471,6 +587,7 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    validate_spec: Optional[bool]
 
 
 class KubernetesScheduler(
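Since KubernetesOpts is declared with total=False, every key stays optional; a config sketch (values are examples, and the queue/namespace keys are assumed from the parts of this TypedDict not shown in the diff):

    cfg: KubernetesOpts = {
        "queue": "default",
        "namespace": "ml-team",
        "validate_spec": True,  # opts into the API-server dry-run added below
    }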
@@ -636,7 +753,7 @@ class KubernetesScheduler(
         else:
             raise
 
-        return f
+        return f"{namespace}:{resp['metadata']['name']}"
 
     def _submit_dryrun(
         self, app: AppDef, cfg: KubernetesOpts
@@ -659,6 +776,36 @@ class KubernetesScheduler(
         ), "priority_class must be a str"
 
         resource = app_to_resource(app, queue, service_account, priority_class)
+
+        if cfg.get("validate_spec"):
+            try:
+                self._custom_objects_api().create_namespaced_custom_object(
+                    group="batch.volcano.sh",
+                    version="v1alpha1",
+                    namespace=cfg.get("namespace") or "default",
+                    plural="jobs",
+                    body=resource,
+                    dry_run="All",
+                )
+            except Exception as e:
+                from kubernetes.client.rest import ApiException
+
+                if isinstance(e, ApiException):
+                    raise ValueError(f"Invalid job spec: {e.reason}") from e
+                raise
+
+        job_name = resource["metadata"]["name"]
+        for task in resource["spec"]["tasks"]:
+            task_name = task["name"]
+            replicas = task.get("replicas", 1)
+            max_index = replicas - 1
+            pod_name = f"{job_name}-{task_name}-{max_index}"
+            if len(pod_name) > 63:
+                raise ValueError(
+                    f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+                    f"Shorten app.name or role names"
+                )
+
         req = KubernetesJob(
             resource=resource,
             images_to_push=images_to_push,
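The 63-character ceiling on the generated pod name comes from the DNS label limit on Kubernetes object names; the arithmetic is easy to check by hand (the names below are illustrative):

    job_name = "my-very-long-experiment-name-v2-abc123"  # unique_app_id, 38 chars
    task_name = "trainer"
    replicas = 8
    pod_name = f"{job_name}-{task_name}-{replicas - 1}"  # highest replica index
    assert len(pod_name) == 48  # under the limit; over 63 raises at dryrun time

Checking only the highest index is sufficient because it yields the longest name the job will produce (lower indices are the same length or shorter).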
@@ -703,6 +850,12 @@ class KubernetesScheduler(
             type_=str,
             help="The name of the PriorityClass to set on the job specs",
         )
+        opts.add(
+            "validate_spec",
+            type_=bool,
+            help="Validate job spec using Kubernetes API dry-run before submission",
+            default=True,
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
torchx/version.py
CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -7,6 +6,7 @@
 
 # pyre-strict
 
+from torchx._version import BASE_VERSION
 from torchx.util.entrypoints import load
 
 # Follows PEP-0440 version scheme guidelines
@@ -18,7 +18,7 @@ from torchx.util.entrypoints import load
 # 0.1.0bN # Beta release
 # 0.1.0rcN # Release Candidate
 # 0.1.0 # Final release
-__version__ =
+__version__: str = BASE_VERSION
 
 
 # Use the github container registry images corresponding to the current package
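The new torchx/_version.py itself is collapsed in this view; the only things this diff confirms about it are the BASE_VERSION export imported above, its +8 line count, and its size in the RECORD below. A hypothetical minimal shape, with a placeholder value:

    # torchx/_version.py -- hypothetical reconstruction; the real file is
    # 8 lines / 250 bytes per the RECORD and its contents are not shown here
    BASE_VERSION: str = "0.0.0.dev0"  # placeholder, not the real value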
{torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/RECORD
CHANGED
@@ -1,6 +1,7 @@
 torchx/__init__.py,sha256=QFDTdJacncWYWHL-2QyWdY5MUck3jVfSPRRGdvedcKc,355
+torchx/_version.py,sha256=TzDuXIviDldFbXAhGe33redQcoP33jIsVR_hMyqSgdc,250
 torchx/notebook.py,sha256=Rc6XUMzSq7NXtsYdtVluE6T89LpEhcba-3ANxuaLCCU,1008
-torchx/version.py,sha256=
+torchx/version.py,sha256=YcE66UkBxYHMQMtjVts4jF3l6Qeaj1gK_LzxU77l8Bo,975
 torchx/apps/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
 torchx/apps/serve/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
 torchx/apps/serve/serve.py,sha256=u_h8agld1TwIPq5GRosHL3uxhkljNfS65McLB77O0OE,4386
@@ -64,7 +65,7 @@ torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,
 torchx/schedulers/docker_scheduler.py,sha256=x-XHCqYnrmiW0dHfVA7hz7Fp2Qgw7fvMgRm058YOngY,16880
 torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
 torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=1tuzq3OutCMdSPqg_dNmCHt_wyuSFKG0-ywLc3qITJo,42949
-torchx/schedulers/kubernetes_scheduler.py,sha256=
+torchx/schedulers/kubernetes_scheduler.py,sha256=Dg4olz-JTjOEzV3TBDqdXVb14yybpHAgZzFbH5UZlWU,33479
 torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
 torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
 torchx/schedulers/slurm_scheduler.py,sha256=vypGaCZe61bkyNkqRlK4Iwmk_NaAUQi-DsspaWd6BZw,31873
@@ -102,9 +103,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=UESQ4qgxXjsb6Y1wP9OGv2ixaFgaTs3SqghmNuOJIZM,10235
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.10.
-torchx_nightly-2025.10.
-torchx_nightly-2025.10.
-torchx_nightly-2025.10.
-torchx_nightly-2025.10.
-torchx_nightly-2025.10.
+torchx_nightly-2025.10.29.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.10.29.dist-info/METADATA,sha256=grbnbviugsKDzeDVXKXb7wQX0i7QsosXt7nw2klW2BE,5046
+torchx_nightly-2025.10.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+torchx_nightly-2025.10.29.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.10.29.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.10.29.dist-info/RECORD,,
{torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/LICENSE
RENAMED
File without changes
{torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/WHEEL
RENAMED
File without changes
{torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/entry_points.txt
RENAMED
File without changes
{torchx_nightly-2025.10.27.dist-info → torchx_nightly-2025.10.29.dist-info}/top_level.txt
RENAMED
File without changes