torchx-nightly 2025.10.26__py3-none-any.whl → 2025.10.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchx-nightly might be problematic. Click here for more details.

torchx/_version.py ADDED
@@ -0,0 +1,8 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+ BASE_VERSION = "0.8.0dev0"
@@ -27,6 +27,76 @@ Install Volcano:
27
27
  See the
28
28
  `Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
29
29
  for more information.
30
+
31
+ Pod Overlay
32
+ ===========
33
+
34
+ You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
35
+ the ``kubernetes`` metadata on your role. The value can be:
36
+
37
+ - A dict with the overlay structure
38
+ - A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
39
+
40
+ Merge semantics:
41
+ - **dict**: recursive merge (upsert)
42
+ - **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
43
+ - **primitives**: replace
44
+
45
+ .. code:: python
46
+
47
+ from torchx.specs import Role
48
+
49
+ # Dict overlay - lists append, tuples replace
50
+ role = Role(
51
+ name="trainer",
52
+ image="my-image:latest",
53
+ entrypoint="train.py",
54
+ metadata={
55
+ "kubernetes": {
56
+ "spec": {
57
+ "nodeSelector": {"gpu": "true"},
58
+ "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}], # appends
59
+ "volumes": ({"name": "my-volume", "emptyDir": {}},) # replaces
60
+ }
61
+ }
62
+ }
63
+ )
64
+
65
+ # File URI overlay
66
+ role = Role(
67
+ name="trainer",
68
+ image="my-image:latest",
69
+ entrypoint="train.py",
70
+ metadata={
71
+ "kubernetes": "file:///path/to/pod_overlay.yaml"
72
+ }
73
+ )
74
+
75
+ CLI usage with builtin components:
76
+
77
+ .. code:: bash
78
+
79
+ $ torchx run --scheduler kubernetes dist.ddp \\
80
+ --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
81
+ --script train.py
82
+
83
+ Example ``pod_overlay.yaml``:
84
+
85
+ .. code:: yaml
86
+
87
+ spec:
88
+ nodeSelector:
89
+ node.kubernetes.io/instance-type: p4d.24xlarge
90
+ tolerations:
91
+ - key: nvidia.com/gpu
92
+ operator: Exists
93
+ effect: NoSchedule
94
+ volumes: !!python/tuple
95
+ - name: my-volume
96
+ emptyDir: {}
97
+
98
+ The overlay is deep-merged with the generated pod, preserving existing fields
99
+ and adding or overriding specified ones.
30
100
  """
31
101
 
32
102
  import json
@@ -45,6 +115,7 @@ from typing import (
45
115
  Tuple,
46
116
  TYPE_CHECKING,
47
117
  TypedDict,
118
+ Union,
48
119
  )
49
120
 
50
121
  import torchx
@@ -97,6 +168,40 @@ logger: logging.Logger = logging.getLogger(__name__)
97
168
  RESERVED_MILLICPU = 100
98
169
  RESERVED_MEMMB = 1024
99
170
 
171
+
172
def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
    """Fold ``overlay`` into ``pod`` in place.

    The pod is serialized to a plain dict through the Kubernetes
    ``ApiClient``, the overlay is merged into that dict, and the result is
    deserialized back into a ``V1Pod`` whose ``spec`` and ``metadata`` are
    assigned onto ``pod``. No other attribute of ``pod`` is written.

    How each overlay entry is applied:
      - dict onto an existing dict: merged recursively (upsert)
      - tuple: stored as a list, replacing whatever was there
      - list onto an existing list: items are appended
      - anything else: the value is set, replacing any existing one

    Args:
        pod: the pod object to update; mutated in place.
        overlay: overlay structure keyed like the serialized pod dict.
    """
    from kubernetes import client

    api_client = client.ApiClient()

    def _merge_into(target: Dict[str, Any], patch: Dict[str, Any]) -> None:
        # Walk the overlay one entry at a time; the first matching rule wins.
        for field, incoming in patch.items():
            existing = target.get(field)
            if isinstance(incoming, dict) and isinstance(existing, dict):
                _merge_into(existing, incoming)
                continue
            if isinstance(incoming, tuple):
                # A tuple signals "replace", and is stored as a plain list.
                target[field] = list(incoming)
                continue
            if isinstance(incoming, list) and isinstance(existing, list):
                existing.extend(incoming)
                continue
            target[field] = incoming

    serialized = api_client.sanitize_for_serialization(pod)
    _merge_into(serialized, overlay)

    # The name-mangled private deserializer is called directly with the
    # merged dict and the target model name.
    rebuilt = api_client._ApiClient__deserialize(serialized, "V1Pod")
    pod.spec = rebuilt.spec
    pod.metadata = rebuilt.metadata
203
+
204
+
100
205
  RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
101
206
  RetryPolicy.REPLICA: [],
102
207
  RetryPolicy.APPLICATION: [
@@ -369,7 +474,7 @@ def app_to_resource(
369
474
  queue: str,
370
475
  service_account: Optional[str],
371
476
  priority_class: Optional[str] = None,
372
- ) -> Dict[str, object]:
477
+ ) -> Dict[str, Any]:
373
478
  """
374
479
  app_to_resource creates a volcano job kubernetes resource definition from
375
480
  the provided AppDef. The resource definition can be used to launch the
@@ -402,6 +507,17 @@ def app_to_resource(
402
507
  replica_role.env["TORCHX_IMAGE"] = replica_role.image
403
508
 
404
509
  pod = role_to_pod(name, replica_role, service_account)
510
+ if k8s_metadata := role.metadata.get("kubernetes"):
511
+ if isinstance(k8s_metadata, str):
512
+ import fsspec
513
+
514
+ with fsspec.open(k8s_metadata, "r") as f:
515
+ k8s_metadata = yaml.unsafe_load(f)
516
+ elif not isinstance(k8s_metadata, dict):
517
+ raise ValueError(
518
+ f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
519
+ )
520
+ _apply_pod_overlay(pod, k8s_metadata)
405
521
  pod.metadata.labels.update(
406
522
  pod_labels(
407
523
  app=app,
@@ -444,7 +560,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
444
560
  if priority_class is not None:
445
561
  job_spec["priorityClassName"] = priority_class
446
562
 
447
- resource: Dict[str, object] = {
563
+ resource: Dict[str, Any] = {
448
564
  "apiVersion": "batch.volcano.sh/v1alpha1",
449
565
  "kind": "Job",
450
566
  "metadata": {"name": f"{unique_app_id}"},
@@ -456,7 +572,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
456
572
  @dataclass
457
573
  class KubernetesJob:
458
574
  images_to_push: Dict[str, Tuple[str, str]]
459
- resource: Dict[str, object]
575
+ resource: Dict[str, Any]
460
576
 
461
577
  def __str__(self) -> str:
462
578
  return yaml.dump(sanitize_for_serialization(self.resource))
@@ -471,6 +587,7 @@ class KubernetesOpts(TypedDict, total=False):
471
587
  image_repo: Optional[str]
472
588
  service_account: Optional[str]
473
589
  priority_class: Optional[str]
590
+ validate_spec: Optional[bool]
474
591
 
475
592
 
476
593
  class KubernetesScheduler(
@@ -636,7 +753,7 @@ class KubernetesScheduler(
636
753
  else:
637
754
  raise
638
755
 
639
- return f'{namespace}:{resp["metadata"]["name"]}'
756
+ return f"{namespace}:{resp['metadata']['name']}"
640
757
 
641
758
  def _submit_dryrun(
642
759
  self, app: AppDef, cfg: KubernetesOpts
@@ -659,6 +776,36 @@ class KubernetesScheduler(
659
776
  ), "priority_class must be a str"
660
777
 
661
778
  resource = app_to_resource(app, queue, service_account, priority_class)
779
+
780
+ if cfg.get("validate_spec"):
781
+ try:
782
+ self._custom_objects_api().create_namespaced_custom_object(
783
+ group="batch.volcano.sh",
784
+ version="v1alpha1",
785
+ namespace=cfg.get("namespace") or "default",
786
+ plural="jobs",
787
+ body=resource,
788
+ dry_run="All",
789
+ )
790
+ except Exception as e:
791
+ from kubernetes.client.rest import ApiException
792
+
793
+ if isinstance(e, ApiException):
794
+ raise ValueError(f"Invalid job spec: {e.reason}") from e
795
+ raise
796
+
797
+ job_name = resource["metadata"]["name"]
798
+ for task in resource["spec"]["tasks"]:
799
+ task_name = task["name"]
800
+ replicas = task.get("replicas", 1)
801
+ max_index = replicas - 1
802
+ pod_name = f"{job_name}-{task_name}-{max_index}"
803
+ if len(pod_name) > 63:
804
+ raise ValueError(
805
+ f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
806
+ f"Shorten app.name or role names"
807
+ )
808
+
662
809
  req = KubernetesJob(
663
810
  resource=resource,
664
811
  images_to_push=images_to_push,
@@ -703,6 +850,12 @@ class KubernetesScheduler(
703
850
  type_=str,
704
851
  help="The name of the PriorityClass to set on the job specs",
705
852
  )
853
+ opts.add(
854
+ "validate_spec",
855
+ type_=bool,
856
+ help="Validate job spec using Kubernetes API dry-run before submission",
857
+ default=True,
858
+ )
706
859
  return opts
707
860
 
708
861
  def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
torchx/version.py CHANGED
@@ -1,4 +1,3 @@
1
- #!/usr/bin/env python3
2
1
  # Copyright (c) Meta Platforms, Inc. and affiliates.
3
2
  # All rights reserved.
4
3
  #
@@ -7,6 +6,7 @@
7
6
 
8
7
  # pyre-strict
9
8
 
9
+ from torchx._version import BASE_VERSION
10
10
  from torchx.util.entrypoints import load
11
11
 
12
12
  # Follows PEP-0440 version scheme guidelines
@@ -18,7 +18,7 @@ from torchx.util.entrypoints import load
18
18
  # 0.1.0bN # Beta release
19
19
  # 0.1.0rcN # Release Candidate
20
20
  # 0.1.0 # Final release
21
- __version__ = "0.8.0dev0"
21
+ __version__: str = BASE_VERSION
22
22
 
23
23
 
24
24
  # Use the github container registry images corresponding to the current package
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: torchx-nightly
3
- Version: 2025.10.26
3
+ Version: 2025.10.28
4
4
  Summary: TorchX SDK and Components
5
5
  Home-page: https://github.com/meta-pytorch/torchx
6
6
  Author: TorchX Devs
@@ -1,6 +1,7 @@
1
1
  torchx/__init__.py,sha256=QFDTdJacncWYWHL-2QyWdY5MUck3jVfSPRRGdvedcKc,355
2
+ torchx/_version.py,sha256=TzDuXIviDldFbXAhGe33redQcoP33jIsVR_hMyqSgdc,250
2
3
  torchx/notebook.py,sha256=Rc6XUMzSq7NXtsYdtVluE6T89LpEhcba-3ANxuaLCCU,1008
3
- torchx/version.py,sha256=d28ccaZP21nlF8jEmSLjJiidyquMJo02tDpeVD36inc,951
4
+ torchx/version.py,sha256=YcE66UkBxYHMQMtjVts4jF3l6Qeaj1gK_LzxU77l8Bo,975
4
5
  torchx/apps/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
5
6
  torchx/apps/serve/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
6
7
  torchx/apps/serve/serve.py,sha256=u_h8agld1TwIPq5GRosHL3uxhkljNfS65McLB77O0OE,4386
@@ -64,7 +65,7 @@ torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,
64
65
  torchx/schedulers/docker_scheduler.py,sha256=x-XHCqYnrmiW0dHfVA7hz7Fp2Qgw7fvMgRm058YOngY,16880
65
66
  torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
66
67
  torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=1tuzq3OutCMdSPqg_dNmCHt_wyuSFKG0-ywLc3qITJo,42949
67
- torchx/schedulers/kubernetes_scheduler.py,sha256=M1efsLg2keHLrvh4iR1tGBYK4MYPuBSI7Exup17cACE,28498
68
+ torchx/schedulers/kubernetes_scheduler.py,sha256=Dg4olz-JTjOEzV3TBDqdXVb14yybpHAgZzFbH5UZlWU,33479
68
69
  torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
69
70
  torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
70
71
  torchx/schedulers/slurm_scheduler.py,sha256=vypGaCZe61bkyNkqRlK4Iwmk_NaAUQi-DsspaWd6BZw,31873
@@ -102,9 +103,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
102
103
  torchx/workspace/api.py,sha256=UESQ4qgxXjsb6Y1wP9OGv2ixaFgaTs3SqghmNuOJIZM,10235
103
104
  torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
104
105
  torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
105
- torchx_nightly-2025.10.26.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
106
- torchx_nightly-2025.10.26.dist-info/METADATA,sha256=3-GMRsszZyTXBcI1DhK9yxcR5iaZq63zrk9ckODdlCs,5046
107
- torchx_nightly-2025.10.26.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
108
- torchx_nightly-2025.10.26.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
109
- torchx_nightly-2025.10.26.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
110
- torchx_nightly-2025.10.26.dist-info/RECORD,,
106
+ torchx_nightly-2025.10.28.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
107
+ torchx_nightly-2025.10.28.dist-info/METADATA,sha256=-Zhur1L-9qsl_u8Jw5x__EEgG-Y_i9wNKu0FI-hxKdg,5046
108
+ torchx_nightly-2025.10.28.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
109
+ torchx_nightly-2025.10.28.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
110
+ torchx_nightly-2025.10.28.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
111
+ torchx_nightly-2025.10.28.dist-info/RECORD,,