torchx-nightly 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (110)
  1. torchx/__init__.py +2 -0
  2. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  3. torchx/apps/serve/serve.py +2 -0
  4. torchx/apps/utils/booth_main.py +2 -0
  5. torchx/apps/utils/copy_main.py +2 -0
  6. torchx/apps/utils/process_monitor.py +2 -0
  7. torchx/cli/__init__.py +2 -0
  8. torchx/cli/argparse_util.py +38 -3
  9. torchx/cli/cmd_base.py +2 -0
  10. torchx/cli/cmd_cancel.py +2 -0
  11. torchx/cli/cmd_configure.py +2 -0
  12. torchx/cli/cmd_delete.py +30 -0
  13. torchx/cli/cmd_describe.py +2 -0
  14. torchx/cli/cmd_list.py +8 -4
  15. torchx/cli/cmd_log.py +6 -24
  16. torchx/cli/cmd_run.py +269 -45
  17. torchx/cli/cmd_runopts.py +2 -0
  18. torchx/cli/cmd_status.py +12 -1
  19. torchx/cli/cmd_tracker.py +3 -1
  20. torchx/cli/colors.py +2 -0
  21. torchx/cli/main.py +4 -0
  22. torchx/components/__init__.py +3 -8
  23. torchx/components/component_test_base.py +2 -0
  24. torchx/components/dist.py +18 -7
  25. torchx/components/integration_tests/component_provider.py +4 -2
  26. torchx/components/integration_tests/integ_tests.py +2 -0
  27. torchx/components/serve.py +2 -0
  28. torchx/components/structured_arg.py +4 -3
  29. torchx/components/utils.py +15 -4
  30. torchx/distributed/__init__.py +2 -4
  31. torchx/examples/apps/datapreproc/datapreproc.py +2 -0
  32. torchx/examples/apps/lightning/data.py +5 -3
  33. torchx/examples/apps/lightning/model.py +7 -6
  34. torchx/examples/apps/lightning/profiler.py +7 -4
  35. torchx/examples/apps/lightning/train.py +11 -2
  36. torchx/examples/torchx_out_of_sync_training.py +11 -0
  37. torchx/notebook.py +2 -0
  38. torchx/runner/__init__.py +2 -0
  39. torchx/runner/api.py +167 -60
  40. torchx/runner/config.py +43 -10
  41. torchx/runner/events/__init__.py +57 -13
  42. torchx/runner/events/api.py +14 -3
  43. torchx/runner/events/handlers.py +2 -0
  44. torchx/runtime/tracking/__init__.py +2 -0
  45. torchx/runtime/tracking/api.py +2 -0
  46. torchx/schedulers/__init__.py +16 -15
  47. torchx/schedulers/api.py +70 -14
  48. torchx/schedulers/aws_batch_scheduler.py +75 -6
  49. torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
  50. torchx/schedulers/devices.py +17 -4
  51. torchx/schedulers/docker_scheduler.py +43 -11
  52. torchx/schedulers/ids.py +29 -23
  53. torchx/schedulers/kubernetes_mcad_scheduler.py +9 -7
  54. torchx/schedulers/kubernetes_scheduler.py +383 -38
  55. torchx/schedulers/local_scheduler.py +100 -27
  56. torchx/schedulers/lsf_scheduler.py +5 -4
  57. torchx/schedulers/slurm_scheduler.py +336 -20
  58. torchx/schedulers/streams.py +2 -0
  59. torchx/specs/__init__.py +89 -12
  60. torchx/specs/api.py +418 -30
  61. torchx/specs/builders.py +176 -38
  62. torchx/specs/file_linter.py +143 -57
  63. torchx/specs/finder.py +68 -28
  64. torchx/specs/named_resources_aws.py +181 -4
  65. torchx/specs/named_resources_generic.py +2 -0
  66. torchx/specs/overlays.py +106 -0
  67. torchx/specs/test/components/__init__.py +2 -0
  68. torchx/specs/test/components/a/__init__.py +2 -0
  69. torchx/specs/test/components/a/b/__init__.py +2 -0
  70. torchx/specs/test/components/a/b/c.py +2 -0
  71. torchx/specs/test/components/c/__init__.py +2 -0
  72. torchx/specs/test/components/c/d.py +2 -0
  73. torchx/tracker/__init__.py +12 -6
  74. torchx/tracker/api.py +15 -18
  75. torchx/tracker/backend/fsspec.py +2 -0
  76. torchx/util/cuda.py +2 -0
  77. torchx/util/datetime.py +2 -0
  78. torchx/util/entrypoints.py +39 -15
  79. torchx/util/io.py +2 -0
  80. torchx/util/log_tee_helpers.py +210 -0
  81. torchx/util/modules.py +65 -0
  82. torchx/util/session.py +42 -0
  83. torchx/util/shlex.py +2 -0
  84. torchx/util/strings.py +3 -1
  85. torchx/util/types.py +90 -29
  86. torchx/version.py +4 -2
  87. torchx/workspace/__init__.py +2 -0
  88. torchx/workspace/api.py +136 -6
  89. torchx/workspace/dir_workspace.py +2 -0
  90. torchx/workspace/docker_workspace.py +30 -2
  91. torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
  92. torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
  93. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
  94. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
  95. torchx/examples/pipelines/__init__.py +0 -0
  96. torchx/examples/pipelines/kfp/__init__.py +0 -0
  97. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
  98. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
  99. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
  100. torchx/pipelines/kfp/__init__.py +0 -28
  101. torchx/pipelines/kfp/adapter.py +0 -271
  102. torchx/pipelines/kfp/version.py +0 -17
  103. torchx/schedulers/gcp_batch_scheduler.py +0 -487
  104. torchx/schedulers/ray/ray_common.py +0 -22
  105. torchx/schedulers/ray/ray_driver.py +0 -307
  106. torchx/schedulers/ray_scheduler.py +0 -453
  107. torchx_nightly-2024.1.6.dist-info/METADATA +0 -176
  108. torchx_nightly-2024.1.6.dist-info/RECORD +0 -118
  109. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
  110. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 """
 
 This contains the TorchX Kubernetes scheduler which can be used to run TorchX
@@ -23,12 +25,83 @@ Install Volcano:
     kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.6.0/installer/volcano-development.yaml
 
 See the
-`Volcano Quickstart <https://github.com/volcano-sh/volcano#user-content-quick-start-guide>`_
+`Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
 for more information.
+
+Pod Overlay
+===========
+
+You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+the ``kubernetes`` metadata on your role. The value can be:
+
+- A dict with the overlay structure
+- A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+Merge semantics:
+- **dict**: recursive merge (upsert)
+- **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+- **primitives**: replace
+
+.. code:: python
+
+    from torchx.specs import Role
+
+    # Dict overlay - lists append, tuples replace
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": {
+                "spec": {
+                    "nodeSelector": {"gpu": "true"},
+                    "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}],  # appends
+                    "volumes": ({"name": "my-volume", "emptyDir": {}},)  # replaces
+                }
+            }
+        }
+    )
+
+    # File URI overlay
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": "file:///path/to/pod_overlay.yaml"
+        }
+    )
+
+CLI usage with builtin components:
+
+.. code:: bash
+
+    $ torchx run --scheduler kubernetes dist.ddp \\
+        --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+        --script train.py
+
+Example ``pod_overlay.yaml``:
+
+.. code:: yaml
+
+    spec:
+      nodeSelector:
+        node.kubernetes.io/instance-type: p4d.24xlarge
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      volumes: !!python/tuple
+      - name: my-volume
+        emptyDir: {}
+
+The overlay is deep-merged with the generated pod, preserving existing fields
+and adding or overriding specified ones.
 """
 
 import json
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
@@ -42,12 +115,13 @@ from typing import (
     Optional,
     Tuple,
     TYPE_CHECKING,
+    TypedDict,
+    Union,
 )
 
 import torchx
 import yaml
 from torchx.schedulers.api import (
-    AppDryRunInfo,
     DescribeAppResponse,
     filter_regex,
     ListAppResponse,
@@ -58,6 +132,7 @@ from torchx.schedulers.api import (
 from torchx.schedulers.ids import make_unique
 from torchx.specs.api import (
     AppDef,
+    AppDryRunInfo,
     AppState,
     BindMount,
     CfgVal,
@@ -73,8 +148,6 @@ from torchx.specs.api import (
 )
 from torchx.util.strings import normalize_str
 from torchx.workspace.docker_workspace import DockerWorkspaceMixin
-from typing_extensions import TypedDict
-
 
 if TYPE_CHECKING:
     from docker import DockerClient
@@ -85,6 +158,7 @@ if TYPE_CHECKING:
     )
     from kubernetes.client.rest import ApiException
 
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 # Kubernetes reserves a small amount of resources per host for the system. For
@@ -95,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
 RESERVED_MILLICPU = 100
 RESERVED_MEMMB = 1024
 
+
+def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+    """Apply overlay dict to V1Pod object, merging nested fields.
+
+    Merge semantics:
+    - dict: upsert (recursive merge)
+    - list: append by default, replace if tuple
+    - primitives: replace
+    """
+    from kubernetes import client
+
+    api = client.ApiClient()
+    pod_dict = api.sanitize_for_serialization(pod)
+
+    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+        for key, value in overlay.items():
+            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+                deep_merge(base[key], value)
+            elif isinstance(value, tuple):
+                base[key] = list(value)
+            elif (
+                isinstance(value, list) and key in base and isinstance(base[key], list)
+            ):
+                base[key].extend(value)
+            else:
+                base[key] = value
+
+    deep_merge(pod_dict, overlay)
+
+    merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+    pod.spec = merged_pod.spec
+    pod.metadata = merged_pod.metadata
+
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
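Note: the merge rules implemented by `_apply_pod_overlay` can be exercised standalone. This sketch copies the `deep_merge` body from the hunk above and runs it on made-up sample data; the expected results follow directly from the dict/list/tuple rules:

    from typing import Any, Dict

    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
        # same rules as _apply_pod_overlay: dict upsert, list append, tuple replace
        for key, value in overlay.items():
            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
                deep_merge(base[key], value)
            elif isinstance(value, tuple):
                base[key] = list(value)
            elif isinstance(value, list) and key in base and isinstance(base[key], list):
                base[key].extend(value)
            else:
                base[key] = value

    base = {"spec": {"nodeSelector": {"zone": "a"}, "tolerations": [{"key": "x"}], "volumes": [{"name": "old"}]}}
    overlay = {"spec": {"nodeSelector": {"gpu": "true"}, "tolerations": [{"key": "y"}], "volumes": ({"name": "new"},)}}
    deep_merge(base, overlay)
    assert base["spec"]["nodeSelector"] == {"zone": "a", "gpu": "true"}  # dict: upsert
    assert base["spec"]["tolerations"] == [{"key": "x"}, {"key": "y"}]   # list: append
    assert base["spec"]["volumes"] == [{"name": "new"}]                  # tuple: replace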
@@ -167,6 +275,17 @@ ANNOTATION_ISTIO_SIDECAR = "sidecar.istio.io/inject"
 
 LABEL_INSTANCE_TYPE = "node.kubernetes.io/instance-type"
 
+# role.env translates to static env variables in the yaml
+#   {"FOO": "bar"}  =====>  - name: FOO
+#                             value: bar
+# unless this placeholder is present at the start of the role.env value, in which case
+# the env variable in the yaml is populated dynamically at runtime (the placeholder is stripped out of the value)
+#   {"FOO": "[FIELD_PATH]bar"}  =====>  - name: FOO
+#                                         valueFrom:
+#                                           fieldRef:
+#                                             fieldPath: bar
+PLACEHOLDER_FIELD_PATH = "[FIELD_PATH]"
+
 
 def sanitize_for_serialization(obj: object) -> object:
     from kubernetes import client
@@ -175,13 +294,22 @@ def sanitize_for_serialization(obj: object) -> object:
     return api.sanitize_for_serialization(obj)
 
 
-def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
+def role_to_pod(
+    name: str,
+    role: Role,
+    service_account: Optional[str],
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> "V1Pod":
     from kubernetes.client.models import (  # noqa: F811 redefinition of unused
         V1Container,
         V1ContainerPort,
         V1EmptyDirVolumeSource,
         V1EnvVar,
+        V1EnvVarSource,
         V1HostPathVolumeSource,
+        V1ObjectFieldSelector,
         V1ObjectMeta,
         V1PersistentVolumeClaimVolumeSource,
         V1Pod,
@@ -203,18 +331,29 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
     if resource.cpu > 0:
         mcpu = int(resource.cpu * 1000)
         limits["cpu"] = f"{mcpu}m"
-        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        request_mcpu = max(mcpu - reserved_millicpu, 0)
         requests["cpu"] = f"{request_mcpu}m"
     if resource.memMB > 0:
         limits["memory"] = f"{int(resource.memMB)}M"
-        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        request_memMB = max(int(resource.memMB) - reserved_memmb, 0)
         requests["memory"] = f"{request_memMB}M"
     if resource.gpu > 0:
         requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
 
+    EFA_DEVICE = "vpc.amazonaws.com/efa"
     for device_name, device_limit in resource.devices.items():
         limits[device_name] = str(device_limit)
 
+    # Handle EFA device count override:
+    # - None (default): use whatever count is in the resource spec (already added above)
+    # - 0: remove EFA devices entirely
+    # - N > 0: set EFA device count to N (override or add)
+    if efa_device_count is not None:
+        if efa_device_count == 0:
+            limits.pop(EFA_DEVICE, None)
+        else:
+            limits[EFA_DEVICE] = str(efa_device_count)
+
     resources = V1ResourceRequirements(
         limits=limits,
         requests=requests,
@@ -301,9 +440,20 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
         image=role.image,
         name=name,
         env=[
-            V1EnvVar(
-                name=name,
-                value=value,
+            (
+                V1EnvVar(
+                    name=name,
+                    value_from=V1EnvVarSource(
+                        field_ref=V1ObjectFieldSelector(
+                            field_path=value.strip(PLACEHOLDER_FIELD_PATH)
+                        )
+                    ),
+                )
+                if value.startswith(PLACEHOLDER_FIELD_PATH)
+                else V1EnvVar(
+                    name=name,
+                    value=value,
+                )
             )
             for name, value in role.env.items()
         ],
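Note: a standalone sketch of the `[FIELD_PATH]` dispatch above, rendering both env-var shapes as plain dicts (stdlib only; the names are illustrative). The release strips the marker with `value.strip(PLACEHOLDER_FIELD_PATH)`; a prefix slice is used here:

    PLACEHOLDER_FIELD_PATH = "[FIELD_PATH]"

    def to_env_entry(name: str, value: str) -> dict:
        # values prefixed with the placeholder become fieldRef entries;
        # everything else stays a static value
        if value.startswith(PLACEHOLDER_FIELD_PATH):
            field_path = value[len(PLACEHOLDER_FIELD_PATH):]
            return {"name": name, "valueFrom": {"fieldRef": {"fieldPath": field_path}}}
        return {"name": name, "value": value}

    assert to_env_entry("FOO", "bar") == {"name": "FOO", "value": "bar"}
    assert to_env_entry("POD_IP", "[FIELD_PATH]status.podIP") == {
        "name": "POD_IP",
        "valueFrom": {"fieldRef": {"fieldPath": "status.podIP"}},
    }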
@@ -343,7 +493,10 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
-) -> Dict[str, object]:
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
     the provided AppDef. The resource definition can be used to launch the
@@ -373,8 +526,27 @@ def app_to_resource(
             replica_role = values.apply(role)
             if role_idx == 0 and replica_id == 0:
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
-
-            pod = role_to_pod(name, replica_role, service_account)
+            replica_role.env["TORCHX_IMAGE"] = replica_role.image
+
+            pod = role_to_pod(
+                name,
+                replica_role,
+                service_account,
+                reserved_millicpu,
+                reserved_memmb,
+                efa_device_count,
+            )
+            if k8s_metadata := role.metadata.get("kubernetes"):
+                if isinstance(k8s_metadata, str):
+                    import fsspec
+
+                    with fsspec.open(k8s_metadata, "r") as f:
+                        k8s_metadata = yaml.unsafe_load(f)
+                elif not isinstance(k8s_metadata, dict):
+                    raise ValueError(
+                        f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+                    )
+                _apply_pod_overlay(pod, k8s_metadata)
             pod.metadata.labels.update(
                 pod_labels(
                     app=app,
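Note: the loader above deliberately uses `yaml.unsafe_load` rather than `yaml.safe_load` so that a file-based overlay can opt into replace semantics via the `!!python/tuple` tag, which PyYAML's unsafe loader deserializes to a Python tuple. A minimal check:

    import yaml

    doc = """
    spec:
      volumes: !!python/tuple
      - name: my-volume
        emptyDir: {}
    """
    overlay = yaml.unsafe_load(doc)
    # the tuple is what _apply_pod_overlay treats as "replace, don't append"
    assert isinstance(overlay["spec"]["volumes"], tuple)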
@@ -417,7 +589,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
     if priority_class is not None:
         job_spec["priorityClassName"] = priority_class
 
-    resource: Dict[str, object] = {
+    resource: Dict[str, Any] = {
         "apiVersion": "batch.volcano.sh/v1alpha1",
         "kind": "Job",
         "metadata": {"name": f"{unique_app_id}"},
@@ -429,7 +601,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
 @dataclass
 class KubernetesJob:
     images_to_push: Dict[str, Tuple[str, str]]
-    resource: Dict[str, object]
+    resource: Dict[str, Any]
 
     def __str__(self) -> str:
         return yaml.dump(sanitize_for_serialization(self.resource))
@@ -444,6 +616,10 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    validate_spec: Optional[bool]
+    reserved_millicpu: Optional[int]
+    reserved_memmb: Optional[int]
+    efa_device_count: Optional[int]
 
 
 class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
@@ -456,7 +632,7 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
     For installation instructions see: https://github.com/volcano-sh/volcano
 
     This has been confirmed to work with Volcano v1.3.0 and Kubernetes versions
-    v1.18-1.21. See https://github.com/pytorch/torchx/issues/120 which is
+    v1.18-1.21. See https://github.com/meta-pytorch/torchx/issues/120 which is
     tracking Volcano support for Kubernetes v1.22.
 
     .. note::
@@ -474,6 +650,16 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
         $ torchx status kubernetes://torchx_user/1234
         ...
 
+    **Cancellation**
+
+    Canceling a job aborts it while preserving the job spec for inspection
+    and cloning via kubectl apply. Use the delete command to remove the job entirely:
+
+    .. code-block:: bash
+
+        $ torchx cancel kubernetes://namespace/jobname  # abort, preserves spec
+        $ torchx delete kubernetes://namespace/jobname  # delete completely
+
     **Config Options**
 
     .. runopts::
@@ -552,9 +738,14 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
         if c is None:
             configuration = client.Configuration()
             try:
-                config.load_kube_config(client_configuration=configuration)
-            except config.ConfigException as e:
-                warnings.warn(f"failed to load kube config: {e}")
+                # Try in-cluster config first (for pods with ServiceAccount)
+                config.load_incluster_config(client_configuration=configuration)
+            except config.ConfigException:
+                # Fall back to kubeconfig (for local development)
+                try:
+                    config.load_kube_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load kube config: {e}", stacklevel=2)
 
             c = self._client = client.ApiClient(configuration)
 
@@ -606,7 +797,7 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
         else:
             raise
 
-        return f'{namespace}:{resp["metadata"]["name"]}'
+        return f"{namespace}:{resp['metadata']['name']}"
 
     def _submit_dryrun(
         self, app: AppDef, cfg: KubernetesOpts
@@ -628,18 +819,92 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
             priority_class, str
         ), "priority_class must be a str"
 
-        resource = app_to_resource(app, queue, service_account, priority_class)
+        reserved_millicpu = cfg.get("reserved_millicpu", RESERVED_MILLICPU)
+        assert isinstance(reserved_millicpu, int), "reserved_millicpu must be an int"
+
+        reserved_memmb = cfg.get("reserved_memmb", RESERVED_MEMMB)
+        assert isinstance(reserved_memmb, int), "reserved_memmb must be an int"
+
+        efa_device_count = cfg.get("efa_device_count")
+        assert efa_device_count is None or isinstance(
+            efa_device_count, int
+        ), "efa_device_count must be an int or None"
+
+        resource = app_to_resource(
+            app,
+            queue,
+            service_account,
+            priority_class,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
+
+        if cfg.get("validate_spec"):
+            try:
+                self._custom_objects_api().create_namespaced_custom_object(
+                    group="batch.volcano.sh",
+                    version="v1alpha1",
+                    namespace=cfg.get("namespace") or "default",
+                    plural="jobs",
+                    body=resource,
+                    dry_run="All",
+                )
+            except Exception as e:
+                from kubernetes.client.rest import ApiException
+
+                if isinstance(e, ApiException):
+                    raise ValueError(f"Invalid job spec: {e.reason}") from e
+                raise
+
+        job_name = resource["metadata"]["name"]
+        for task in resource["spec"]["tasks"]:
+            task_name = task["name"]
+            replicas = task.get("replicas", 1)
+            max_index = replicas - 1
+            pod_name = f"{job_name}-{task_name}-{max_index}"
+            if len(pod_name) > 63:
+                raise ValueError(
+                    f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+                    f"Shorten app.name or role names"
+                )
+
         req = KubernetesJob(
             resource=resource,
             images_to_push=images_to_push,
         )
         return AppDryRunInfo(req, repr)
 
-    def _validate(self, app: AppDef, scheduler: str) -> None:
+    def _validate(self, app: AppDef, scheduler: str, cfg: KubernetesOpts) -> None:
         # Skip validation step
         pass
 
     def _cancel_existing(self, app_id: str) -> None:
+        """
+        Abort a Volcano job while preserving the spec for inspection.
+        """
+        namespace, name = app_id.split(":")
+        vcjob = self._custom_objects_api().get_namespaced_custom_object(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+        )
+        vcjob["status"]["state"]["phase"] = "Aborted"
+        self._custom_objects_api().replace_namespaced_custom_object_status(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+            body=vcjob,
+        )
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Delete a Volcano job completely from the cluster.
+        """
         namespace, name = app_id.split(":")
         self._custom_objects_api().delete_namespaced_custom_object(
             group="batch.volcano.sh",
@@ -673,19 +938,52 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
             type_=str,
             help="The name of the PriorityClass to set on the job specs",
         )
+        opts.add(
+            "validate_spec",
+            type_=bool,
+            help="Validate job spec using Kubernetes API dry-run before submission",
+            default=True,
+        )
+        opts.add(
+            "reserved_millicpu",
+            type_=int,
+            help="Amount of CPU in millicores to reserve for Kubernetes system overhead (default: 100)",
+            default=RESERVED_MILLICPU,
+        )
+        opts.add(
+            "reserved_memmb",
+            type_=int,
+            help="Amount of memory in MB to reserve for Kubernetes system overhead (default: 1024)",
+            default=RESERVED_MEMMB,
+        )
+        opts.add(
+            "efa_device_count",
+            type_=int,
+            help="EFA device count override: None/unset=use resource spec, "
+            "0=remove EFA, N>0=set EFA count to N",
+            default=None,
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes import client
+        from kubernetes.client.rest import ApiException
+
         namespace, name = app_id.split(":")
         roles = {}
         roles_statuses = {}
-        resp = self._custom_objects_api().get_namespaced_custom_object_status(
-            group="batch.volcano.sh",
-            version="v1alpha1",
-            namespace=namespace,
-            plural="jobs",
-            name=name,
-        )
+        try:
+            resp = self._custom_objects_api().get_namespaced_custom_object_status(
+                group="batch.volcano.sh",
+                version="v1alpha1",
+                namespace=namespace,
+                plural="jobs",
+                name=name,
+            )
+        except ApiException as e:
+            if e.status == 404:
+                return None
+            raise
        status = resp.get("status")
        if status:
            state_str = status["state"]["phase"]
@@ -694,18 +992,44 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
             TASK_STATUS_COUNT = "taskStatusCount"
 
             if TASK_STATUS_COUNT in status:
-                for name, status in status[TASK_STATUS_COUNT].items():
-                    role, _, idx = name.rpartition("-")
+                for task_name, task_status in status[TASK_STATUS_COUNT].items():
+                    role, _, idx = task_name.rpartition("-")
 
-                    state_str = next(iter(status["phase"].keys()))
+                    state_str = next(iter(task_status["phase"].keys()))
                     state = TASK_STATE[state_str]
 
                     if role not in roles:
                         roles[role] = Role(name=role, num_replicas=0, image="")
                         roles_statuses[role] = RoleStatus(role, [])
                     roles[role].num_replicas += 1
+
+                    # Pod name follows the pattern: {job_name}-{task_name}-0
+                    # Get the pod to retrieve its IP address
+                    pod_name_k8s = f"{name}-{task_name}-0"
+                    hostname = ""
+                    try:
+                        core_api = client.CoreV1Api(self._api_client())
+                        pod = core_api.read_namespaced_pod(
+                            name=pod_name_k8s, namespace=namespace
+                        )
+                        pod_ip = pod.status.pod_ip
+
+                        if pod_ip is not None:
+                            # Convert IP to dashed format (e.g., 10.244.1.5 -> 10-244-1-5)
+                            pod_ip_dashed = pod_ip.replace(".", "-")
+
+                            # Kubernetes DNS = <pod-ip-dashed>.<namespace>.pod.cluster.local
+                            # Note: This will only be useful if the client using the IPs is in the cluster.
+                            hostname = f"{pod_ip_dashed}.{namespace}.pod.cluster.local"
+
+                    except ApiException:
+                        # Pod not found - hostname remains empty
+                        pass
+
                     roles_statuses[role].replicas.append(
-                        ReplicaStatus(id=int(idx), role=role, state=state, hostname="")
+                        ReplicaStatus(
+                            id=int(idx), role=role, state=state, hostname=hostname
+                        )
                     )
             else:
                 app_state = AppState.UNKNOWN
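Note: the hostname built above is the cluster-internal pod DNS name, produced by dashing the pod IP and appending the namespace; it resolves only from inside the cluster. In isolation:

    pod_ip = "10.244.1.5"
    namespace = "default"
    hostname = f"{pod_ip.replace('.', '-')}.{namespace}.pod.cluster.local"
    assert hostname == "10-244-1-5.default.pod.cluster.local"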
@@ -794,13 +1118,34 @@ def create_scheduler(
 def pod_labels(
     app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
 ) -> Dict[str, str]:
+
+    def clean(label_value: str) -> str:
+        # cleans the provided `label_value` to make it compliant
+        # to pod label specs as described in
+        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+        #
+        # Valid label value:
+        #   must be 63 characters or less (can be empty),
+        #   unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+        #   could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+        # Replace invalid characters (allow: alphanum, -, _, .) with "."
+        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+        # Replace leading non-alphanumeric with "."
+        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+        # Replace trailing non-alphanumeric with "."
+        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+        # Trim to 63 characters
+        return label_value[:63]
+
     return {
-        LABEL_VERSION: torchx.__version__,
-        LABEL_APP_NAME: app.name,
+        LABEL_VERSION: clean(torchx.__version__),
+        LABEL_APP_NAME: clean(app.name),
         LABEL_ROLE_INDEX: str(role_idx),
-        LABEL_ROLE_NAME: role.name,
+        LABEL_ROLE_NAME: clean(role.name),
         LABEL_REPLICA_ID: str(replica_id),
-        LABEL_KUBE_APP_NAME: app.name,
+        LABEL_KUBE_APP_NAME: clean(app.name),
         LABEL_ORGANIZATION: "torchx.pytorch.org",
-        LABEL_UNIQUE_NAME: app_id,
+        LABEL_UNIQUE_NAME: clean(app_id),
     }
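Note: for reference, `clean` (body copied from the hunk above) behaves as follows on a couple of made-up inputs:

    import re

    def clean(label_value: str) -> str:
        # copied from pod_labels above
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
        return label_value[:63]

    assert clean("my app:v1") == "my.app.v1"                # invalid chars become "."
    assert clean("a" * 80) == "a" * 63                      # trimmed to 63 chars
    assert clean("1.2.3dev20251224") == "1.2.3dev20251224"  # already compliant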