torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_list.py +1 -2
- torchx/cli/cmd_run.py +202 -28
- torchx/cli/cmd_tracker.py +1 -1
- torchx/cli/main.py +2 -0
- torchx/components/__init__.py +1 -8
- torchx/components/dist.py +9 -3
- torchx/components/integration_tests/component_provider.py +2 -2
- torchx/components/utils.py +1 -1
- torchx/distributed/__init__.py +1 -1
- torchx/runner/api.py +102 -81
- torchx/runner/config.py +3 -1
- torchx/runner/events/__init__.py +20 -10
- torchx/runner/events/api.py +1 -1
- torchx/schedulers/__init__.py +7 -10
- torchx/schedulers/api.py +66 -25
- torchx/schedulers/aws_batch_scheduler.py +47 -6
- torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
- torchx/schedulers/docker_scheduler.py +4 -3
- torchx/schedulers/ids.py +27 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
- torchx/schedulers/kubernetes_scheduler.py +355 -36
- torchx/schedulers/local_scheduler.py +2 -1
- torchx/schedulers/lsf_scheduler.py +1 -1
- torchx/schedulers/slurm_scheduler.py +102 -27
- torchx/specs/__init__.py +40 -9
- torchx/specs/api.py +222 -12
- torchx/specs/builders.py +109 -28
- torchx/specs/file_linter.py +117 -53
- torchx/specs/finder.py +25 -37
- torchx/specs/named_resources_aws.py +13 -2
- torchx/specs/overlays.py +106 -0
- torchx/tracker/__init__.py +2 -2
- torchx/tracker/api.py +1 -1
- torchx/util/entrypoints.py +1 -6
- torchx/util/strings.py +1 -1
- torchx/util/types.py +12 -1
- torchx/version.py +2 -2
- torchx/workspace/api.py +102 -5
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
- torchx/pipelines/kfp/__init__.py +0 -30
- torchx/pipelines/kfp/adapter.py +0 -274
- torchx/pipelines/kfp/version.py +0 -19
- torchx/schedulers/gcp_batch_scheduler.py +0 -497
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -454
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
torchx/schedulers/kubernetes_scheduler.py

@@ -27,10 +27,81 @@ Install Volcano:
 See the
 `Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
 for more information.
+
+Pod Overlay
+===========
+
+You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+the ``kubernetes`` metadata on your role. The value can be:
+
+- A dict with the overlay structure
+- A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+Merge semantics:
+- **dict**: recursive merge (upsert)
+- **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+- **primitives**: replace
+
+.. code:: python
+
+    from torchx.specs import Role
+
+    # Dict overlay - lists append, tuples replace
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": {
+                "spec": {
+                    "nodeSelector": {"gpu": "true"},
+                    "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}],  # appends
+                    "volumes": ({"name": "my-volume", "emptyDir": {}},)  # replaces
+                }
+            }
+        }
+    )
+
+    # File URI overlay
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": "file:///path/to/pod_overlay.yaml"
+        }
+    )
+
+CLI usage with builtin components:
+
+.. code:: bash
+
+    $ torchx run --scheduler kubernetes dist.ddp \\
+        --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+        --script train.py
+
+Example ``pod_overlay.yaml``:
+
+.. code:: yaml
+
+    spec:
+      nodeSelector:
+        node.kubernetes.io/instance-type: p4d.24xlarge
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      volumes: !!python/tuple
+      - name: my-volume
+        emptyDir: {}
+
+The overlay is deep-merged with the generated pod, preserving existing fields
+and adding or overriding specified ones.
 """
 
 import json
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
@@ -45,6 +116,7 @@ from typing import (
     Tuple,
     TYPE_CHECKING,
     TypedDict,
+    Union,
 )
 
 import torchx
@@ -77,7 +149,6 @@ from torchx.specs.api import (
 from torchx.util.strings import normalize_str
 from torchx.workspace.docker_workspace import DockerWorkspaceMixin
 
-
 if TYPE_CHECKING:
     from docker import DockerClient
     from kubernetes.client import ApiClient, CustomObjectsApi
@@ -87,6 +158,7 @@ if TYPE_CHECKING:
     )
     from kubernetes.client.rest import ApiException
 
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 # Kubernetes reserves a small amount of resources per host for the system. For
@@ -97,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
 RESERVED_MILLICPU = 100
 RESERVED_MEMMB = 1024
 
+
+def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+    """Apply overlay dict to V1Pod object, merging nested fields.
+
+    Merge semantics:
+    - dict: upsert (recursive merge)
+    - list: append by default, replace if tuple
+    - primitives: replace
+    """
+    from kubernetes import client
+
+    api = client.ApiClient()
+    pod_dict = api.sanitize_for_serialization(pod)
+
+    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+        for key, value in overlay.items():
+            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+                deep_merge(base[key], value)
+            elif isinstance(value, tuple):
+                base[key] = list(value)
+            elif (
+                isinstance(value, list) and key in base and isinstance(base[key], list)
+            ):
+                base[key].extend(value)
+            else:
+                base[key] = value
+
+    deep_merge(pod_dict, overlay)
+
+    merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+    pod.spec = merged_pod.spec
+    pod.metadata = merged_pod.metadata
+
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
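Note: the merge rules above (dict: upsert, list: append, tuple: replace) are easiest to see on plain dictionaries. The following sketch is illustrative only and standalone; it mirrors the nested deep_merge helper but is not part of the package, and all values are made up.

    # Standalone illustration of the overlay merge rules.
    from typing import Any, Dict

    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
        for key, value in overlay.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_merge(base[key], value)      # dict: recursive upsert
            elif isinstance(value, tuple):
                base[key] = list(value)           # tuple: replace
            elif isinstance(value, list) and isinstance(base.get(key), list):
                base[key].extend(value)           # list: append
            else:
                base[key] = value                 # primitives / new keys: replace

    pod = {"spec": {"tolerations": [{"key": "a"}], "nodeSelector": {"gpu": "true"}}}
    overlay = {"spec": {"tolerations": [{"key": "b"}], "volumes": ({"name": "v"},)}}
    deep_merge(pod, overlay)
    # pod["spec"]["tolerations"] == [{"key": "a"}, {"key": "b"}]   (list appended)
    # pod["spec"]["volumes"]     == [{"name": "v"}]                (tuple replaced)
    # pod["spec"]["nodeSelector"] is preserved                     (dict upserted)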
@@ -188,7 +294,14 @@ def sanitize_for_serialization(obj: object) -> object:
     return api.sanitize_for_serialization(obj)
 
 
-def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
+def role_to_pod(
+    name: str,
+    role: Role,
+    service_account: Optional[str],
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> "V1Pod":
     from kubernetes.client.models import (  # noqa: F811 redefinition of unused
         V1Container,
         V1ContainerPort,
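Note: a hypothetical call of the extended signature is sketched below. It requires the kubernetes client at call time; the image, entrypoint, and resource values are placeholders, not defaults from the package.

    # Illustrative only: exercise the new keyword arguments of role_to_pod().
    from torchx.schedulers.kubernetes_scheduler import role_to_pod
    from torchx.specs import Resource, Role

    role = Role(
        name="trainer",
        image="my-image:latest",                     # placeholder image
        entrypoint="train.py",
        resource=Resource(cpu=8, gpu=1, memMB=32 * 1024),
    )
    pod = role_to_pod(
        name="trainer-0",
        role=role,
        service_account=None,
        reserved_millicpu=250,   # reserve 250m instead of the 100m default
        reserved_memmb=2048,     # reserve 2048MB instead of the 1024MB default
        efa_device_count=0,      # strip any vpc.amazonaws.com/efa devices
    )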
@@ -218,18 +331,29 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
     if resource.cpu > 0:
         mcpu = int(resource.cpu * 1000)
         limits["cpu"] = f"{mcpu}m"
-        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        request_mcpu = max(mcpu - reserved_millicpu, 0)
         requests["cpu"] = f"{request_mcpu}m"
     if resource.memMB > 0:
         limits["memory"] = f"{int(resource.memMB)}M"
-        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        request_memMB = max(int(resource.memMB) - reserved_memmb, 0)
         requests["memory"] = f"{request_memMB}M"
     if resource.gpu > 0:
         requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
 
+    EFA_DEVICE = "vpc.amazonaws.com/efa"
     for device_name, device_limit in resource.devices.items():
         limits[device_name] = str(device_limit)
 
+    # Handle EFA device count override:
+    # - None (default): use whatever count is in the resource spec (already added above)
+    # - 0: remove EFA devices entirely
+    # - N > 0: set EFA device count to N (override or add)
+    if efa_device_count is not None:
+        if efa_device_count == 0:
+            limits.pop(EFA_DEVICE, None)
+        else:
+            limits[EFA_DEVICE] = str(efa_device_count)
+
     resources = V1ResourceRequirements(
         limits=limits,
         requests=requests,
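Note: the three-way efa_device_count contract can be checked in isolation. This standalone sketch repeats the logic on a bare limits dict and is not the scheduler's own API.

    # None = keep the resource spec, 0 = strip EFA, N > 0 = force N devices.
    from typing import Dict, Optional

    EFA_DEVICE = "vpc.amazonaws.com/efa"

    def apply_efa_override(limits: Dict[str, str], efa_device_count: Optional[int]) -> Dict[str, str]:
        if efa_device_count is not None:
            if efa_device_count == 0:
                limits.pop(EFA_DEVICE, None)
            else:
                limits[EFA_DEVICE] = str(efa_device_count)
        return limits

    print(apply_efa_override({EFA_DEVICE: "4"}, None))  # {'vpc.amazonaws.com/efa': '4'}
    print(apply_efa_override({EFA_DEVICE: "4"}, 0))     # {}
    print(apply_efa_override({}, 2))                    # {'vpc.amazonaws.com/efa': '2'}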
@@ -369,7 +493,10 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
-) -> Dict[str, object]:
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
     the provided AppDef. The resource definition can be used to launch the
@@ -399,8 +526,27 @@ def app_to_resource(
             replica_role = values.apply(role)
             if role_idx == 0 and replica_id == 0:
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
-
-            pod = role_to_pod(name, replica_role, service_account)
+            replica_role.env["TORCHX_IMAGE"] = replica_role.image
+
+            pod = role_to_pod(
+                name,
+                replica_role,
+                service_account,
+                reserved_millicpu,
+                reserved_memmb,
+                efa_device_count,
+            )
+            if k8s_metadata := role.metadata.get("kubernetes"):
+                if isinstance(k8s_metadata, str):
+                    import fsspec
+
+                    with fsspec.open(k8s_metadata, "r") as f:
+                        k8s_metadata = yaml.unsafe_load(f)
+                elif not isinstance(k8s_metadata, dict):
+                    raise ValueError(
+                        f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+                    )
+                _apply_pod_overlay(pod, k8s_metadata)
             pod.metadata.labels.update(
                 pod_labels(
                     app=app,
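Note: when metadata["kubernetes"] is a string, the scheduler resolves it with fsspec before merging, as in the branch above. A minimal sketch of that resolution, assuming fsspec and PyYAML are installed and using a hypothetical local path:

    # Resolve a URI-valued overlay into a dict.
    import fsspec
    import yaml

    overlay_uri = "file:///path/to/pod_overlay.yaml"   # hypothetical path
    with fsspec.open(overlay_uri, "r") as f:
        # unsafe_load keeps the !!python/tuple tag (replace semantics for lists);
        # only use it on overlay files you trust.
        overlay = yaml.unsafe_load(f)

    assert isinstance(overlay, dict)
    # `overlay` can now be merged onto a generated pod via _apply_pod_overlay(pod, overlay).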
@@ -443,7 +589,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
     if priority_class is not None:
         job_spec["priorityClassName"] = priority_class
 
-    resource: Dict[str, object] = {
+    resource: Dict[str, Any] = {
         "apiVersion": "batch.volcano.sh/v1alpha1",
         "kind": "Job",
         "metadata": {"name": f"{unique_app_id}"},
@@ -455,7 +601,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
 @dataclass
 class KubernetesJob:
     images_to_push: Dict[str, Tuple[str, str]]
-    resource: Dict[str, object]
+    resource: Dict[str, Any]
 
     def __str__(self) -> str:
         return yaml.dump(sanitize_for_serialization(self.resource))
@@ -470,12 +616,13 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    validate_spec: Optional[bool]
+    reserved_millicpu: Optional[int]
+    reserved_memmb: Optional[int]
+    efa_device_count: Optional[int]
 
 
-class KubernetesScheduler(
-    DockerWorkspaceMixin,
-    Scheduler[KubernetesOpts, AppDef, AppDryRunInfo[KubernetesJob]],
-):
+class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
     """
     KubernetesScheduler is a TorchX scheduling interface to Kubernetes.
 
@@ -485,7 +632,7 @@ class KubernetesScheduler(
     For installation instructions see: https://github.com/volcano-sh/volcano
 
     This has been confirmed to work with Volcano v1.3.0 and Kubernetes versions
-    v1.18-1.21. See https://github.com/pytorch/torchx/issues/120 which is
+    v1.18-1.21. See https://github.com/meta-pytorch/torchx/issues/120 which is
     tracking Volcano support for Kubernetes v1.22.
 
     .. note::
@@ -503,6 +650,16 @@ class KubernetesScheduler(
         $ torchx status kubernetes://torchx_user/1234
         ...
 
+    **Cancellation**
+
+    Canceling a job aborts it while preserving the job spec for inspection
+    and cloning via kubectl apply. Use the delete command to remove the job entirely:
+
+    .. code-block:: bash
+
+        $ torchx cancel kubernetes://namespace/jobname  # abort, preserves spec
+        $ torchx delete kubernetes://namespace/jobname  # delete completely
+
     **Config Options**
 
     .. runopts::
@@ -581,9 +738,14 @@ class KubernetesScheduler(
         if c is None:
             configuration = client.Configuration()
             try:
-                config.load_kube_config(client_configuration=configuration)
-            except config.ConfigException as e:
-                warnings.warn(f"failed to load kube config: {e}")
+                # Try in-cluster config first (for pods with ServiceAccount)
+                config.load_incluster_config(client_configuration=configuration)
+            except config.ConfigException:
+                # Fall back to kubeconfig (for local development)
+                try:
+                    config.load_kube_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load kube config: {e}", stacklevel=2)
 
             c = self._client = client.ApiClient(configuration)
 
@@ -635,7 +797,7 @@ class KubernetesScheduler(
         else:
             raise
 
-        return f'{namespace}:{resp["metadata"]["name"]}'
+        return f"{namespace}:{resp['metadata']['name']}"
 
     def _submit_dryrun(
         self, app: AppDef, cfg: KubernetesOpts
@@ -657,7 +819,56 @@ class KubernetesScheduler(
             priority_class, str
         ), "priority_class must be a str"
 
-        resource = app_to_resource(app, queue, service_account, priority_class)
+        reserved_millicpu = cfg.get("reserved_millicpu", RESERVED_MILLICPU)
+        assert isinstance(reserved_millicpu, int), "reserved_millicpu must be an int"
+
+        reserved_memmb = cfg.get("reserved_memmb", RESERVED_MEMMB)
+        assert isinstance(reserved_memmb, int), "reserved_memmb must be an int"
+
+        efa_device_count = cfg.get("efa_device_count")
+        assert efa_device_count is None or isinstance(
+            efa_device_count, int
+        ), "efa_device_count must be an int or None"
+
+        resource = app_to_resource(
+            app,
+            queue,
+            service_account,
+            priority_class,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
+
+        if cfg.get("validate_spec"):
+            try:
+                self._custom_objects_api().create_namespaced_custom_object(
+                    group="batch.volcano.sh",
+                    version="v1alpha1",
+                    namespace=cfg.get("namespace") or "default",
+                    plural="jobs",
+                    body=resource,
+                    dry_run="All",
+                )
+            except Exception as e:
+                from kubernetes.client.rest import ApiException
+
+                if isinstance(e, ApiException):
+                    raise ValueError(f"Invalid job spec: {e.reason}") from e
+                raise
+
+        job_name = resource["metadata"]["name"]
+        for task in resource["spec"]["tasks"]:
+            task_name = task["name"]
+            replicas = task.get("replicas", 1)
+            max_index = replicas - 1
+            pod_name = f"{job_name}-{task_name}-{max_index}"
+            if len(pod_name) > 63:
+                raise ValueError(
+                    f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+                    f"Shorten app.name or role names"
+                )
+
         req = KubernetesJob(
             resource=resource,
             images_to_push=images_to_push,
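Note: the new pre-submission check guards the Kubernetes 63-character name limit. The arithmetic is easy to reproduce; the job and role names below are examples only.

    # Longest generated pod name is "{job_name}-{task_name}-{replicas - 1}".
    job_name = "trainer-pgv0qh2cqkp0vd"   # example unique app id
    task_name = "worker"
    replicas = 16

    pod_name = f"{job_name}-{task_name}-{replicas - 1}"
    print(pod_name, len(pod_name))        # trainer-pgv0qh2cqkp0vd-worker-15, 32
    assert len(pod_name) <= 63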
@@ -669,6 +880,31 @@ class KubernetesScheduler(
         pass
 
     def _cancel_existing(self, app_id: str) -> None:
+        """
+        Abort a Volcano job while preserving the spec for inspection.
+        """
+        namespace, name = app_id.split(":")
+        vcjob = self._custom_objects_api().get_namespaced_custom_object(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+        )
+        vcjob["status"]["state"]["phase"] = "Aborted"
+        self._custom_objects_api().replace_namespaced_custom_object_status(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+            body=vcjob,
+        )
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Delete a Volcano job completely from the cluster.
+        """
         namespace, name = app_id.split(":")
         self._custom_objects_api().delete_namespaced_custom_object(
             group="batch.volcano.sh",
@@ -702,19 +938,52 @@ class KubernetesScheduler(
             type_=str,
             help="The name of the PriorityClass to set on the job specs",
         )
+        opts.add(
+            "validate_spec",
+            type_=bool,
+            help="Validate job spec using Kubernetes API dry-run before submission",
+            default=True,
+        )
+        opts.add(
+            "reserved_millicpu",
+            type_=int,
+            help="Amount of CPU in millicores to reserve for Kubernetes system overhead (default: 100)",
+            default=RESERVED_MILLICPU,
+        )
+        opts.add(
+            "reserved_memmb",
+            type_=int,
+            help="Amount of memory in MB to reserve for Kubernetes system overhead (default: 1024)",
+            default=RESERVED_MEMMB,
+        )
+        opts.add(
+            "efa_device_count",
+            type_=int,
+            help="EFA device count override: None/unset=use resource spec, "
+            "0=remove EFA, N>0=set EFA count to N",
+            default=None,
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes import client
+        from kubernetes.client.rest import ApiException
+
         namespace, name = app_id.split(":")
         roles = {}
         roles_statuses = {}
-        resp = self._custom_objects_api().get_namespaced_custom_object_status(
-            group="batch.volcano.sh",
-            version="v1alpha1",
-            namespace=namespace,
-            plural="jobs",
-            name=name,
-        )
+        try:
+            resp = self._custom_objects_api().get_namespaced_custom_object_status(
+                group="batch.volcano.sh",
+                version="v1alpha1",
+                namespace=namespace,
+                plural="jobs",
+                name=name,
+            )
+        except ApiException as e:
+            if e.status == 404:
+                return None
+            raise
         status = resp.get("status")
         if status:
             state_str = status["state"]["phase"]
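Note: the new run options (validate_spec, reserved_millicpu, reserved_memmb, efa_device_count) are ordinary scheduler cfg values. A hedged sketch of programmatic submission follows; the AppDef and cfg values are placeholders, not recommendations.

    # Illustrative submission; component/app values are made up.
    from torchx.runner import get_runner
    from torchx.specs import AppDef, Role

    app = AppDef(
        name="trainer",
        roles=[Role(name="worker", image="my-image:latest", entrypoint="train.py")],
    )

    runner = get_runner()
    app_handle = runner.run(
        app,
        scheduler="kubernetes",
        cfg={
            "queue": "default",
            "namespace": "default",
            "validate_spec": True,    # server-side dry-run before submitting
            "reserved_millicpu": 250,
            "reserved_memmb": 2048,
            "efa_device_count": 0,    # strip EFA devices from generated pods
        },
    )
    print(runner.status(app_handle))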
@@ -723,18 +992,44 @@ class KubernetesScheduler(
             TASK_STATUS_COUNT = "taskStatusCount"
 
             if TASK_STATUS_COUNT in status:
-                for role, status in status[TASK_STATUS_COUNT].items():
-                    role, _, idx = role.rpartition("-")
+                for task_name, task_status in status[TASK_STATUS_COUNT].items():
+                    role, _, idx = task_name.rpartition("-")
 
-                    state_str = next(iter(status["phase"].keys()))
+                    state_str = next(iter(task_status["phase"].keys()))
                     state = TASK_STATE[state_str]
 
                     if role not in roles:
                         roles[role] = Role(name=role, num_replicas=0, image="")
                         roles_statuses[role] = RoleStatus(role, [])
                     roles[role].num_replicas += 1
+
+                    # Pod name follows the pattern: {job_name}-{task_name}-0
+                    # Get the pod to retrieve its IP address
+                    pod_name_k8s = f"{name}-{task_name}-0"
+                    hostname = ""
+                    try:
+                        core_api = client.CoreV1Api(self._api_client())
+                        pod = core_api.read_namespaced_pod(
+                            name=pod_name_k8s, namespace=namespace
+                        )
+                        pod_ip = pod.status.pod_ip
+
+                        if pod_ip is not None:
+                            # Convert IP to dashed format (e.g., 10.244.1.5 -> 10-244-1-5)
+                            pod_ip_dashed = pod_ip.replace(".", "-")
+
+                            # Kubernetes DNS = <pod-ip-dashed>.<namespace>.pod.cluster.local
+                            # Note: This will only be useful if the client using the IPs is in the cluster.
+                            hostname = f"{pod_ip_dashed}.{namespace}.pod.cluster.local"
+
+                    except ApiException:
+                        # Pod not found - hostname remains empty
+                        pass
+
                     roles_statuses[role].replicas.append(
-                        ReplicaStatus(id=int(idx), role=role, state=state, hostname="")
+                        ReplicaStatus(
+                            id=int(idx), role=role, state=state, hostname=hostname
+                        )
                     )
             else:
                 app_state = AppState.UNKNOWN
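Note: the hostname reported per replica is derived from the pod IP using Kubernetes' IP-based pod DNS form; the conversion itself is a one-liner. The IP and namespace below are examples.

    pod_ip = "10.244.1.5"          # example pod IP
    namespace = "default"
    hostname = f"{pod_ip.replace('.', '-')}.{namespace}.pod.cluster.local"
    print(hostname)                # 10-244-1-5.default.pod.cluster.local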
@@ -778,7 +1073,10 @@ class KubernetesScheduler(
         core_api = client.CoreV1Api(self._api_client())
         if should_tail:
             w = watch.Watch()
-            iterator = w.stream(core_api.read_namespaced_pod_log, **args)
+            iterator = (
+                f"{line}\n"
+                for line in w.stream(core_api.read_namespaced_pod_log, **args)
+            )
         else:
             resp = core_api.read_namespaced_pod_log(**args)
             iterator = split_lines(resp)
@@ -823,13 +1121,34 @@ def create_scheduler(
 def pod_labels(
     app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
 ) -> Dict[str, str]:
+
+    def clean(label_value: str) -> str:
+        # cleans the provided `label_value` to make it compliant
+        # to pod label specs as described in
+        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+        #
+        # Valid label value:
+        # must be 63 characters or less (can be empty),
+        # unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+        # could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+        # Replace invalid characters (allow: alphanum, -, _, .) with "."
+        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+        # Replace leading non-alphanumeric with "."
+        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+        # Replace trailing non-alphanumeric with "."
+        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+        # Trim to 63 characters
+        return label_value[:63]
+
     return {
-        LABEL_VERSION: torchx.__version__,
-        LABEL_APP_NAME: app.name,
+        LABEL_VERSION: clean(torchx.__version__),
+        LABEL_APP_NAME: clean(app.name),
         LABEL_ROLE_INDEX: str(role_idx),
-        LABEL_ROLE_NAME: role.name,
+        LABEL_ROLE_NAME: clean(role.name),
         LABEL_REPLICA_ID: str(replica_id),
-        LABEL_KUBE_APP_NAME: app.name,
+        LABEL_KUBE_APP_NAME: clean(app.name),
         LABEL_ORGANIZATION: "torchx.pytorch.org",
-        LABEL_UNIQUE_NAME: app_id,
+        LABEL_UNIQUE_NAME: clean(app_id),
     }
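Note: the label sanitization can be exercised on its own. The standalone copy below repeats the three substitutions plus the trim and shows the effect on a few example values (the version string is made up).

    import re

    def clean(label_value: str) -> str:
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)   # invalid chars -> "."
        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)     # bad leading run -> "."
        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)     # bad trailing run -> "."
        return label_value[:63]                                       # 63-char limit

    print(clean("0.1.2.dev20260111+cu121"))   # 0.1.2.dev20260111.cu121
    print(clean("my app/v2"))                 # my.app.v2
    print(clean("-leading-and-trailing-"))    # .leading-and-trailing.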
torchx/schedulers/local_scheduler.py

@@ -529,7 +529,7 @@ def _register_termination_signals() -> None:
     signal.signal(signal.SIGINT, _terminate_process_handler)
 
 
-class LocalScheduler(Scheduler[LocalOpts, AppDef, AppDryRunInfo[PopenRequest]]):
+class LocalScheduler(Scheduler[LocalOpts]):
     """
     Schedules on localhost. Containers are modeled as processes and
     certain properties of the container that are either not relevant
@@ -1159,6 +1159,7 @@ class LogIterator:
             self._check_finished()  # check to see if app has finished running
 
         if os.path.isfile(self._log_file):
+            time.sleep(0.1)  # fix timing issue
            self._log_fp = open(
                self._log_file,
                mode="rt",