torchx-nightly 2023.10.21__py3-none-any.whl → 2025.12.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/__init__.py +2 -0
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/apps/serve/serve.py +2 -0
- torchx/apps/utils/booth_main.py +2 -0
- torchx/apps/utils/copy_main.py +2 -0
- torchx/apps/utils/process_monitor.py +2 -0
- torchx/cli/__init__.py +2 -0
- torchx/cli/argparse_util.py +38 -3
- torchx/cli/cmd_base.py +2 -0
- torchx/cli/cmd_cancel.py +2 -0
- torchx/cli/cmd_configure.py +2 -0
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_describe.py +2 -0
- torchx/cli/cmd_list.py +8 -4
- torchx/cli/cmd_log.py +6 -24
- torchx/cli/cmd_run.py +269 -45
- torchx/cli/cmd_runopts.py +2 -0
- torchx/cli/cmd_status.py +12 -1
- torchx/cli/cmd_tracker.py +3 -1
- torchx/cli/colors.py +2 -0
- torchx/cli/main.py +4 -0
- torchx/components/__init__.py +3 -8
- torchx/components/component_test_base.py +2 -0
- torchx/components/dist.py +18 -7
- torchx/components/integration_tests/component_provider.py +4 -2
- torchx/components/integration_tests/integ_tests.py +2 -0
- torchx/components/serve.py +2 -0
- torchx/components/structured_arg.py +7 -6
- torchx/components/utils.py +15 -4
- torchx/distributed/__init__.py +2 -4
- torchx/examples/apps/datapreproc/datapreproc.py +2 -0
- torchx/examples/apps/lightning/data.py +5 -3
- torchx/examples/apps/lightning/model.py +7 -6
- torchx/examples/apps/lightning/profiler.py +7 -4
- torchx/examples/apps/lightning/train.py +11 -2
- torchx/examples/torchx_out_of_sync_training.py +11 -0
- torchx/notebook.py +2 -0
- torchx/runner/__init__.py +2 -0
- torchx/runner/api.py +167 -60
- torchx/runner/config.py +43 -10
- torchx/runner/events/__init__.py +57 -13
- torchx/runner/events/api.py +14 -3
- torchx/runner/events/handlers.py +2 -0
- torchx/runtime/tracking/__init__.py +2 -0
- torchx/runtime/tracking/api.py +2 -0
- torchx/schedulers/__init__.py +16 -15
- torchx/schedulers/api.py +70 -14
- torchx/schedulers/aws_batch_scheduler.py +79 -5
- torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
- torchx/schedulers/devices.py +17 -4
- torchx/schedulers/docker_scheduler.py +43 -11
- torchx/schedulers/ids.py +29 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +10 -8
- torchx/schedulers/kubernetes_scheduler.py +383 -38
- torchx/schedulers/local_scheduler.py +100 -27
- torchx/schedulers/lsf_scheduler.py +5 -4
- torchx/schedulers/slurm_scheduler.py +336 -20
- torchx/schedulers/streams.py +2 -0
- torchx/specs/__init__.py +89 -12
- torchx/specs/api.py +431 -32
- torchx/specs/builders.py +176 -38
- torchx/specs/file_linter.py +143 -57
- torchx/specs/finder.py +68 -28
- torchx/specs/named_resources_aws.py +254 -22
- torchx/specs/named_resources_generic.py +2 -0
- torchx/specs/overlays.py +106 -0
- torchx/specs/test/components/__init__.py +2 -0
- torchx/specs/test/components/a/__init__.py +2 -0
- torchx/specs/test/components/a/b/__init__.py +2 -0
- torchx/specs/test/components/a/b/c.py +2 -0
- torchx/specs/test/components/c/__init__.py +2 -0
- torchx/specs/test/components/c/d.py +2 -0
- torchx/tracker/__init__.py +12 -6
- torchx/tracker/api.py +15 -18
- torchx/tracker/backend/fsspec.py +2 -0
- torchx/util/cuda.py +2 -0
- torchx/util/datetime.py +2 -0
- torchx/util/entrypoints.py +39 -15
- torchx/util/io.py +2 -0
- torchx/util/log_tee_helpers.py +210 -0
- torchx/util/modules.py +65 -0
- torchx/util/session.py +42 -0
- torchx/util/shlex.py +2 -0
- torchx/util/strings.py +3 -1
- torchx/util/types.py +90 -29
- torchx/version.py +4 -2
- torchx/workspace/__init__.py +2 -0
- torchx/workspace/api.py +136 -6
- torchx/workspace/dir_workspace.py +2 -0
- torchx/workspace/docker_workspace.py +30 -2
- torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
- torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
- {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
- {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
- torchx/pipelines/kfp/__init__.py +0 -28
- torchx/pipelines/kfp/adapter.py +0 -271
- torchx/pipelines/kfp/version.py +0 -17
- torchx/schedulers/gcp_batch_scheduler.py +0 -487
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -453
- torchx_nightly-2023.10.21.dist-info/METADATA +0 -174
- torchx_nightly-2023.10.21.dist-info/RECORD +0 -118
- {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
torchx/schedulers/kubernetes_scheduler.py

@@ -5,6 +5,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 """
 
 This contains the TorchX Kubernetes scheduler which can be used to run TorchX
@@ -23,12 +25,83 @@ Install Volcano:
     kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.6.0/installer/volcano-development.yaml
 
 See the
-`Volcano Quickstart <https://github.com/volcano-sh/volcano
+`Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
 for more information.
+
+Pod Overlay
+===========
+
+You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+the ``kubernetes`` metadata on your role. The value can be:
+
+- A dict with the overlay structure
+- A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+Merge semantics:
+- **dict**: recursive merge (upsert)
+- **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+- **primitives**: replace
+
+.. code:: python
+
+    from torchx.specs import Role
+
+    # Dict overlay - lists append, tuples replace
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": {
+                "spec": {
+                    "nodeSelector": {"gpu": "true"},
+                    "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}],  # appends
+                    "volumes": ({"name": "my-volume", "emptyDir": {}},)  # replaces
+                }
+            }
+        }
+    )
+
+    # File URI overlay
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": "file:///path/to/pod_overlay.yaml"
+        }
+    )
+
+CLI usage with builtin components:
+
+.. code:: bash
+
+    $ torchx run --scheduler kubernetes dist.ddp \\
+        --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+        --script train.py
+
+Example ``pod_overlay.yaml``:
+
+.. code:: yaml
+
+    spec:
+      nodeSelector:
+        node.kubernetes.io/instance-type: p4d.24xlarge
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      volumes: !!python/tuple
+      - name: my-volume
+        emptyDir: {}
+
+The overlay is deep-merged with the generated pod, preserving existing fields
+and adding or overriding specified ones.
 """
 
 import json
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
@@ -42,12 +115,13 @@ from typing import (
     Optional,
     Tuple,
     TYPE_CHECKING,
+    TypedDict,
+    Union,
 )
 
 import torchx
 import yaml
 from torchx.schedulers.api import (
-    AppDryRunInfo,
     DescribeAppResponse,
     filter_regex,
     ListAppResponse,
@@ -58,6 +132,7 @@ from torchx.schedulers.api import (
 from torchx.schedulers.ids import make_unique
 from torchx.specs.api import (
     AppDef,
+    AppDryRunInfo,
     AppState,
     BindMount,
     CfgVal,
@@ -73,8 +148,6 @@ from torchx.specs.api import (
 )
 from torchx.util.strings import normalize_str
 from torchx.workspace.docker_workspace import DockerWorkspaceMixin
-from typing_extensions import TypedDict
-
 
 if TYPE_CHECKING:
     from docker import DockerClient
@@ -85,6 +158,7 @@ if TYPE_CHECKING:
     )
     from kubernetes.client.rest import ApiException
 
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 # Kubernetes reserves a small amount of resources per host for the system. For
@@ -95,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
 RESERVED_MILLICPU = 100
 RESERVED_MEMMB = 1024
 
+
+def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+    """Apply overlay dict to V1Pod object, merging nested fields.
+
+    Merge semantics:
+    - dict: upsert (recursive merge)
+    - list: append by default, replace if tuple
+    - primitives: replace
+    """
+    from kubernetes import client
+
+    api = client.ApiClient()
+    pod_dict = api.sanitize_for_serialization(pod)
+
+    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+        for key, value in overlay.items():
+            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+                deep_merge(base[key], value)
+            elif isinstance(value, tuple):
+                base[key] = list(value)
+            elif (
+                isinstance(value, list) and key in base and isinstance(base[key], list)
+            ):
+                base[key].extend(value)
+            else:
+                base[key] = value
+
+    deep_merge(pod_dict, overlay)
+
+    merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+    pod.spec = merged_pod.spec
+    pod.metadata = merged_pod.metadata
+
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
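To illustrate the merge semantics implemented by `deep_merge` above, a minimal standalone sketch on plain dicts (hypothetical inputs; not the scheduler's code):

# Minimal sketch of the overlay merge rules (hypothetical inputs):
# dicts merge recursively, lists append, tuples replace, primitives replace.
from typing import Any, Dict

def merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
    for key, value in overlay.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            merge(base[key], value)
        elif isinstance(value, tuple):
            base[key] = list(value)  # tuple -> replace
        elif isinstance(value, list) and isinstance(base.get(key), list):
            base[key].extend(value)  # list -> append
        else:
            base[key] = value  # primitive -> replace

pod = {"spec": {"tolerations": [{"key": "a"}], "volumes": [{"name": "old"}]}}
overlay = {"spec": {"tolerations": [{"key": "b"}], "volumes": ({"name": "new"},)}}
merge(pod, overlay)
assert pod["spec"]["tolerations"] == [{"key": "a"}, {"key": "b"}]  # appended
assert pod["spec"]["volumes"] == [{"name": "new"}]  # replaced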
@@ -167,6 +275,17 @@ ANNOTATION_ISTIO_SIDECAR = "sidecar.istio.io/inject"
 
 LABEL_INSTANCE_TYPE = "node.kubernetes.io/instance-type"
 
+# role.env translates to static env variables in the yaml
+# {"FOO" : "bar"} =====>   - name: FOO
+#                            value: bar
+# unless this placeholder is present at the start of the role.env value then the env variable
+# in the yaml will be dynamically populated at runtime (placeholder is stripped out of the value)
+# {"FOO" : "[FIELD_PATH]bar"} =====>   - name: FOO
+#                                        valueFrom:
+#                                          fieldRef:
+#                                            fieldPath: bar
+PLACEHOLDER_FIELD_PATH = "[FIELD_PATH]"
+
 
 def sanitize_for_serialization(obj: object) -> object:
     from kubernetes import client
@@ -175,13 +294,22 @@ def sanitize_for_serialization(obj: object) -> object:
     return api.sanitize_for_serialization(obj)
 
 
-def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
+def role_to_pod(
+    name: str,
+    role: Role,
+    service_account: Optional[str],
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> "V1Pod":
     from kubernetes.client.models import (  # noqa: F811 redefinition of unused
         V1Container,
         V1ContainerPort,
         V1EmptyDirVolumeSource,
         V1EnvVar,
+        V1EnvVarSource,
         V1HostPathVolumeSource,
+        V1ObjectFieldSelector,
         V1ObjectMeta,
         V1PersistentVolumeClaimVolumeSource,
         V1Pod,
@@ -203,18 +331,29 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
     if resource.cpu > 0:
         mcpu = int(resource.cpu * 1000)
         limits["cpu"] = f"{mcpu}m"
-        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        request_mcpu = max(mcpu - reserved_millicpu, 0)
         requests["cpu"] = f"{request_mcpu}m"
     if resource.memMB > 0:
         limits["memory"] = f"{int(resource.memMB)}M"
-        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        request_memMB = max(int(resource.memMB) - reserved_memmb, 0)
         requests["memory"] = f"{request_memMB}M"
     if resource.gpu > 0:
         requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
 
+    EFA_DEVICE = "vpc.amazonaws.com/efa"
     for device_name, device_limit in resource.devices.items():
         limits[device_name] = str(device_limit)
 
+    # Handle EFA device count override:
+    # - None (default): use whatever count is in the resource spec (already added above)
+    # - 0: remove EFA devices entirely
+    # - N > 0: set EFA device count to N (override or add)
+    if efa_device_count is not None:
+        if efa_device_count == 0:
+            limits.pop(EFA_DEVICE, None)
+        else:
+            limits[EFA_DEVICE] = str(efa_device_count)
+
     resources = V1ResourceRequirements(
         limits=limits,
         requests=requests,
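A worked example of the request/limit arithmetic above, with assumed resource values (4 CPUs, 8192 MB, default reservations) and the EFA override set to 0:

# Illustrative arithmetic for the reserved-overhead and EFA-override logic above
# (assumed inputs; RESERVED_MILLICPU=100 and RESERVED_MEMMB=1024 are the defaults).
cpu, memMB = 4, 8192
reserved_millicpu, reserved_memmb = 100, 1024

mcpu = int(cpu * 1000)                               # 4000
request_mcpu = max(mcpu - reserved_millicpu, 0)      # 3900
request_memMB = max(int(memMB) - reserved_memmb, 0)  # 7168

limits = {"cpu": f"{mcpu}m", "memory": f"{int(memMB)}M", "vpc.amazonaws.com/efa": "4"}
requests = {"cpu": f"{request_mcpu}m", "memory": f"{request_memMB}M"}

# efa_device_count=0 strips the EFA device; N>0 would force the count to N.
efa_device_count = 0
if efa_device_count == 0:
    limits.pop("vpc.amazonaws.com/efa", None)

assert limits == {"cpu": "4000m", "memory": "8192M"}
assert requests == {"cpu": "3900m", "memory": "7168M"}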
@@ -301,9 +440,20 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
         image=role.image,
         name=name,
         env=[
-            V1EnvVar(
-                name=name,
-                value=value,
+            (
+                V1EnvVar(
+                    name=name,
+                    value_from=V1EnvVarSource(
+                        field_ref=V1ObjectFieldSelector(
+                            field_path=value.strip(PLACEHOLDER_FIELD_PATH)
+                        )
+                    ),
+                )
+                if value.startswith(PLACEHOLDER_FIELD_PATH)
+                else V1EnvVar(
+                    name=name,
+                    value=value,
+                )
             )
             for name, value in role.env.items()
         ],
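To illustrate the `[FIELD_PATH]` placeholder handled in the branch above, a small sketch that mirrors the two cases with plain dicts (hypothetical env values; the real code builds V1EnvVar objects):

# Sketch of how role.env entries would serialize (hypothetical values).
PLACEHOLDER_FIELD_PATH = "[FIELD_PATH]"
env = {"STATIC_VAR": "bar", "POD_IP": "[FIELD_PATH]status.podIP"}

rendered = []
for name, value in env.items():
    if value.startswith(PLACEHOLDER_FIELD_PATH):
        # dynamic: resolved at runtime via the Kubernetes downward API
        field_path = value[len(PLACEHOLDER_FIELD_PATH):]
        rendered.append({"name": name, "valueFrom": {"fieldRef": {"fieldPath": field_path}}})
    else:
        # static: plain value baked into the pod spec
        rendered.append({"name": name, "value": value})

assert rendered == [
    {"name": "STATIC_VAR", "value": "bar"},
    {"name": "POD_IP", "valueFrom": {"fieldRef": {"fieldPath": "status.podIP"}}},
]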
@@ -343,7 +493,10 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
-) -> Dict[str, object]:
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
     the provided AppDef. The resource definition can be used to launch the
@@ -373,8 +526,27 @@ def app_to_resource(
             replica_role = values.apply(role)
             if role_idx == 0 and replica_id == 0:
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
-
-            pod = role_to_pod(name, replica_role, service_account)
+            replica_role.env["TORCHX_IMAGE"] = replica_role.image
+
+            pod = role_to_pod(
+                name,
+                replica_role,
+                service_account,
+                reserved_millicpu,
+                reserved_memmb,
+                efa_device_count,
+            )
+            if k8s_metadata := role.metadata.get("kubernetes"):
+                if isinstance(k8s_metadata, str):
+                    import fsspec
+
+                    with fsspec.open(k8s_metadata, "r") as f:
+                        k8s_metadata = yaml.unsafe_load(f)
+                elif not isinstance(k8s_metadata, dict):
+                    raise ValueError(
+                        f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+                    )
+                _apply_pod_overlay(pod, k8s_metadata)
             pod.metadata.labels.update(
                 pod_labels(
                     app=app,
@@ -417,7 +589,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
     if priority_class is not None:
         job_spec["priorityClassName"] = priority_class
 
-    resource: Dict[str, object] = {
+    resource: Dict[str, Any] = {
         "apiVersion": "batch.volcano.sh/v1alpha1",
         "kind": "Job",
         "metadata": {"name": f"{unique_app_id}"},
@@ -429,7 +601,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
 @dataclass
 class KubernetesJob:
     images_to_push: Dict[str, Tuple[str, str]]
-    resource: Dict[str, object]
+    resource: Dict[str, Any]
 
     def __str__(self) -> str:
         return yaml.dump(sanitize_for_serialization(self.resource))
@@ -444,6 +616,10 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    validate_spec: Optional[bool]
+    reserved_millicpu: Optional[int]
+    reserved_memmb: Optional[int]
+    efa_device_count: Optional[int]
 
 
 class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
@@ -456,7 +632,7 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
     For installation instructions see: https://github.com/volcano-sh/volcano
 
     This has been confirmed to work with Volcano v1.3.0 and Kubernetes versions
-    v1.18-1.21. See https://github.com/pytorch/torchx/issues/120 which is
+    v1.18-1.21. See https://github.com/meta-pytorch/torchx/issues/120 which is
     tracking Volcano support for Kubernetes v1.22.
 
     .. note::
@@ -474,6 +650,16 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
         $ torchx status kubernetes://torchx_user/1234
         ...
 
+    **Cancellation**
+
+    Canceling a job aborts it while preserving the job spec for inspection
+    and cloning via kubectl apply. Use the delete command to remove the job entirely:
+
+    .. code-block:: bash
+
+        $ torchx cancel kubernetes://namespace/jobname  # abort, preserves spec
+        $ torchx delete kubernetes://namespace/jobname  # delete completely
+
     **Config Options**
 
     .. runopts::
@@ -552,9 +738,14 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
         if c is None:
             configuration = client.Configuration()
             try:
-                config.load_kube_config(client_configuration=configuration)
-            except config.ConfigException as e:
-                warnings.warn(f"failed to load kube config: {e}")
+                # Try in-cluster config first (for pods with ServiceAccount)
+                config.load_incluster_config(client_configuration=configuration)
+            except config.ConfigException:
+                # Fall back to kubeconfig (for local development)
+                try:
+                    config.load_kube_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load kube config: {e}", stacklevel=2)
 
             c = self._client = client.ApiClient(configuration)
 
@@ -606,7 +797,7 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
         else:
             raise
 
-        return f
+        return f"{namespace}:{resp['metadata']['name']}"
 
     def _submit_dryrun(
         self, app: AppDef, cfg: KubernetesOpts
@@ -628,18 +819,92 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
             priority_class, str
         ), "priority_class must be a str"
 
-        resource = app_to_resource(app, queue, service_account, priority_class)
+        reserved_millicpu = cfg.get("reserved_millicpu", RESERVED_MILLICPU)
+        assert isinstance(reserved_millicpu, int), "reserved_millicpu must be an int"
+
+        reserved_memmb = cfg.get("reserved_memmb", RESERVED_MEMMB)
+        assert isinstance(reserved_memmb, int), "reserved_memmb must be an int"
+
+        efa_device_count = cfg.get("efa_device_count")
+        assert efa_device_count is None or isinstance(
+            efa_device_count, int
+        ), "efa_device_count must be an int or None"
+
+        resource = app_to_resource(
+            app,
+            queue,
+            service_account,
+            priority_class,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
+
+        if cfg.get("validate_spec"):
+            try:
+                self._custom_objects_api().create_namespaced_custom_object(
+                    group="batch.volcano.sh",
+                    version="v1alpha1",
+                    namespace=cfg.get("namespace") or "default",
+                    plural="jobs",
+                    body=resource,
+                    dry_run="All",
+                )
+            except Exception as e:
+                from kubernetes.client.rest import ApiException
+
+                if isinstance(e, ApiException):
+                    raise ValueError(f"Invalid job spec: {e.reason}") from e
+                raise
+
+        job_name = resource["metadata"]["name"]
+        for task in resource["spec"]["tasks"]:
+            task_name = task["name"]
+            replicas = task.get("replicas", 1)
+            max_index = replicas - 1
+            pod_name = f"{job_name}-{task_name}-{max_index}"
+            if len(pod_name) > 63:
+                raise ValueError(
+                    f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+                    f"Shorten app.name or role names"
+                )
+
         req = KubernetesJob(
             resource=resource,
             images_to_push=images_to_push,
         )
         return AppDryRunInfo(req, repr)
 
-    def _validate(self, app: AppDef, scheduler: str) -> None:
+    def _validate(self, app: AppDef, scheduler: str, cfg: KubernetesOpts) -> None:
         # Skip validation step
         pass
 
     def _cancel_existing(self, app_id: str) -> None:
+        """
+        Abort a Volcano job while preserving the spec for inspection.
+        """
+        namespace, name = app_id.split(":")
+        vcjob = self._custom_objects_api().get_namespaced_custom_object(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+        )
+        vcjob["status"]["state"]["phase"] = "Aborted"
+        self._custom_objects_api().replace_namespaced_custom_object_status(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+            body=vcjob,
+        )
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Delete a Volcano job completely from the cluster.
+        """
         namespace, name = app_id.split(":")
         self._custom_objects_api().delete_namespaced_custom_object(
             group="batch.volcano.sh",
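The pod-name length check above exists because Kubernetes object names are capped at 63 characters; a quick sketch of the arithmetic with hypothetical names:

# Illustrative only: pod names are "{job_name}-{task_name}-{index}" and must fit
# the 63-character limit (all names below are made up).
job_name = "trainer-experiment-with-a-very-long-descriptive-name-pqx3k"
task_name = "worker"
max_index = 7  # replicas - 1
pod_name = f"{job_name}-{task_name}-{max_index}"
assert len(pod_name) > 63  # the dry-run check above would raise ValueError here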
@@ -673,19 +938,52 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
             type_=str,
             help="The name of the PriorityClass to set on the job specs",
         )
+        opts.add(
+            "validate_spec",
+            type_=bool,
+            help="Validate job spec using Kubernetes API dry-run before submission",
+            default=True,
+        )
+        opts.add(
+            "reserved_millicpu",
+            type_=int,
+            help="Amount of CPU in millicores to reserve for Kubernetes system overhead (default: 100)",
+            default=RESERVED_MILLICPU,
+        )
+        opts.add(
+            "reserved_memmb",
+            type_=int,
+            help="Amount of memory in MB to reserve for Kubernetes system overhead (default: 1024)",
+            default=RESERVED_MEMMB,
+        )
+        opts.add(
+            "efa_device_count",
+            type_=int,
+            help="EFA device count override: None/unset=use resource spec, "
+            "0=remove EFA, N>0=set EFA count to N",
+            default=None,
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes import client
+        from kubernetes.client.rest import ApiException
+
         namespace, name = app_id.split(":")
         roles = {}
         roles_statuses = {}
-        resp = self._custom_objects_api().get_namespaced_custom_object_status(
-            group="batch.volcano.sh",
-            version="v1alpha1",
-            namespace=namespace,
-            plural="jobs",
-            name=name,
-        )
+        try:
+            resp = self._custom_objects_api().get_namespaced_custom_object_status(
+                group="batch.volcano.sh",
+                version="v1alpha1",
+                namespace=namespace,
+                plural="jobs",
+                name=name,
+            )
+        except ApiException as e:
+            if e.status == 404:
+                return None
+            raise
         status = resp.get("status")
         if status:
             state_str = status["state"]["phase"]
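The new run options line up with the KubernetesOpts keys read in `_submit_dryrun` above; a sketch of a cfg dict using those keys (all values are assumptions for illustration):

# Hypothetical scheduler cfg exercising the options registered above.
cfg = {
    "queue": "default",            # Volcano queue
    "namespace": "ml-team",        # assumed namespace
    "validate_spec": True,         # server-side dry-run validation before submit
    "reserved_millicpu": 250,      # reserve 250m CPU per pod for system overhead
    "reserved_memmb": 2048,        # reserve 2048 MB of memory per pod
    "efa_device_count": 0,         # strip EFA devices from the resource spec
}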
@@ -694,18 +992,44 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
             TASK_STATUS_COUNT = "taskStatusCount"
 
             if TASK_STATUS_COUNT in status:
-                for
-                    role, _, idx =
+                for task_name, task_status in status[TASK_STATUS_COUNT].items():
+                    role, _, idx = task_name.rpartition("-")
 
-                    state_str = next(iter(
+                    state_str = next(iter(task_status["phase"].keys()))
                     state = TASK_STATE[state_str]
 
                     if role not in roles:
                         roles[role] = Role(name=role, num_replicas=0, image="")
                         roles_statuses[role] = RoleStatus(role, [])
                     roles[role].num_replicas += 1
+
+                    # Pod name follows the pattern: {job_name}-{task_name}-0
+                    # Get the pod to retrieve its IP address
+                    pod_name_k8s = f"{name}-{task_name}-0"
+                    hostname = ""
+                    try:
+                        core_api = client.CoreV1Api(self._api_client())
+                        pod = core_api.read_namespaced_pod(
+                            name=pod_name_k8s, namespace=namespace
+                        )
+                        pod_ip = pod.status.pod_ip
+
+                        if pod_ip is not None:
+                            # Convert IP to dashed format (e.g., 10.244.1.5 -> 10-244-1-5)
+                            pod_ip_dashed = pod_ip.replace(".", "-")
+
+                            # Kubernetes DNS = <pod-ip-dashed>.<namespace>.pod.cluster.local
+                            # Note: This will only be useful if the client using the IPs is in the cluster.
+                            hostname = f"{pod_ip_dashed}.{namespace}.pod.cluster.local"
+
+                    except ApiException:
+                        # Pod not found - hostname remains empty
+                        pass
+
                     roles_statuses[role].replicas.append(
-                        ReplicaStatus(
+                        ReplicaStatus(
+                            id=int(idx), role=role, state=state, hostname=hostname
+                        )
                     )
         else:
             app_state = AppState.UNKNOWN
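A small sketch of the pod-IP-to-DNS mapping used for the replica hostname above (assumed IP and namespace):

# Illustrative only: Kubernetes pod DNS names are the pod IP with dots replaced
# by dashes, qualified with "<namespace>.pod.cluster.local" (values assumed).
pod_ip = "10.244.1.5"
namespace = "default"
hostname = f"{pod_ip.replace('.', '-')}.{namespace}.pod.cluster.local"
assert hostname == "10-244-1-5.default.pod.cluster.local"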
@@ -794,13 +1118,34 @@ def create_scheduler(
 def pod_labels(
     app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
 ) -> Dict[str, str]:
+
+    def clean(label_value: str) -> str:
+        # cleans the provided `label_value` to make it compliant
+        # to pod label specs as described in
+        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+        #
+        # Valid label value:
+        # must be 63 characters or less (can be empty),
+        # unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+        # could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+        # Replace invalid characters (allow: alphanum, -, _, .) with "."
+        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+        # Replace leading non-alphanumeric with "."
+        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+        # Replace trailing non-alphanumeric with "."
+        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+        # Trim to 63 characters
+        return label_value[:63]
+
     return {
-        LABEL_VERSION: torchx.__version__,
-        LABEL_APP_NAME: app.name,
+        LABEL_VERSION: clean(torchx.__version__),
+        LABEL_APP_NAME: clean(app.name),
         LABEL_ROLE_INDEX: str(role_idx),
-        LABEL_ROLE_NAME: role.name,
+        LABEL_ROLE_NAME: clean(role.name),
         LABEL_REPLICA_ID: str(replica_id),
-        LABEL_KUBE_APP_NAME: app.name,
+        LABEL_KUBE_APP_NAME: clean(app.name),
         LABEL_ORGANIZATION: "torchx.pytorch.org",
-        LABEL_UNIQUE_NAME: app_id,
+        LABEL_UNIQUE_NAME: clean(app_id),
     }