xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/__init__.py +15 -0
- integration/docker_manager_test.py +102 -0
- integration/gcluster_a3mega_test.py +204 -0
- integration/gcluster_a3ultra_test.py +176 -0
- integration/gcluster_a4_test.py +176 -0
- integration/gcluster_test.py +107 -0
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +143 -117
- xpk/commands/cluster_gcluster.py +81 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/cluster_test.py +92 -0
- xpk/commands/common.py +14 -26
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +39 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +84 -29
- xpk/commands/workload_test.py +81 -0
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/blueprint/testing/__init__.py +15 -0
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +423 -0
- xpk/core/kueue_manager_test.py +574 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +350 -232
- xpk/core/system_characteristics_test.py +73 -0
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/templates/cluster_preheat.yaml.j2 +31 -0
- xpk/templates/filestore-pv.yaml +17 -0
- xpk/templates/filestore-pvc.yaml +11 -0
- xpk/templates/filestore-sc.yaml +10 -0
- xpk/templates/fuse-pv.yaml +17 -0
- xpk/templates/fuse-pvc.yaml +13 -0
- xpk/templates/kueue_config.yaml.j2 +95 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
- xpk/templates/mtc-cpc.yaml +15 -0
- xpk/templates/volume_bundle.yaml +7 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +15 -0
- xpk/utils/topology.py +46 -0
- xpk/utils/topology_test.py +63 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
- xpk-0.14.1.dist-info/RECORD +133 -0
- xpk-0.14.1.dist-info/top_level.txt +2 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- xpk-0.13.0.dist-info/top_level.txt +0 -1
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
{% for flavor in flavors %}
|
|
2
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
3
|
+
kind: ResourceFlavor
|
|
4
|
+
metadata:
|
|
5
|
+
name: "{{ flavor.name }}"
|
|
6
|
+
spec:
|
|
7
|
+
nodeLabels: {{ flavor.nodeLabels | tojson }}
|
|
8
|
+
{% if flavor.topologyLabel %}
|
|
9
|
+
{{ flavor.topologyLabel }}
|
|
10
|
+
{% endif %}
|
|
11
|
+
---
|
|
12
|
+
{% endfor %}
|
|
13
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
14
|
+
kind: AdmissionCheck
|
|
15
|
+
metadata:
|
|
16
|
+
name: dws-prov
|
|
17
|
+
spec:
|
|
18
|
+
controllerName: kueue.x-k8s.io/provisioning-request
|
|
19
|
+
parameters:
|
|
20
|
+
apiGroup: kueue.x-k8s.io
|
|
21
|
+
kind: ProvisioningRequestConfig
|
|
22
|
+
name: dws-config
|
|
23
|
+
---
|
|
24
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
25
|
+
kind: ProvisioningRequestConfig
|
|
26
|
+
metadata:
|
|
27
|
+
name: dws-config
|
|
28
|
+
spec:
|
|
29
|
+
provisioningClassName: queued-provisioning.gke.io
|
|
30
|
+
podSetUpdates:
|
|
31
|
+
nodeSelector:
|
|
32
|
+
- key: autoscaling.gke.io/provisioning-request
|
|
33
|
+
valueFromProvisioningClassDetail: ResizeRequestName
|
|
34
|
+
managedResources:
|
|
35
|
+
- {{ managed_resource }}
|
|
36
|
+
---
|
|
37
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
38
|
+
kind: ClusterQueue
|
|
39
|
+
metadata:
|
|
40
|
+
name: "{{ cluster_queue_name }}"
|
|
41
|
+
spec:
|
|
42
|
+
preemption:
|
|
43
|
+
reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
|
|
44
|
+
withinClusterQueue: LowerPriority
|
|
45
|
+
namespaceSelector: {} # match all.
|
|
46
|
+
resourceGroups: {{ resource_groups }}
|
|
47
|
+
{{ admission_checks | indent(2) }}
|
|
48
|
+
---
|
|
49
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
50
|
+
kind: LocalQueue
|
|
51
|
+
metadata:
|
|
52
|
+
namespace: default
|
|
53
|
+
name: {{ local_queue_name }}
|
|
54
|
+
spec:
|
|
55
|
+
clusterQueue: {{ cluster_queue_name }}
|
|
56
|
+
---
|
|
57
|
+
apiVersion: scheduling.k8s.io/v1
|
|
58
|
+
kind: PriorityClass
|
|
59
|
+
metadata:
|
|
60
|
+
name: very-low
|
|
61
|
+
value: 100
|
|
62
|
+
globalDefault: false
|
|
63
|
+
description: "Very Low"
|
|
64
|
+
---
|
|
65
|
+
apiVersion: scheduling.k8s.io/v1
|
|
66
|
+
kind: PriorityClass
|
|
67
|
+
metadata:
|
|
68
|
+
name: low
|
|
69
|
+
value: 250
|
|
70
|
+
globalDefault: false
|
|
71
|
+
description: "Low"
|
|
72
|
+
---
|
|
73
|
+
apiVersion: scheduling.k8s.io/v1
|
|
74
|
+
kind: PriorityClass
|
|
75
|
+
metadata:
|
|
76
|
+
name: medium
|
|
77
|
+
value: 500
|
|
78
|
+
globalDefault: false
|
|
79
|
+
description: "Medium"
|
|
80
|
+
---
|
|
81
|
+
apiVersion: scheduling.k8s.io/v1
|
|
82
|
+
kind: PriorityClass
|
|
83
|
+
metadata:
|
|
84
|
+
name: high
|
|
85
|
+
value: 750
|
|
86
|
+
globalDefault: false
|
|
87
|
+
description: "High"
|
|
88
|
+
---
|
|
89
|
+
apiVersion: scheduling.k8s.io/v1
|
|
90
|
+
kind: PriorityClass
|
|
91
|
+
metadata:
|
|
92
|
+
name: very-high
|
|
93
|
+
value: 1000
|
|
94
|
+
globalDefault: false
|
|
95
|
+
description: "Very High"
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
2
|
+
kind: Topology
|
|
3
|
+
metadata:
|
|
4
|
+
name: "gke-default"
|
|
5
|
+
spec:
|
|
6
|
+
levels:
|
|
7
|
+
- nodeLabel: "cloud.google.com/gce-topology-block"
|
|
8
|
+
- nodeLabel: "cloud.google.com/gce-topology-subblock"
|
|
9
|
+
- nodeLabel: "cloud.google.com/gce-topology-host"
|
|
10
|
+
- nodeLabel: "kubernetes.io/hostname"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
2
|
+
kind: Topology
|
|
3
|
+
metadata:
|
|
4
|
+
name: {{ sub_slice_topology_name }}
|
|
5
|
+
spec:
|
|
6
|
+
levels:
|
|
7
|
+
- nodeLabel: "cloud.google.com/gke-tpu-slice-16x16-id"
|
|
8
|
+
- nodeLabel: "cloud.google.com/gke-tpu-slice-8x16-id"
|
|
9
|
+
- nodeLabel: "cloud.google.com/gke-tpu-slice-8x8-id"
|
|
10
|
+
- nodeLabel: "cloud.google.com/gke-tpu-slice-4x8-id"
|
|
11
|
+
- nodeLabel: "cloud.google.com/gke-tpu-slice-4x4-id"
|
|
12
|
+
- nodeLabel: "cloud.google.com/gke-tpu-slice-2x4-id"
|
|
13
|
+
- nodeLabel: "cloud.google.com/gke-tpu-slice-2x2-id"
|
|
14
|
+
- nodeLabel: "kubernetes.io/hostname"
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
apiVersion: checkpointing.gke.io/v1
|
|
2
|
+
kind: CheckpointConfiguration
|
|
3
|
+
metadata:
|
|
4
|
+
name: my-checkpointconfiguration
|
|
5
|
+
spec:
|
|
6
|
+
cloudStorageBucketName:
|
|
7
|
+
# This field is optional
|
|
8
|
+
nodeSelector:
|
|
9
|
+
node.kubernetes.io/instance-type:
|
|
10
|
+
# This field is optional
|
|
11
|
+
tolerations:
|
|
12
|
+
- key:
|
|
13
|
+
operator: Exists
|
|
14
|
+
effect: NoSchedule
|
|
15
|
+
inMemoryVolumeSize:
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _get_boolean_flag(flag: str, default: bool) -> bool:
  """Reads the environment variable `flag` as a boolean.

  Args:
    flag: name of the environment variable to look up.
    default: value whose string form is used when the variable is unset.

  Returns:
    True only when the resolved text equals "true", compared
    case-insensitively; any other text yields False.
  """
  raw_value = os.getenv(flag)
  if raw_value is None:
    # Unset variable: fall back to the textual form of the default.
    raw_value = str(default)
  return raw_value.lower() == "true"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class _FeatureFlags:
  """Holds feature toggles resolved from environment variables.

  Every flag is evaluated once, when this class body executes.
  """

  # False unless the SUB_SLICING_ENABLED environment variable is "true"
  # (compared case-insensitively by _get_boolean_flag).
  SUB_SLICING_ENABLED: bool = _get_boolean_flag(
      "SUB_SLICING_ENABLED", default=False
  )
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
FeatureFlags = _FeatureFlags()
|
xpk/utils/kueue.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def is_queued_cluster(num_slices: int) -> bool:
  """Tells whether admission checks should be enabled and the cluster queued.

  Args:
    num_slices: number of slices requested for the cluster.

  Returns:
    True when at most one slice is requested, False otherwise.
  """
  single_slice_limit = 1
  return num_slices <= single_slice_limit
|
xpk/utils/templates.py
CHANGED
|
@@ -18,6 +18,8 @@ import os
|
|
|
18
18
|
|
|
19
19
|
import ruamel.yaml
|
|
20
20
|
|
|
21
|
+
TEMPLATE_PATH = "templates"
|
|
22
|
+
|
|
21
23
|
yaml = ruamel.yaml.YAML()
|
|
22
24
|
|
|
23
25
|
|
|
@@ -26,3 +28,16 @@ def load(path: str) -> dict:
|
|
|
26
28
|
with open(template_path, "r", encoding="utf-8") as file:
|
|
27
29
|
data: dict = yaml.load(file)
|
|
28
30
|
return data
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_templates_absolute_path(templates_path: str = TEMPLATE_PATH) -> str:
  """
  Build the absolute path to the templates folder.

  Args:
    templates_path: path of the templates folder, relative to the directory
      that contains this module's own directory.

  Returns:
    `templates_path` joined onto that parent directory.
  """
  # This module's directory, then one level up to the package directory.
  utils_dir = os.path.dirname(os.path.abspath(__file__))
  package_root = os.path.dirname(utils_dir)
  return os.path.join(package_root, templates_path)
|
xpk/utils/topology.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from functools import reduce
|
|
18
|
+
from operator import mul
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def is_topology_valid(topology: str) -> bool:
  """Reports whether `topology` can be parsed by parse_topology.

  Args:
    topology: candidate topology string, e.g. "2x2x4".

  Returns:
    True when parsing succeeds, False when it raises ValueError.
  """
  try:
    parse_topology(topology)
  except ValueError:
    return False
  else:
    return True
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_topology_product(topology: str) -> int:
  """Multiplies together all dimensions of a topology string.

  Args:
    topology: topology string accepted by parse_topology, e.g. "1x2x3".

  Returns:
    The product of the parsed dimensions ("1x2x3" gives 6).
  """
  product = 1
  for dimension in parse_topology(topology):
    product *= dimension
  return product
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def parse_topology(topology: str) -> list[int]:
  """Parses a topology string such as "2x2x4" into its dimensions.

  Args:
    topology: dimension sizes joined by "x" (upper or lower case).

  Returns:
    The dimension sizes as integers, in the order they were written.

  Raises:
    ValueError: if the string is empty, a dimension is not an integer, or a
      dimension is not strictly positive.
  """
  if not topology:
    raise ValueError("Topology is an empty string")

  dimensions = []
  for part in topology.lower().split("x"):
    try:
      size = int(part)
    except ValueError as err:
      raise ValueError(
          f"Topology {topology!r} has a non-integer dimension {part!r}"
      ) from err
    # A zero or negative size cannot describe a real dimension; rejecting it
    # here keeps is_topology_valid() from accepting strings like "0x2".
    if size <= 0:
      raise ValueError(
          f"Topology {topology!r} has a non-positive dimension {size}"
      )
    dimensions.append(size)
  return dimensions
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def is_topology_contained(contained: str, container: str) -> bool:
  """Checks whether one topology fits inside another, dimension by dimension.

  Args:
    contained: topology string expected to fit inside `container`.
    container: topology string expected to be at least as large.

  Returns:
    True when both topologies have the same number of dimensions and every
    dimension of `contained` is no larger than the matching dimension of
    `container`; False otherwise.
  """
  inner_dims = parse_topology(contained)
  outer_dims = parse_topology(container)
  if len(inner_dims) != len(outer_dims):
    return False
  for inner_dim, outer_dim in zip(inner_dims, outer_dims):
    if inner_dim > outer_dim:
      return False
  return True
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import pytest
|
|
18
|
+
from .topology import is_topology_valid, get_topology_product, parse_topology, is_topology_contained
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_is_topology_valid_with_invalid_topology():
  # "N/A" contains no integer dimensions, so it must be rejected.
  assert is_topology_valid("N/A") is False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_is_topology_valid_with_valid_topology():
  # Three integer dimensions joined by "x" form a well-formed topology.
  assert is_topology_valid("1x1x1") is True
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_parse_topology_with_valid_topology():
  # Dimensions come back as integers in the order they were written.
  expected_dimensions = [1, 2, 3]
  assert parse_topology("1x2x3") == expected_dimensions
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_parse_topology_with_empty_input():
  # An empty string is rejected with ValueError.
  empty_topology = ""
  with pytest.raises(ValueError):
    parse_topology(empty_topology)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_get_topology_product():
  # 1 * 2 * 3 == 6
  assert get_topology_product("1x2x3") == 6
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_is_topology_contained_with_container_smaller_than_contained_returns_false():
  # Every dimension of the contained topology exceeds the container's.
  fits = is_topology_contained(contained="3x3x3", container="2x2x2")
  assert fits is False
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def test_is_topology_contained_with_container_larger_than_contained_returns_true():
  # Every dimension of the contained topology is below the container's.
  fits = is_topology_contained(contained="1x1x1", container="2x2x2")
  assert fits is True
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_is_topology_contained_with_container_equal_to_contained_returns_true():
  # Equal dimensions count as contained.
  same_topology = "2x2x2"
  assert (
      is_topology_contained(contained=same_topology, container=same_topology)
      is True
  )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_is_topology_contained_with_different_topologies_dimensions_returns_false():
  # A 2-dimensional topology never fits a 3-dimensional one.
  fits = is_topology_contained(contained="2x2", container="2x2x2")
  assert fits is False
|
xpk/utils/validation.py
CHANGED
|
@@ -15,66 +15,90 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from ..core.commands import run_command_for_value
|
|
18
|
+
from ..core.config import __version__ as xpk_version
|
|
18
19
|
from .console import xpk_exit, xpk_print
|
|
19
20
|
from ..commands.config import xpk_cfg
|
|
20
21
|
from ..core.config import DEPENDENCIES_KEY
|
|
21
|
-
from
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
22
|
+
from enum import Enum
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class _SystemDependency:
  """Describes how to probe for one external tool."""

  # Shell command run to check that the tool is installed.
  command: str
  # Text printed to the user when that command fails.
  message: str
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SystemDependency(Enum):
  """Represents required system dependencies.

  Each member's value pairs the command used to probe for the tool with the
  message printed when that command fails.
  """

  KUBECTL = _SystemDependency(
      command='kubectl --help',
      message=(
          '`kubectl` not installed. Please follow'
          ' https://github.com/AI-Hypercomputer/xpk?tab=readme-ov-file#prerequisites'
          ' to install xpk prerequisites.'
      ),
  )
  KJOB = _SystemDependency(
      command='kubectl kjob --help',
      message=(
          '`kjobctl` not installed. Please follow'
          ' https://github.com/AI-Hypercomputer/xpk?tab=readme-ov-file#prerequisites'
          ' to install xpk prerequisites.'
      ),
  )
  GCLOUD = _SystemDependency(
      command='gcloud version',
      message=(
          # Closing backtick added so the tool name renders like the others.
          '`gcloud` not installed. Please follow'
          ' https://github.com/AI-Hypercomputer/xpk?tab=readme-ov-file#prerequisites'
          ' to install xpk prerequisites.'
      ),
  )
  DOCKER = _SystemDependency(
      command='docker version',
      message=(
          '`docker` not installed. Please follow'
          ' https://github.com/AI-Hypercomputer/xpk?tab=readme-ov-file#prerequisites'
          ' to install xpk prerequisites.'
      ),
  )
  KUEUECTL = _SystemDependency(
      command='kubectl kueue --help',
      message=(
          '`kueuectl` not installed. Please follow'
          ' https://github.com/AI-Hypercomputer/xpk?tab=readme-ov-file#prerequisites'
          ' to install xpk prerequisites.'
      ),
  )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def should_validate_dependencies(args):
  """Decides whether dependency validation should run for these arguments.

  Args:
    args: parsed arguments; `skip_validation` and `dry_run` are each read
      only when present.

  Returns:
    False when either `skip_validation` or `dry_run` is present and truthy,
    True otherwise.
  """
  if 'skip_validation' in args and args.skip_validation:
    return False
  if 'dry_run' in args and args.dry_run:
    return False
  return True
|
|
66
81
|
|
|
67
82
|
|
|
68
83
|
def validate_dependencies():
  """Validates all system dependencies unless already done for this XPK version.

  The version stored under DEPENDENCIES_KEY in the xpk config is compared with
  the running XPK version; on a match nothing is run. Otherwise every
  SystemDependency member is validated and the running version is stored.
  """
  validated_version = xpk_cfg.get(DEPENDENCIES_KEY)
  already_validated = (
      validated_version is not None and validated_version == xpk_version
  )
  if already_validated:
    return
  validate_dependencies_list(list(SystemDependency))
  xpk_cfg.set(DEPENDENCIES_KEY, xpk_version)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def validate_dependencies_list(dependencies: list[SystemDependency]):
  """Validates each listed system dependency in order.

  Args:
    dependencies: the dependencies to check.

  Returns nothing; a failing dependency ends the run through
  _validate_dependency.
  """
  for required_tool in dependencies:
    _validate_dependency(required_tool)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _validate_dependency(dependency: SystemDependency) -> None:
  """Runs one dependency's probe command; prints its message and exits on failure.

  Args:
    dependency: the dependency whose probe command is executed.
  """
  probe = dependency.value
  return_code, _ = run_command_for_value(
      probe.command, f'Validate {dependency.name} installation.'
  )
  if return_code == 0:
    return
  # Non-zero exit: show the install hint, then exit with the same code.
  xpk_print(probe.message)
  xpk_exit(return_code)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import pytest
|
|
18
|
+
from .validation import validate_dependencies_list, SystemDependency
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_validate_dependencies_list_returns_nothing_for_successful_validation(
    mocker,
):
  # Make the probe command report exit code 0.
  successful_run = (0, '')
  mocker.patch(
      'xpk.utils.validation.run_command_for_value',
      return_value=successful_run,
  )
  validate_dependencies_list([SystemDependency.DOCKER])
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_validate_dependencies_list_exits_with_error_for_failed_validation(
    mocker,
):
  # Make the probe command report a non-zero exit code.
  failed_run = (1, '')
  mocker.patch(
      'xpk.utils.validation.run_command_for_value',
      return_value=failed_run,
  )
  with pytest.raises(SystemExit):
    validate_dependencies_list([SystemDependency.DOCKER])
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 0.13.0
|
|
3
|
+
Version: 0.14.1
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
5
|
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -22,15 +22,18 @@ Requires-Dist: google-api-core==2.24.1
|
|
|
22
22
|
Requires-Dist: packaging==24.2
|
|
23
23
|
Requires-Dist: google-cloud-filestore==1.12.0
|
|
24
24
|
Requires-Dist: google-cloud-storage
|
|
25
|
+
Requires-Dist: Jinja2==3.1.6
|
|
25
26
|
Provides-Extra: dev
|
|
26
27
|
Requires-Dist: pyink==24.3.0; extra == "dev"
|
|
27
28
|
Requires-Dist: pylint>=2.6.0; extra == "dev"
|
|
28
29
|
Requires-Dist: pre-commit; extra == "dev"
|
|
29
30
|
Requires-Dist: pytest; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-mock==3.15.1; extra == "dev"
|
|
30
32
|
Requires-Dist: docker==7.1.0; extra == "dev"
|
|
31
33
|
Requires-Dist: mypy~=1.17; extra == "dev"
|
|
32
34
|
Requires-Dist: types-PyYAML==6.0.2; extra == "dev"
|
|
33
35
|
Requires-Dist: types-docker~=7.1.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: pylint-per-file-ignores==1.4.0; extra == "dev"
|
|
34
37
|
Dynamic: license-file
|
|
35
38
|
|
|
36
39
|
<!--
|
|
@@ -76,6 +79,7 @@ XPK supports the following TPU types:
|
|
|
76
79
|
* v5e
|
|
77
80
|
* v5p
|
|
78
81
|
* Trillium (v6e)
|
|
82
|
+
* Ironwood (tpu7x)
|
|
79
83
|
|
|
80
84
|
and the following GPU types:
|
|
81
85
|
* A100
|
|
@@ -83,6 +87,7 @@ and the following GPU types:
|
|
|
83
87
|
* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
84
88
|
* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
85
89
|
* A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
90
|
+
* A4X (gb200)
|
|
86
91
|
|
|
87
92
|
and the following CPU types:
|
|
88
93
|
* n2-standard-32
|