xpk 0.13.0__tar.gz → 0.14.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.13.0/src/xpk.egg-info → xpk-0.14.0}/PKG-INFO +6 -1
- {xpk-0.13.0 → xpk-0.14.0}/README.md +2 -0
- {xpk-0.13.0 → xpk-0.14.0}/pyproject.toml +4 -1
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/batch.py +9 -2
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/cluster.py +128 -115
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/cluster_gcluster.py +77 -14
- xpk-0.14.0/src/xpk/commands/cluster_gcluster_test.py +177 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/common.py +10 -28
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/info.py +11 -9
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/inspector.py +21 -10
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/job.py +25 -9
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/kind.py +38 -40
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/kjob_common.py +4 -4
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/run.py +9 -2
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/shell.py +13 -10
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/storage.py +21 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/version.py +0 -4
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/workload.py +43 -22
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/blueprint/blueprint_generator.py +4 -40
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/blueprint/blueprint_test.py +0 -6
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/capacity.py +6 -5
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/cluster.py +91 -194
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/cluster_private.py +6 -11
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/commands.py +11 -18
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/config.py +1 -1
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/docker_image.py +3 -4
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/gcloud_context.py +26 -2
- xpk-0.14.0/src/xpk/core/gcloud_context_test.py +96 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/gcluster_manager.py +0 -3
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/jobset.py +4 -7
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/kjob.py +14 -27
- xpk-0.14.0/src/xpk/core/kueue_manager.py +383 -0
- xpk-0.14.0/src/xpk/core/kueue_manager_test.py +542 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/monitoring.py +1 -1
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/nap.py +10 -15
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/network.py +17 -18
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/nodepool.py +66 -77
- xpk-0.14.0/src/xpk/core/nodepool_test.py +279 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/pathways.py +5 -5
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/ray.py +10 -14
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/resources.py +6 -11
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/scheduling.py +19 -1
- xpk-0.14.0/src/xpk/core/scheduling_test.py +31 -0
- xpk-0.14.0/src/xpk/core/system_characteristics.py +733 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/vertex.py +1 -1
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload.py +7 -8
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/main.py +2 -4
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/cluster.py +7 -0
- xpk-0.14.0/src/xpk/parser/cluster_test.py +66 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/common.py +11 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/workload.py +62 -25
- xpk-0.14.0/src/xpk/parser/workload_test.py +82 -0
- xpk-0.14.0/src/xpk/utils/feature_flags.py +28 -0
- xpk-0.14.0/src/xpk/utils/kueue.py +20 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/templates.py +2 -0
- xpk-0.14.0/src/xpk/utils/topology.py +37 -0
- xpk-0.14.0/src/xpk/utils/topology_test.py +43 -0
- xpk-0.14.0/src/xpk/utils/validation.py +104 -0
- xpk-0.14.0/src/xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0 → xpk-0.14.0/src/xpk.egg-info}/PKG-INFO +6 -1
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk.egg-info/SOURCES.txt +12 -1
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk.egg-info/requires.txt +3 -0
- xpk-0.13.0/src/xpk/core/kueue.py +0 -561
- xpk-0.13.0/src/xpk/core/nodepool_test.py +0 -82
- xpk-0.13.0/src/xpk/core/system_characteristics.py +0 -627
- xpk-0.13.0/src/xpk/utils/validation.py +0 -80
- {xpk-0.13.0 → xpk-0.14.0}/LICENSE +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/setup.cfg +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/__init__.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/api/__init__.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/commands/config.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/__init__.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/config_test.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/docker_manager.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/docker_resources.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/filestore.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/gcsfuse.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/mtc.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/storage.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/core/workload_test.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/batch.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/config.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/core.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/info.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/job.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/kind.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/run.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/shell.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/storage.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/validators.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/parser/version.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/console.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/execution_context.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/file.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/kubectl.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/network.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/objects.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.13.0 → xpk-0.14.0}/src/xpk.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 0.13.0
|
|
3
|
+
Version: 0.14.0
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
5
|
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -22,15 +22,18 @@ Requires-Dist: google-api-core==2.24.1
|
|
|
22
22
|
Requires-Dist: packaging==24.2
|
|
23
23
|
Requires-Dist: google-cloud-filestore==1.12.0
|
|
24
24
|
Requires-Dist: google-cloud-storage
|
|
25
|
+
Requires-Dist: Jinja2==3.1.6
|
|
25
26
|
Provides-Extra: dev
|
|
26
27
|
Requires-Dist: pyink==24.3.0; extra == "dev"
|
|
27
28
|
Requires-Dist: pylint>=2.6.0; extra == "dev"
|
|
28
29
|
Requires-Dist: pre-commit; extra == "dev"
|
|
29
30
|
Requires-Dist: pytest; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-mock==3.15.1; extra == "dev"
|
|
30
32
|
Requires-Dist: docker==7.1.0; extra == "dev"
|
|
31
33
|
Requires-Dist: mypy~=1.17; extra == "dev"
|
|
32
34
|
Requires-Dist: types-PyYAML==6.0.2; extra == "dev"
|
|
33
35
|
Requires-Dist: types-docker~=7.1.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: pylint-per-file-ignores==1.4.0; extra == "dev"
|
|
34
37
|
Dynamic: license-file
|
|
35
38
|
|
|
36
39
|
<!--
|
|
@@ -76,6 +79,7 @@ XPK supports the following TPU types:
|
|
|
76
79
|
* v5e
|
|
77
80
|
* v5p
|
|
78
81
|
* Trillium (v6e)
|
|
82
|
+
* Ironwood (tpu7x)
|
|
79
83
|
|
|
80
84
|
and the following GPU types:
|
|
81
85
|
* A100
|
|
@@ -83,6 +87,7 @@ and the following GPU types:
|
|
|
83
87
|
* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
84
88
|
* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
85
89
|
* A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
90
|
+
* A4X (gb200)
|
|
86
91
|
|
|
87
92
|
and the following CPU types:
|
|
88
93
|
* n2-standard-32
|
|
@@ -41,6 +41,7 @@ XPK supports the following TPU types:
|
|
|
41
41
|
* v5e
|
|
42
42
|
* v5p
|
|
43
43
|
* Trillium (v6e)
|
|
44
|
+
* Ironwood (tpu7x)
|
|
44
45
|
|
|
45
46
|
and the following GPU types:
|
|
46
47
|
* A100
|
|
@@ -48,6 +49,7 @@ and the following GPU types:
|
|
|
48
49
|
* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
49
50
|
* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
50
51
|
* A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
|
|
52
|
+
* A4X (gb200)
|
|
51
53
|
|
|
52
54
|
and the following CPU types:
|
|
53
55
|
* n2-standard-32
|
|
@@ -40,7 +40,8 @@ dependencies = [
|
|
|
40
40
|
"google-api-core==2.24.1",
|
|
41
41
|
"packaging==24.2",
|
|
42
42
|
"google-cloud-filestore==1.12.0",
|
|
43
|
-
"google-cloud-storage"
|
|
43
|
+
"google-cloud-storage",
|
|
44
|
+
"Jinja2==3.1.6"
|
|
44
45
|
]
|
|
45
46
|
|
|
46
47
|
[project.urls]
|
|
@@ -62,10 +63,12 @@ dev = [
|
|
|
62
63
|
"pylint>=2.6.0",
|
|
63
64
|
"pre-commit",
|
|
64
65
|
"pytest",
|
|
66
|
+
"pytest-mock==3.15.1",
|
|
65
67
|
"docker==7.1.0",
|
|
66
68
|
"mypy ~= 1.17",
|
|
67
69
|
"types-PyYAML == 6.0.2",
|
|
68
70
|
"types-docker ~= 7.1.0.0",
|
|
71
|
+
"pylint-per-file-ignores == 1.4.0",
|
|
69
72
|
]
|
|
70
73
|
|
|
71
74
|
[tool.setuptools.dynamic]
|
|
@@ -29,9 +29,10 @@ from ..core.kjob import (
|
|
|
29
29
|
get_storage_annotations,
|
|
30
30
|
prepare_kjob,
|
|
31
31
|
)
|
|
32
|
-
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
32
|
+
from ..core.kueue_manager import LOCAL_QUEUE_NAME
|
|
33
33
|
from ..utils.console import xpk_exit, xpk_print
|
|
34
34
|
from ..utils.execution_context import is_dry_run
|
|
35
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
35
36
|
from .kind import set_local_cluster_command
|
|
36
37
|
from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
|
|
37
38
|
|
|
@@ -44,6 +45,12 @@ def batch(args: Namespace) -> None:
|
|
|
44
45
|
Returns:
|
|
45
46
|
None
|
|
46
47
|
"""
|
|
48
|
+
if should_validate_dependencies(args):
|
|
49
|
+
validate_dependencies_list([
|
|
50
|
+
SystemDependency.KUBECTL,
|
|
51
|
+
SystemDependency.KJOB,
|
|
52
|
+
SystemDependency.GCLOUD,
|
|
53
|
+
])
|
|
47
54
|
if not args.kind_cluster:
|
|
48
55
|
add_zone_and_project(args)
|
|
49
56
|
get_cluster_credentials(args)
|
|
@@ -126,7 +133,7 @@ def submit_job(args: Namespace) -> None:
|
|
|
126
133
|
if args.time is not None:
|
|
127
134
|
cmd += f' --time {args.time}'
|
|
128
135
|
|
|
129
|
-
return_code, return_value = run_command_for_value(cmd, 'submit job', args)
|
|
136
|
+
return_code, return_value = run_command_for_value(cmd, 'submit job')
|
|
130
137
|
|
|
131
138
|
if return_code != 0:
|
|
132
139
|
xpk_print(f'Running batch job returned ERROR {return_code}')
|