xpk 0.13.0__tar.gz → 0.14.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. {xpk-0.13.0/src/xpk.egg-info → xpk-0.14.1}/PKG-INFO +6 -1
  2. {xpk-0.13.0 → xpk-0.14.1}/README.md +2 -0
  3. {xpk-0.13.0 → xpk-0.14.1}/pyproject.toml +9 -3
  4. xpk-0.14.1/src/integration/docker_manager_test.py +102 -0
  5. xpk-0.14.1/src/integration/gcluster_a3mega_test.py +204 -0
  6. xpk-0.14.1/src/integration/gcluster_a3ultra_test.py +176 -0
  7. xpk-0.14.1/src/integration/gcluster_a4_test.py +176 -0
  8. xpk-0.14.1/src/integration/gcluster_test.py +107 -0
  9. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/batch.py +9 -2
  10. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/cluster.py +143 -117
  11. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/cluster_gcluster.py +81 -14
  12. xpk-0.14.1/src/xpk/commands/cluster_gcluster_test.py +177 -0
  13. xpk-0.14.1/src/xpk/commands/cluster_test.py +92 -0
  14. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/common.py +14 -26
  15. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/info.py +11 -9
  16. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/inspector.py +21 -10
  17. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/job.py +25 -9
  18. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/kind.py +39 -40
  19. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/kjob_common.py +4 -4
  20. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/run.py +9 -2
  21. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/shell.py +13 -10
  22. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/storage.py +21 -0
  23. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/version.py +0 -4
  24. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/workload.py +84 -29
  25. xpk-0.14.1/src/xpk/commands/workload_test.py +81 -0
  26. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/blueprint/blueprint_generator.py +4 -40
  27. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/blueprint/blueprint_test.py +0 -6
  28. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/capacity.py +6 -5
  29. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/cluster.py +91 -194
  30. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/cluster_private.py +6 -11
  31. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/commands.py +11 -18
  32. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/config.py +1 -1
  33. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/docker_image.py +3 -4
  34. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/gcloud_context.py +26 -2
  35. xpk-0.14.1/src/xpk/core/gcloud_context_test.py +96 -0
  36. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/gcluster_manager.py +0 -3
  37. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/jobset.py +4 -7
  38. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/kjob.py +14 -27
  39. xpk-0.14.1/src/xpk/core/kueue_manager.py +423 -0
  40. xpk-0.14.1/src/xpk/core/kueue_manager_test.py +574 -0
  41. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/monitoring.py +1 -1
  42. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/nap.py +10 -15
  43. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/network.py +17 -18
  44. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/nodepool.py +66 -77
  45. xpk-0.14.1/src/xpk/core/nodepool_test.py +279 -0
  46. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/pathways.py +5 -5
  47. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/ray.py +10 -14
  48. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/resources.py +6 -11
  49. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/scheduling.py +19 -1
  50. xpk-0.14.1/src/xpk/core/scheduling_test.py +31 -0
  51. xpk-0.14.1/src/xpk/core/system_characteristics.py +745 -0
  52. xpk-0.14.1/src/xpk/core/system_characteristics_test.py +73 -0
  53. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/vertex.py +1 -1
  54. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/workload.py +7 -8
  55. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/main.py +2 -4
  56. xpk-0.14.1/src/xpk/parser/__init__.py +15 -0
  57. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/cluster.py +7 -0
  58. xpk-0.14.1/src/xpk/parser/cluster_test.py +66 -0
  59. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/common.py +11 -0
  60. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/workload.py +62 -25
  61. xpk-0.14.1/src/xpk/parser/workload_test.py +82 -0
  62. xpk-0.14.1/src/xpk/templates/cluster_preheat.yaml.j2 +31 -0
  63. xpk-0.14.1/src/xpk/templates/filestore-pv.yaml +17 -0
  64. xpk-0.14.1/src/xpk/templates/filestore-pvc.yaml +11 -0
  65. xpk-0.14.1/src/xpk/templates/filestore-sc.yaml +10 -0
  66. xpk-0.14.1/src/xpk/templates/fuse-pv.yaml +17 -0
  67. xpk-0.14.1/src/xpk/templates/fuse-pvc.yaml +13 -0
  68. xpk-0.14.1/src/xpk/templates/kueue_config.yaml.j2 +95 -0
  69. xpk-0.14.1/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
  70. xpk-0.14.1/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
  71. xpk-0.14.1/src/xpk/templates/mtc-cpc.yaml +15 -0
  72. xpk-0.14.1/src/xpk/templates/volume_bundle.yaml +7 -0
  73. xpk-0.14.1/src/xpk/utils/__init__.py +15 -0
  74. xpk-0.13.0/src/xpk/utils/templates.py → xpk-0.14.1/src/xpk/utils/feature_flags.py +7 -7
  75. xpk-0.14.1/src/xpk/utils/kueue.py +20 -0
  76. xpk-0.14.1/src/xpk/utils/templates.py +43 -0
  77. xpk-0.14.1/src/xpk/utils/topology.py +46 -0
  78. xpk-0.14.1/src/xpk/utils/topology_test.py +63 -0
  79. xpk-0.14.1/src/xpk/utils/validation.py +104 -0
  80. xpk-0.14.1/src/xpk/utils/validation_test.py +37 -0
  81. {xpk-0.13.0 → xpk-0.14.1/src/xpk.egg-info}/PKG-INFO +6 -1
  82. {xpk-0.13.0 → xpk-0.14.1}/src/xpk.egg-info/SOURCES.txt +33 -1
  83. {xpk-0.13.0 → xpk-0.14.1}/src/xpk.egg-info/requires.txt +3 -0
  84. xpk-0.14.1/src/xpk.egg-info/top_level.txt +2 -0
  85. xpk-0.13.0/src/xpk/core/kueue.py +0 -561
  86. xpk-0.13.0/src/xpk/core/nodepool_test.py +0 -82
  87. xpk-0.13.0/src/xpk/core/system_characteristics.py +0 -627
  88. xpk-0.13.0/src/xpk/utils/validation.py +0 -80
  89. xpk-0.13.0/src/xpk.egg-info/top_level.txt +0 -1
  90. {xpk-0.13.0 → xpk-0.14.1}/LICENSE +0 -0
  91. {xpk-0.13.0 → xpk-0.14.1}/setup.cfg +0 -0
  92. {xpk-0.13.0/src/xpk/api → xpk-0.14.1/src/integration}/__init__.py +0 -0
  93. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/__init__.py +0 -0
  94. {xpk-0.13.0/src/xpk/commands → xpk-0.14.1/src/xpk/api}/__init__.py +0 -0
  95. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/api/storage_crd.yaml +0 -0
  96. {xpk-0.13.0/src/xpk/core → xpk-0.14.1/src/xpk/commands}/__init__.py +0 -0
  97. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/commands/config.py +0 -0
  98. {xpk-0.13.0/src/xpk/core/blueprint → xpk-0.14.1/src/xpk/core}/__init__.py +0 -0
  99. {xpk-0.13.0/src/xpk/core/workload_decorators → xpk-0.14.1/src/xpk/core/blueprint}/__init__.py +0 -0
  100. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  101. {xpk-0.13.0/src/xpk/parser → xpk-0.14.1/src/xpk/core/blueprint/testing}/__init__.py +0 -0
  102. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/config_test.py +0 -0
  103. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/docker_container.py +0 -0
  104. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/docker_manager.py +0 -0
  105. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/docker_resources.py +0 -0
  106. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/filestore.py +0 -0
  107. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/gcsfuse.py +0 -0
  108. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/mtc.py +0 -0
  109. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/remote_state/__init__.py +0 -0
  110. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  111. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  112. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/storage.py +0 -0
  113. {xpk-0.13.0/src/xpk/utils → xpk-0.14.1/src/xpk/core/workload_decorators}/__init__.py +0 -0
  114. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
  115. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  116. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
  117. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
  118. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
  119. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/core/workload_test.py +0 -0
  120. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/batch.py +0 -0
  121. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/config.py +0 -0
  122. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/core.py +0 -0
  123. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/info.py +0 -0
  124. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/inspector.py +0 -0
  125. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/job.py +0 -0
  126. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/kind.py +0 -0
  127. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/run.py +0 -0
  128. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/shell.py +0 -0
  129. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/storage.py +0 -0
  130. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/validators.py +0 -0
  131. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/parser/version.py +0 -0
  132. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/templates/__init__.py +0 -0
  133. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/templates/storage.yaml +0 -0
  134. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/utils/console.py +0 -0
  135. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/utils/execution_context.py +0 -0
  136. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/utils/file.py +0 -0
  137. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/utils/gcs_utils.py +0 -0
  138. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/utils/kubectl.py +0 -0
  139. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/utils/network.py +0 -0
  140. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/utils/objects.py +0 -0
  141. {xpk-0.13.0 → xpk-0.14.1}/src/xpk/utils/yaml.py +0 -0
  142. {xpk-0.13.0 → xpk-0.14.1}/src/xpk.egg-info/dependency_links.txt +0 -0
  143. {xpk-0.13.0 → xpk-0.14.1}/src/xpk.egg-info/entry_points.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 0.13.0
3
+ Version: 0.14.1
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -22,15 +22,18 @@ Requires-Dist: google-api-core==2.24.1
22
22
  Requires-Dist: packaging==24.2
23
23
  Requires-Dist: google-cloud-filestore==1.12.0
24
24
  Requires-Dist: google-cloud-storage
25
+ Requires-Dist: Jinja2==3.1.6
25
26
  Provides-Extra: dev
26
27
  Requires-Dist: pyink==24.3.0; extra == "dev"
27
28
  Requires-Dist: pylint>=2.6.0; extra == "dev"
28
29
  Requires-Dist: pre-commit; extra == "dev"
29
30
  Requires-Dist: pytest; extra == "dev"
31
+ Requires-Dist: pytest-mock==3.15.1; extra == "dev"
30
32
  Requires-Dist: docker==7.1.0; extra == "dev"
31
33
  Requires-Dist: mypy~=1.17; extra == "dev"
32
34
  Requires-Dist: types-PyYAML==6.0.2; extra == "dev"
33
35
  Requires-Dist: types-docker~=7.1.0.0; extra == "dev"
36
+ Requires-Dist: pylint-per-file-ignores==1.4.0; extra == "dev"
34
37
  Dynamic: license-file
35
38
 
36
39
  <!--
@@ -76,6 +79,7 @@ XPK supports the following TPU types:
76
79
  * v5e
77
80
  * v5p
78
81
  * Trillium (v6e)
82
+ * Ironwood (tpu7x)
79
83
 
80
84
  and the following GPU types:
81
85
  * A100
@@ -83,6 +87,7 @@ and the following GPU types:
83
87
  * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
84
88
  * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
85
89
  * A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
90
+ * A4X (gb200)
86
91
 
87
92
  and the following CPU types:
88
93
  * n2-standard-32
@@ -41,6 +41,7 @@ XPK supports the following TPU types:
41
41
  * v5e
42
42
  * v5p
43
43
  * Trillium (v6e)
44
+ * Ironwood (tpu7x)
44
45
 
45
46
  and the following GPU types:
46
47
  * A100
@@ -48,6 +49,7 @@ and the following GPU types:
48
49
  * A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
49
50
  * A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
50
51
  * A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
52
+ * A4X (gb200)
51
53
 
52
54
  and the following CPU types:
53
55
  * n2-standard-32
@@ -40,7 +40,8 @@ dependencies = [
40
40
  "google-api-core==2.24.1",
41
41
  "packaging==24.2",
42
42
  "google-cloud-filestore==1.12.0",
43
- "google-cloud-storage"
43
+ "google-cloud-storage",
44
+ "Jinja2==3.1.6"
44
45
  ]
45
46
 
46
47
  [project.urls]
@@ -62,19 +63,24 @@ dev = [
62
63
  "pylint>=2.6.0",
63
64
  "pre-commit",
64
65
  "pytest",
66
+ "pytest-mock==3.15.1",
65
67
  "docker==7.1.0",
66
68
  "mypy ~= 1.17",
67
69
  "types-PyYAML == 6.0.2",
68
70
  "types-docker ~= 7.1.0.0",
71
+ "pylint-per-file-ignores == 1.4.0",
69
72
  ]
70
73
 
71
74
  [tool.setuptools.dynamic]
72
75
  version = {attr = "xpk.core.config.__version__"}
73
76
 
74
77
  [tool.setuptools]
75
- packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.api", "xpk.templates", "xpk.utils", "xpk.core.blueprint", "xpk.core.remote_state", "xpk.core.workload_decorators"]
76
78
  package-dir = {"" = "src"}
77
- package-data = {"xpk.api" = ["storage_crd.yaml"], "xpk.templates" = ["storage.yaml"]}
79
+ packages = { find = { where = ["src"] } }
80
+
81
+ [tool.setuptools.package-data]
82
+ "xpk" = ["templates/*"]
83
+ "xpk.api" = ["*.yaml"]
78
84
 
79
85
  [tool.pyink]
80
86
  # Formatting configuration to follow Google style-guide.
@@ -0,0 +1,102 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import docker
18
+ from docker.errors import APIError
19
+ from xpk.core.docker_manager import DockerManager, ctk_build_ref
20
+ import pytest
21
+ import os
22
+ import time
23
+
24
+ test_cfg_path = '/tmp/xpk_gcloud_cfg'
25
+ test_deployment_dir = '/tmp/xpk_deployment'
26
+ test_gcluster_cmd = 'gcluster --version'
27
+ test_ctk_xpk_img = 'gcluster-xpk'
28
+ test_ctk_xpk_container = 'xpk-test-container'
29
+
30
+
31
def remove_img():
  """Force-remove the test image, ignoring any Docker API error."""
  client = docker.from_env()
  try:
    client.images.remove(test_ctk_xpk_img, force=True)
  except APIError:
    # Best-effort cleanup: a failed removal must not fail the test run.
    return
37
+
38
+
39
def remove_container():
  """Force-remove the test container, ignoring any Docker API error."""
  client = docker.from_env()
  try:
    client.containers.get(test_ctk_xpk_container).remove(force=True)
  except APIError:
    # Best-effort cleanup: a failed lookup/removal must not fail the test run.
    return
46
+
47
+
48
def create_tmp_dirs():
  """Create the scratch directories used by the Docker manager tests.

  The directories are created with ``exist_ok=True`` so that leftovers from an
  earlier, interrupted run do not abort fixture setup with FileExistsError.
  """
  for path in (test_cfg_path, test_deployment_dir):
    os.makedirs(path, exist_ok=True)
51
+
52
+
53
def remove_tmp_dirs():
  """Delete the scratch directories used by the Docker manager tests.

  Each directory is removed recursively, so files written into it during a
  test do not make teardown fail, and a directory that is already gone is
  ignored. Parent directories are never touched.
  """
  # Imported locally so this helper does not depend on a module-level import.
  import shutil

  for path in (test_cfg_path, test_deployment_dir):
    shutil.rmtree(path, ignore_errors=True)
56
+
57
+
58
@pytest.fixture(name='setup_img_name')
def remove_test_ctk_img():
  """Yield the test image name with a clean Docker state around the test.

  Before the test: create the scratch directories, then remove the test
  container and image. After the test: remove the container and image again
  and delete the scratch directories.
  """
  create_tmp_dirs()
  for cleanup in (remove_container, remove_img):
    cleanup()
  yield test_ctk_xpk_img
  for cleanup in (remove_container, remove_img, remove_tmp_dirs):
    cleanup()
67
+
68
+
69
def test_docker_build_image(setup_img_name):
  """Check that initializing DockerManager makes the tagged image available.

  Also asserts that the number of containers (running or not) is the same
  before and after the image lookup.
  """
  manager = DockerManager(
      gcloud_cfg_path=test_cfg_path,
      working_dir=test_deployment_dir,
      img_name=setup_img_name,
  )
  manager.initialize()

  client = docker.from_env()
  count_before = len(client.containers.list(all=True))
  # The lookup itself is the check: it is expected to raise if the image
  # tagged with the build ref is missing.
  client.images.get(f'{setup_img_name}:{ctk_build_ref}')
  count_after = len(client.containers.list(all=True))
  assert count_before == count_after
82
+
83
+
84
def test_run_command(setup_img_name):
  """Check that running a command with remove_container=True leaves no container.

  The container count is sampled before initialization and again two seconds
  after the command has run; both counts must match.
  """
  manager = DockerManager(
      gcloud_cfg_path=test_cfg_path,
      working_dir=test_deployment_dir,
      img_name=setup_img_name,
      remove_container=True,
  )
  client = docker.from_env()

  count_before = len(client.containers.list(all=True))
  manager.initialize()
  manager.run_command(test_gcluster_cmd)

  # Short pause before counting containers again.
  time.sleep(2)

  count_after = len(client.containers.list(all=True))
  assert count_after - count_before == 0
@@ -0,0 +1,204 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from xpk.commands.cluster_gcluster import get_unique_name
18
+ from xpk.core.docker_manager import DockerManager
19
+ from xpk.core.gcluster_manager import GclusterManager
20
+ from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
21
+ import pytest
22
+ import os
23
+ import shutil
24
+
25
+ ctk_gcloud_cfg = os.getenv("GCLOUD_CFG_PATH")
26
+ project_id = os.getenv("PROJECT_ID")
27
+ region = os.getenv("REGION")
28
+ zone = os.getenv("ZONE")
29
+ auth_cidr = os.getenv("AUTH_CIDR")
30
+ cluster_name = os.getenv("A3_MEGA_TEST_CLUSTER_NAME")
31
+
32
+ uploads_dir = "uploads"
33
+
34
+
35
@pytest.fixture(name="setup_tests")
def prepare_test():
  """Provide working directories for the A3 Mega tests and clean them up.

  Yields:
    A ``(docker_path, bp_path)`` tuple of directories located under the
    current working directory.

  Setup tolerates directories that already exist, and teardown tolerates
  directories that a test has already removed, so neither step raises on
  leftover or missing state.
  """
  pwd = os.getcwd()
  docker_path = os.path.join(pwd, "xpk_test_docker_dir")
  bp_path = os.path.join(pwd, "xpk_bp_path")
  for path in (docker_path, bp_path):
    os.makedirs(path, exist_ok=True)
  yield (docker_path, bp_path)
  for path in (docker_path, bp_path):
    # ignore_errors: a test may already have deleted its own directories.
    shutil.rmtree(path, ignore_errors=True)
47
+
48
+
49
@pytest.mark.skip(
    reason=(
        "This test requires A3 capacity, therefore it should not be run on each"
        " build. Please invoke it manually if needed. "
    )
)
def test_deploy_a3_mega_deployment(setup_tests):
  """Deploy a real A3 Mega blueprint and always tear the deployment down.

  The working directories come from the ``setup_tests`` fixture, which also
  removes them after the test, so no directory cleanup is done here.
  """
  docker_path, bp_path = setup_tests
  (
      blueprint_name,
      prefix,
      gcluster_manager,
      staged_bp_path,
  ) = create_test_a3_mega_deployment(docker_path, bp_path)
  try:
    gcluster_manager.deploy(
        blueprint_path=staged_bp_path,
        deployment_name=blueprint_name,
        prefix=prefix,
    )
  finally:
    # Destroy even when deploy fails part-way, so no cloud resources are
    # left behind by a failed run.
    gcluster_manager.destroy_deployment(
        deployment_name=blueprint_name, prefix=prefix
    )
75
+
76
+
77
@pytest.mark.skip(
    reason=(
        "This test requires A3 capacity, therefore it should not be run on each"
        " build. Please invoke it manually if needed. "
    )
)
def test_create_a3_mega_deployment_files(setup_tests):
  """Generate and stage an A3 Mega blueprint, then run a dry-run deploy.

  Verifies the generated blueprint file and dependency directory, the staged
  path returned by the manager, and the staged copies on the local side.
  """
  # One assert per setting so a missing environment variable fails on its
  # own line.
  assert project_id is not None
  assert region is not None
  assert zone is not None
  assert auth_cidr is not None
  assert ctk_gcloud_cfg is not None
  assert cluster_name is not None
  docker_path, bp_path = setup_tests

  prefix = "prefix"
  blueprint_name = f"{cluster_name}-a3-mega-xpk"
  blueprint_filename = f"{blueprint_name}.yaml"

  runner = DockerManager(
      gcloud_cfg_path=ctk_gcloud_cfg, working_dir=docker_path
  )
  runner.initialize()

  generator = BlueprintGenerator(storage_path=bp_path)
  blueprint = generator.generate_a3_mega_blueprint(
      cluster_name=cluster_name,
      blueprint_name=blueprint_name,
      prefix=prefix,
      region=region,
      project_id=project_id,
      auth_cidr=auth_cidr,
      zone=zone,
      system_node_pool_min_node_count=3,
  )
  expected_bp_file = os.path.join(bp_path, prefix, blueprint_filename)
  expected_deps_dir = os.path.join(bp_path, prefix, blueprint_name)

  # Generated artifacts: reported paths and what is actually on disk.
  assert blueprint.blueprint_file == expected_bp_file
  assert blueprint.blueprint_dependencies == expected_deps_dir

  assert os.path.isfile(expected_bp_file)
  assert os.path.isdir(expected_deps_dir)
  assert os.path.isfile(
      os.path.join(expected_deps_dir, "config-map.yaml.tftpl")
  )

  manager = GclusterManager(
      gcluster_command_runner=runner, remote_state_client=None
  )

  staged_bp_path = manager.stage_files(
      blueprint_file=blueprint.blueprint_file,
      blueprint_dependencies=blueprint.blueprint_dependencies,
      prefix=prefix,
  )

  # The returned path is rooted at "/out"; the local copies live under the
  # Docker working directory.
  assert staged_bp_path == os.path.join(
      "/out", uploads_dir, prefix, blueprint_filename
  )

  local_upload_root = os.path.join(docker_path, uploads_dir, prefix)
  staged_bp_local = os.path.join(local_upload_root, blueprint_filename)
  staged_deps_local = os.path.join(local_upload_root, blueprint_name)

  assert os.path.isfile(staged_bp_local)
  assert os.path.isdir(staged_deps_local)
  for template in (
      "config-map.yaml.tftpl",
      "kueue-xpk-configuration.yaml.tftpl",
  ):
    assert os.path.isfile(os.path.join(staged_deps_local, template))

  deployment_name = get_unique_name(project_id, region, zone)
  manager.deploy(
      blueprint_path=staged_bp_path,
      deployment_name=deployment_name,
      dry_run=True,
  )
158
+
159
+
160
def create_test_a3_mega_deployment(docker_path: str, bp_path: str):
  """Build and stage an A3 Mega blueprint for a deployment test.

  Args:
    docker_path: working directory handed to the DockerManager.
    bp_path: storage directory handed to the BlueprintGenerator.

  Returns:
    A ``(blueprint_name, prefix, gcluster_manager, staged_bp_path)`` tuple.
  """
  # One assert per setting so a missing environment variable fails on its
  # own line.
  assert project_id is not None
  assert region is not None
  assert zone is not None
  assert auth_cidr is not None
  assert ctk_gcloud_cfg is not None
  assert cluster_name is not None

  prefix = "prefix"
  blueprint_name = f"{cluster_name}-a3-mega-xpk"

  runner = DockerManager(
      gcloud_cfg_path=ctk_gcloud_cfg, working_dir=docker_path
  )
  runner.initialize()

  generator = BlueprintGenerator(storage_path=bp_path)
  blueprint = generator.generate_a3_mega_blueprint(
      cluster_name=cluster_name,
      blueprint_name=blueprint_name,
      prefix=prefix,
      region=region,
      project_id=project_id,
      auth_cidr=auth_cidr,
      zone=zone,
      system_node_pool_min_node_count=3,
  )

  manager = GclusterManager(
      gcluster_command_runner=runner,
      remote_state_client=None,
  )

  staged = manager.stage_files(
      blueprint_file=blueprint.blueprint_file,
      blueprint_dependencies=blueprint.blueprint_dependencies,
      prefix=prefix,
  )

  return (blueprint_name, prefix, manager, staged)
@@ -0,0 +1,176 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import os
18
+ import shutil
19
+
20
+ import pytest
21
+
22
+ from xpk.commands.cluster_gcluster import get_unique_name
23
+ from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
24
+ from xpk.core.capacity import CapacityType
25
+ from xpk.core.docker_manager import DockerManager
26
+ from xpk.core.gcluster_manager import GclusterManager
27
+
28
+ ctk_gcloud_cfg = os.getenv("GCLOUD_CFG_PATH")
29
+ project_id = os.getenv("PROJECT_ID")
30
+ region = os.getenv("REGION")
31
+ zone = os.getenv("ZONE")
32
+ auth_cidr = os.getenv("AUTH_CIDR")
33
+ cluster_name = os.getenv("A3_ULTRA_TEST_CLUSTER_NAME")
34
+
35
+
36
@pytest.fixture(name="setup_tests")
def prepare_test():
  """Provide working directories for the A3 Ultra tests and remove them after.

  Yields:
    A ``(docker_path, bp_path)`` tuple of directories located under the
    current working directory.
  """
  cwd = os.getcwd()
  docker_path = os.path.join(cwd, "xpk_test_docker_dir")
  bp_path = os.path.join(cwd, "xpk_test_bp_dir")
  for directory in (docker_path, bp_path):
    if not os.path.exists(directory):
      os.makedirs(directory)
  yield (docker_path, bp_path)
  for directory in (docker_path, bp_path):
    shutil.rmtree(directory)
48
+
49
+
50
@pytest.mark.skip(
    reason=(
        "This test requires A3 capacity, therefore it should not be run on each"
        " build. Please invoke it manually if needed. "
    )
)
def test_create_a3_ultra_deployment_files(setup_tests):
  """Generate and stage an A3 Ultra blueprint, then run a dry-run deploy.

  Verifies the generated blueprint file, its dependency directory and the two
  manifests inside it, and the staged blueprint path.
  """
  # One assert per setting so a missing environment variable fails on its
  # own line.
  assert project_id is not None
  assert region is not None
  assert zone is not None
  assert auth_cidr is not None
  assert ctk_gcloud_cfg is not None
  assert cluster_name is not None
  docker_path, bp_path = setup_tests
  blueprint_name = f"{cluster_name}-a3-ultra-xpk"
  blueprint_filename = f"{blueprint_name}.yaml"
  prefix = f"{project_id}-{region}".lower()

  runner = DockerManager(
      gcloud_cfg_path=ctk_gcloud_cfg, working_dir=docker_path
  )
  runner.initialize()

  generator = BlueprintGenerator(storage_path=bp_path)
  blueprint = generator.generate_a3_ultra_blueprint(
      cluster_name=cluster_name,
      blueprint_name=blueprint_name,
      region=region,
      project_id=project_id,
      auth_cidr=auth_cidr,
      zone=zone,
      reservation="foo",
      num_nodes=1,
      system_node_pool_machine_type="e2-standard-16",
      prefix=prefix,
  )
  expected_bp_file = os.path.join(bp_path, prefix, blueprint_filename)
  # NOTE(review): the dependency directory is expected without the prefix,
  # while the blueprint file path includes it; confirm against the generator.
  expected_deps_dir = os.path.join(bp_path, blueprint_name)
  assert blueprint.blueprint_file == expected_bp_file
  assert blueprint.blueprint_dependencies == expected_deps_dir

  assert os.path.isfile(expected_bp_file)
  assert os.path.isdir(expected_deps_dir)
  for manifest in ("mlgru-disable.yaml", "nccl-installer.yaml"):
    assert os.path.isfile(os.path.join(expected_deps_dir, manifest))

  manager = GclusterManager(
      gcluster_command_runner=runner, remote_state_client=None
  )

  staged_bp_path = manager.stage_files(
      blueprint_file=blueprint.blueprint_file,
      blueprint_dependencies=blueprint.blueprint_dependencies,
      prefix=prefix,
  )
  assert staged_bp_path == os.path.join(
      "/out/uploads", prefix, blueprint_filename
  )
  deployment_name = get_unique_name(project_id, region, zone)
  manager.deploy(
      blueprint_path=staged_bp_path,
      deployment_name=deployment_name,
      dry_run=True,
  )
113
+
114
+
115
@pytest.mark.skip(
    reason=(
        "This test requires A3 capacity, therefore it should not be run on each"
        " build. Please invoke it manually if needed. "
    )
)
def test_create_a3_ultra_deployment(setup_tests):
  """Deploy a real A3 Ultra blueprint and always tear the deployment down.

  Generates the blueprint with spot capacity, checks the generated files,
  stages them, deploys, and destroys the deployment even if deploy fails.
  """
  # One assert per setting so a missing environment variable fails on its
  # own line.
  assert project_id is not None
  assert region is not None
  assert zone is not None
  assert auth_cidr is not None
  assert ctk_gcloud_cfg is not None
  assert cluster_name is not None
  docker_path, bp_path = setup_tests
  blueprint_name = f"{cluster_name}-a3-ultra-xpk"

  docker_manager = DockerManager(
      gcloud_cfg_path=ctk_gcloud_cfg, working_dir=docker_path
  )
  docker_manager.initialize()

  bpm = BlueprintGenerator(storage_path=bp_path)
  a3_ultra_blueprint = bpm.generate_a3_ultra_blueprint(
      cluster_name=cluster_name,
      blueprint_name=blueprint_name,
      region=region,
      project_id=project_id,
      auth_cidr=auth_cidr,
      zone=zone,
      capacity_type=CapacityType.SPOT,
      num_nodes=1,
      system_node_pool_machine_type="e2-standard-16",
  )
  blueprint_test_path = os.path.join(bp_path, f"{blueprint_name}.yaml")
  blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)

  assert a3_ultra_blueprint.blueprint_file == blueprint_test_path
  assert a3_ultra_blueprint.blueprint_dependencies == blueprint_deps_test_path

  assert os.path.isfile(blueprint_test_path)
  assert os.path.isdir(blueprint_deps_test_path)
  for manifest in ("mlgru-disable.yaml", "nccl-installer.yaml"):
    assert os.path.isfile(os.path.join(blueprint_deps_test_path, manifest))

  gcluster_manager = GclusterManager(
      gcluster_command_runner=docker_manager, remote_state_client=None
  )

  staged_bp_path = gcluster_manager.stage_files(
      blueprint_file=a3_ultra_blueprint.blueprint_file,
      blueprint_dependencies=a3_ultra_blueprint.blueprint_dependencies,
  )

  try:
    gcluster_manager.deploy(
        blueprint_path=staged_bp_path, deployment_name=blueprint_name
    )
  finally:
    # Destroy even when deploy fails part-way, so no cloud resources are
    # left behind by a failed run.
    gcluster_manager.destroy_deployment(deployment_name=blueprint_name)