xpk 0.15.0__tar.gz → 0.16.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. xpk-0.16.1/.dockerignore +142 -0
  2. xpk-0.16.1/.github/CODEOWNERS +1 -0
  3. xpk-0.16.1/.github/PULL_REQUEST_TEMPLATE.md +18 -0
  4. xpk-0.16.1/.github/actions/install-kjob/action.yml +35 -0
  5. xpk-0.16.1/.github/actions/install-kueue/action.yml +32 -0
  6. xpk-0.16.1/.github/actions/setup-test-env/action.yml +53 -0
  7. xpk-0.16.1/.github/release.yaml +25 -0
  8. xpk-0.16.1/.github/workflows/README.md +30 -0
  9. xpk-0.16.1/.github/workflows/build_tests.yaml +71 -0
  10. xpk-0.16.1/.github/workflows/build_wheels.yaml +71 -0
  11. xpk-0.16.1/.github/workflows/cleanup.yaml +45 -0
  12. xpk-0.16.1/.github/workflows/gemini-dispatch.yml +204 -0
  13. xpk-0.16.1/.github/workflows/gemini-invoke.yml +252 -0
  14. xpk-0.16.1/.github/workflows/gemini-review.yml +278 -0
  15. xpk-0.16.1/.github/workflows/gemini-scheduled-triage.yml +317 -0
  16. xpk-0.16.1/.github/workflows/gemini-triage.yml +204 -0
  17. xpk-0.16.1/.github/workflows/integration_basic_cluster_create.yaml +219 -0
  18. xpk-0.16.1/.github/workflows/integration_legacy_tests.yaml +67 -0
  19. xpk-0.16.1/.github/workflows/integration_pathways_cluster_create.yaml +63 -0
  20. xpk-0.16.1/.github/workflows/integration_ray_cluster_create.yaml +51 -0
  21. xpk-0.16.1/.github/workflows/integration_storage_tests.yaml +160 -0
  22. xpk-0.16.1/.github/workflows/label-validation.yaml +54 -0
  23. xpk-0.16.1/.github/workflows/nightly_tests.yaml +53 -0
  24. xpk-0.16.1/.github/workflows/reusable_build_kjob.yaml +23 -0
  25. xpk-0.16.1/.github/workflows/reusable_build_scripts.yaml +35 -0
  26. xpk-0.16.1/.github/workflows/reusable_build_wheel.yaml +48 -0
  27. xpk-0.16.1/.github/workflows/reusable_goldens.yaml +45 -0
  28. xpk-0.16.1/.github/workflows/reusable_integration_tests.yaml +62 -0
  29. xpk-0.16.1/.github/workflows/reusable_lint_and_format.yml +52 -0
  30. xpk-0.16.1/.github/workflows/reusable_storage_create.yaml +161 -0
  31. xpk-0.16.1/.github/workflows/reusable_storage_delete.yaml +73 -0
  32. xpk-0.16.1/.github/workflows/reusable_unit_tests.yaml +42 -0
  33. xpk-0.16.1/.github/workflows/stale.yaml +35 -0
  34. xpk-0.16.1/.gitignore +149 -0
  35. xpk-0.16.1/.pre-commit-config.yaml +13 -0
  36. xpk-0.16.1/Makefile +112 -0
  37. xpk-0.16.1/PKG-INFO +127 -0
  38. xpk-0.16.1/README.md +87 -0
  39. xpk-0.16.1/backoff_retry.sh +177 -0
  40. xpk-0.16.1/data/Dockerfile +71 -0
  41. xpk-0.16.1/docs/code-of-conduct.md +109 -0
  42. xpk-0.16.1/docs/contributing.md +87 -0
  43. xpk-0.16.1/docs/installation.md +176 -0
  44. xpk-0.16.1/docs/local_testing.md +61 -0
  45. xpk-0.16.1/docs/permissions.md +27 -0
  46. xpk-0.16.1/docs/testing.md +101 -0
  47. xpk-0.16.1/docs/troubleshooting.md +164 -0
  48. xpk-0.16.1/docs/usage/advanced.md +36 -0
  49. xpk-0.16.1/docs/usage/autoprovisioning.md +189 -0
  50. xpk-0.16.1/docs/usage/clusters.md +329 -0
  51. xpk-0.16.1/docs/usage/cpu.md +46 -0
  52. xpk-0.16.1/docs/usage/docker.md +72 -0
  53. xpk-0.16.1/docs/usage/gpu.md +120 -0
  54. xpk-0.16.1/docs/usage/inspector.md +58 -0
  55. xpk-0.16.1/docs/usage/job.md +41 -0
  56. xpk-0.16.1/docs/usage/run.md +44 -0
  57. xpk-0.16.1/docs/usage/storage.md +202 -0
  58. xpk-0.16.1/docs/usage/tpu7x/clusters.md +329 -0
  59. xpk-0.16.1/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +356 -0
  60. xpk-0.16.1/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +560 -0
  61. xpk-0.16.1/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +334 -0
  62. xpk-0.16.1/docs/usage/tpu7x/workloads.md +269 -0
  63. xpk-0.16.1/docs/usage/workloads.md +269 -0
  64. xpk-0.16.1/examples/batch.md +24 -0
  65. xpk-0.16.1/examples/fake_training.py +74 -0
  66. xpk-0.16.1/examples/job.sh +12 -0
  67. xpk-0.16.1/examples/llama-3.1-finetuning/check_cuda.sh +9 -0
  68. xpk-0.16.1/examples/llama-3.1-finetuning/requirements.txt +9 -0
  69. xpk-0.16.1/examples/llama-3.1-finetuning/train.py +100 -0
  70. xpk-0.16.1/examples/llama-3.1-finetuning/train.slurm +14 -0
  71. xpk-0.16.1/examples/llama-3.1-finetuning/training_data.jsonl +11 -0
  72. xpk-0.16.1/examples/nccl/nccl-a3mega.sh +78 -0
  73. xpk-0.16.1/examples/nccl/nccl-a3ultra.sh +84 -0
  74. xpk-0.16.1/examples/nccl/nccl.md +55 -0
  75. xpk-0.16.1/examples/storage/filestore-manifest-attach.yaml +31 -0
  76. xpk-0.16.1/examples/storage/gcsfuse-manifest.yaml +33 -0
  77. xpk-0.16.1/examples/storage/lustre-manifest-attach.yaml +34 -0
  78. xpk-0.16.1/examples/storage/parallelstore-manifest-attach.yaml +34 -0
  79. xpk-0.16.1/examples/storage/pd-manifest-attach.yaml +30 -0
  80. xpk-0.16.1/golden_buddy.sh +150 -0
  81. xpk-0.16.1/goldens/Basic_cluster_create.txt +365 -0
  82. xpk-0.16.1/goldens/Batch.txt +19 -0
  83. xpk-0.16.1/goldens/Cluster_create_for_multi-host_nodepool.txt +367 -0
  84. xpk-0.16.1/goldens/Cluster_create_for_single-host_single-slice_TPU.txt +199 -0
  85. xpk-0.16.1/goldens/Cluster_create_private.txt +384 -0
  86. xpk-0.16.1/goldens/Cluster_create_sub-slicing.txt +388 -0
  87. xpk-0.16.1/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt +369 -0
  88. xpk-0.16.1/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt +369 -0
  89. xpk-0.16.1/goldens/Cluster_create_with_Managed_Lustre_driver.txt +370 -0
  90. xpk-0.16.1/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt +370 -0
  91. xpk-0.16.1/goldens/Cluster_create_with_gb200-4.txt +388 -0
  92. xpk-0.16.1/goldens/Cluster_create_with_shared_reservation.txt +370 -0
  93. xpk-0.16.1/goldens/Cluster_delete.txt +19 -0
  94. xpk-0.16.1/goldens/Cluster_delete_force.txt +16 -0
  95. xpk-0.16.1/goldens/Job_cancel.txt +14 -0
  96. xpk-0.16.1/goldens/Job_info.txt +21 -0
  97. xpk-0.16.1/goldens/Job_list.txt +14 -0
  98. xpk-0.16.1/goldens/NAP_cluster-create.txt +406 -0
  99. xpk-0.16.1/goldens/NAP_cluster-create_with_pathways.txt +418 -0
  100. xpk-0.16.1/goldens/Storage_list.txt +5 -0
  101. xpk-0.16.1/goldens/Workload_create.txt +156 -0
  102. xpk-0.16.1/goldens/Workload_create_pathways.txt +134 -0
  103. xpk-0.16.1/goldens/Workload_create_sub-slicing.txt +161 -0
  104. xpk-0.16.1/goldens/Workload_create_with_output-manifest-file.txt +157 -0
  105. xpk-0.16.1/goldens/Workload_delete.txt +17 -0
  106. xpk-0.16.1/goldens/Workload_list.txt +17 -0
  107. xpk-0.16.1/goldens.yaml +51 -0
  108. xpk-0.16.1/pylintrc +404 -0
  109. {xpk-0.15.0 → xpk-0.16.1}/pyproject.toml +2 -4
  110. xpk-0.16.1/src/integration/README.md +19 -0
  111. xpk-0.16.1/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  112. xpk-0.16.1/src/xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  113. xpk-0.16.1/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  114. xpk-0.16.1/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  115. xpk-0.16.1/src/xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  116. xpk-0.16.1/src/xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  117. xpk-0.16.1/src/xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  118. xpk-0.16.1/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  119. xpk-0.16.1/src/xpk/blueprints/a4/storage_crd.yaml +52 -0
  120. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/cluster.py +33 -12
  121. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/cluster_gcluster_test.py +5 -1
  122. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/cluster_test.py +125 -0
  123. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/config.py +3 -3
  124. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/inspector.py +5 -3
  125. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/kind.py +2 -0
  126. xpk-0.16.1/src/xpk/commands/managed_ml_diagnostics.py +249 -0
  127. xpk-0.16.1/src/xpk/commands/managed_ml_diagnostics_test.py +146 -0
  128. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/workload.py +125 -139
  129. xpk-0.16.1/src/xpk/commands/workload_test.py +206 -0
  130. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/blueprint/blueprint_generator.py +3 -0
  131. xpk-0.16.1/src/xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  132. xpk-0.16.1/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  133. xpk-0.16.1/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  134. xpk-0.16.1/src/xpk/core/blueprint/testing/data/a4.yaml +185 -0
  135. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/capacity.py +2 -0
  136. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/cluster.py +18 -47
  137. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/cluster_test.py +76 -1
  138. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/config.py +81 -7
  139. xpk-0.16.1/src/xpk/core/config_test.py +127 -0
  140. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/docker_container.py +3 -1
  141. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/docker_image.py +10 -6
  142. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/docker_resources.py +1 -10
  143. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/kjob.py +17 -16
  144. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/kueue_manager.py +13 -19
  145. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/kueue_manager_test.py +27 -1
  146. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/nap.py +13 -14
  147. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/nodepool.py +17 -15
  148. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/nodepool_test.py +25 -4
  149. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/pathways.py +23 -0
  150. xpk-0.16.1/src/xpk/core/pathways_test.py +57 -0
  151. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/resources.py +84 -27
  152. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/scheduling.py +128 -132
  153. xpk-0.16.1/src/xpk/core/scheduling_test.py +323 -0
  154. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/system_characteristics.py +179 -0
  155. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/system_characteristics_test.py +49 -1
  156. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/telemetry.py +4 -4
  157. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/telemetry_test.py +9 -9
  158. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/vertex.py +4 -3
  159. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  160. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/main.py +2 -0
  161. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/cluster.py +22 -88
  162. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/cluster_test.py +41 -0
  163. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/common.py +84 -0
  164. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/storage.py +10 -0
  165. xpk-0.16.1/src/xpk/parser/storage_test.py +47 -0
  166. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/workload.py +14 -41
  167. xpk-0.16.1/src/xpk/parser/workload_test.py +36 -0
  168. xpk-0.16.1/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  169. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/feature_flags.py +3 -0
  170. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/validation.py +2 -2
  171. xpk-0.16.1/src/xpk.egg-info/PKG-INFO +127 -0
  172. xpk-0.16.1/src/xpk.egg-info/SOURCES.txt +285 -0
  173. {xpk-0.15.0 → xpk-0.16.1}/src/xpk.egg-info/requires.txt +1 -0
  174. xpk-0.16.1/tools/Dockerfile-kjob +33 -0
  175. xpk-0.16.1/tools/build-kjob.sh +9 -0
  176. xpk-0.16.1/tools/install-gke-auth-plugin.sh +64 -0
  177. xpk-0.16.1/tools/install-xpk.sh +11 -0
  178. xpk-0.16.1/xpk-large-scale-guide.sh +562 -0
  179. xpk-0.16.1/xpk-notebooks.md +340 -0
  180. xpk-0.16.1/xpk-slurm-commands.md +382 -0
  181. xpk-0.16.1/xpk.py +39 -0
  182. xpk-0.15.0/PKG-INFO +0 -1666
  183. xpk-0.15.0/README.md +0 -1627
  184. xpk-0.15.0/src/xpk/commands/workload_test.py +0 -164
  185. xpk-0.15.0/src/xpk/core/config_test.py +0 -71
  186. xpk-0.15.0/src/xpk/core/scheduling_test.py +0 -110
  187. xpk-0.15.0/src/xpk/parser/workload_test.py +0 -82
  188. xpk-0.15.0/src/xpk.egg-info/PKG-INFO +0 -1666
  189. xpk-0.15.0/src/xpk.egg-info/SOURCES.txt +0 -152
  190. {xpk-0.15.0 → xpk-0.16.1}/LICENSE +0 -0
  191. {xpk-0.15.0 → xpk-0.16.1}/setup.cfg +0 -0
  192. {xpk-0.15.0 → xpk-0.16.1}/src/integration/__init__.py +0 -0
  193. {xpk-0.15.0 → xpk-0.16.1}/src/integration/docker_manager_test.py +0 -0
  194. {xpk-0.15.0 → xpk-0.16.1}/src/integration/gcluster_a3mega_test.py +0 -0
  195. {xpk-0.15.0 → xpk-0.16.1}/src/integration/gcluster_a3ultra_test.py +0 -0
  196. {xpk-0.15.0 → xpk-0.16.1}/src/integration/gcluster_a4_test.py +0 -0
  197. {xpk-0.15.0 → xpk-0.16.1}/src/integration/gcluster_test.py +0 -0
  198. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/__init__.py +0 -0
  199. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/api/__init__.py +0 -0
  200. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/api/storage_crd.yaml +0 -0
  201. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/__init__.py +0 -0
  202. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/batch.py +0 -0
  203. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/cluster_gcluster.py +0 -0
  204. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/common.py +0 -0
  205. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/info.py +0 -0
  206. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/job.py +0 -0
  207. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/kjob_common.py +0 -0
  208. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/run.py +0 -0
  209. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/shell.py +0 -0
  210. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/storage.py +0 -0
  211. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/commands/version.py +0 -0
  212. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/__init__.py +0 -0
  213. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/blueprint/__init__.py +0 -0
  214. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  215. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/blueprint/blueprint_test.py +0 -0
  216. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/blueprint/testing/__init__.py +0 -0
  217. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/capacity_test.py +0 -0
  218. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/cluster_private.py +0 -0
  219. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/commands.py +0 -0
  220. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/docker_manager.py +0 -0
  221. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/filestore.py +0 -0
  222. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/gcloud_context.py +0 -0
  223. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/gcloud_context_test.py +0 -0
  224. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/gcluster_manager.py +0 -0
  225. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/gcsfuse.py +0 -0
  226. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/jobset.py +0 -0
  227. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/monitoring.py +0 -0
  228. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/mtc.py +0 -0
  229. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/network.py +0 -0
  230. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/ray.py +0 -0
  231. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/remote_state/__init__.py +0 -0
  232. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  233. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  234. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/storage.py +0 -0
  235. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/testing/__init__.py +0 -0
  236. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/testing/commands_tester.py +0 -0
  237. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/testing/commands_tester_test.py +0 -0
  238. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/updates.py +0 -0
  239. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/updates_test.py +0 -0
  240. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/workload.py +0 -0
  241. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/workload_decorators/__init__.py +0 -0
  242. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
  243. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  244. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
  245. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
  246. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/core/workload_test.py +0 -0
  247. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/__init__.py +0 -0
  248. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/batch.py +0 -0
  249. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/config.py +0 -0
  250. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/core.py +0 -0
  251. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/info.py +0 -0
  252. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/inspector.py +0 -0
  253. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/job.py +0 -0
  254. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/kind.py +0 -0
  255. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/run.py +0 -0
  256. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/shell.py +0 -0
  257. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/validators.py +0 -0
  258. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/parser/version.py +0 -0
  259. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/telemetry_uploader.py +0 -0
  260. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/__init__.py +0 -0
  261. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
  262. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/filestore-pv.yaml +0 -0
  263. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/filestore-pvc.yaml +0 -0
  264. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/filestore-sc.yaml +0 -0
  265. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/fuse-pv.yaml +0 -0
  266. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/fuse-pvc.yaml +0 -0
  267. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/kueue_config.yaml.j2 +0 -0
  268. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
  269. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
  270. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/mtc-cpc.yaml +0 -0
  271. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/storage.yaml +0 -0
  272. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/templates/volume_bundle.yaml +0 -0
  273. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/__init__.py +0 -0
  274. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/console.py +0 -0
  275. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/console_test.py +0 -0
  276. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/execution_context.py +0 -0
  277. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/file.py +0 -0
  278. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/gcs_utils.py +0 -0
  279. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/kubectl.py +0 -0
  280. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/kueue.py +0 -0
  281. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/network.py +0 -0
  282. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/objects.py +0 -0
  283. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/templates.py +0 -0
  284. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/topology.py +0 -0
  285. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/topology_test.py +0 -0
  286. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/user_agent.py +0 -0
  287. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/user_agent_test.py +0 -0
  288. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/user_input.py +0 -0
  289. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/user_input_test.py +0 -0
  290. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/validation_test.py +0 -0
  291. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/versions.py +0 -0
  292. {xpk-0.15.0 → xpk-0.16.1}/src/xpk/utils/yaml.py +0 -0
  293. {xpk-0.15.0 → xpk-0.16.1}/src/xpk.egg-info/dependency_links.txt +0 -0
  294. {xpk-0.15.0 → xpk-0.16.1}/src/xpk.egg-info/entry_points.txt +0 -0
  295. {xpk-0.15.0 → xpk-0.16.1}/src/xpk.egg-info/top_level.txt +0 -0
@@ -0,0 +1,142 @@
1
+ # editor and IDE paraphernalia
2
+ .idea/
3
+
4
+ *__pycache__*
5
+ tmp/
6
+ .pytype
7
+ .mypy_cache
8
+ # Byte-compiled / optimized / DLL files
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+
13
+ # C extensions
14
+ *.so
15
+ bin/
16
+ # Distribution / packaging
17
+ .Python
18
+ build/
19
+ develop-eggs/
20
+ dist/
21
+ downloads/
22
+ eggs/
23
+ .eggs/
24
+ lib/
25
+ lib64/
26
+ parts/
27
+ sdist/
28
+ var/
29
+ wheels/
30
+ pip-wheel-metadata/
31
+ share/python-wheels/
32
+ *.egg-info/
33
+ .installed.cfg
34
+ *.egg
35
+ MANIFEST
36
+
37
+ # PyInstaller
38
+ # Usually these files are written by a python script from a template
39
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
40
+ *.manifest
41
+ *.spec
42
+
43
+ # Installer logs
44
+ pip-log.txt
45
+ pip-delete-this-directory.txt
46
+
47
+ # Unit test / coverage reports
48
+ htmlcov/
49
+ .tox/
50
+ .nox/
51
+ .coverage
52
+ .coverage.*
53
+ .cache
54
+ nosetests.xml
55
+ coverage.xml
56
+ *.cover
57
+ *.py,cover
58
+ .hypothesis/
59
+ .pytest_cache/
60
+
61
+ # Translations
62
+ *.mo
63
+ *.pot
64
+
65
+ # Django stuff:
66
+ *.log
67
+ local_settings.py
68
+ db.sqlite3
69
+ db.sqlite3-journal
70
+
71
+ # Flask stuff:
72
+ instance/
73
+ .webassets-cache
74
+
75
+ # Scrapy stuff:
76
+ .scrapy
77
+
78
+ # Sphinx documentation
79
+ docs/_build/
80
+
81
+ # PyBuilder
82
+ target/
83
+
84
+ # Jupyter Notebook
85
+ .ipynb_checkpoints
86
+
87
+ # IPython
88
+ profile_default/
89
+ ipython_config.py
90
+
91
+ # pyenv
92
+ .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
102
+ __pypackages__/
103
+
104
+ # Celery stuff
105
+ celerybeat-schedule
106
+ celerybeat.pid
107
+
108
+ # SageMath parsed files
109
+ *.sage.py
110
+
111
+ # Environments
112
+ .env
113
+ .venv
114
+ env/
115
+ venv/
116
+ ENV/
117
+ env.bak/
118
+ venv.bak/
119
+
120
+ # Spyder project settings
121
+ .spyderproject
122
+ .spyproject
123
+
124
+ # Rope project settings
125
+ .ropeproject
126
+
127
+ # mkdocs documentation
128
+ /site
129
+
130
+ # mypy
131
+ .mypy_cache/
132
+ .dmypy.json
133
+ dmypy.json
134
+
135
+ # Pyre type checker
136
+ .pyre/
137
+
138
+ # DS_Store files
139
+ **/.DS_Store
140
+
141
+ # XPK/Cluster Toolkit working directory
142
+ xpkclusters/*
@@ -0,0 +1 @@
1
+ * @scaliby @jamOne- @SikaGrr @FIoannides @stony-tark
@@ -0,0 +1,18 @@
1
+ # Description
2
+ <!--
3
+ Describe your change.
4
+ What does it introduce?
5
+ Why do we need it?
6
+ If you are releasing a feature, have you updated documentation?
7
+ -->
8
+
9
+ # Issue
10
+ <!--
11
+ Is there any related issue this change is trying to fix?
12
+ -->
13
+
14
+ # Testing
15
+ <!--
16
+ Have you performed any manual testing on your change?
17
+ Have you verified use cases affected by goldens?
18
+ -->
@@ -0,0 +1,35 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ name: "Install kjob"
16
+ description: "Installs kjob"
17
+
18
+ inputs:
19
+ version:
20
+ description: "The version to install"
21
+ required: false
22
+ default: "0.1.0"
23
+
24
+ runs:
25
+ using: composite
26
+ steps:
27
+ - uses: actions/download-artifact@v4
28
+ with:
29
+ name: kjob-artifact
30
+ - name: Set permissions
31
+ shell: bash
32
+ run: chmod +x kubectl-kjob
33
+ - name: Move binary
34
+ shell: bash
35
+ run: mv ./kubectl-kjob /usr/local/bin/kubectl-kjob
@@ -0,0 +1,32 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ name: "Install kueue"
16
+ description: "Installs kueue"
17
+
18
+ inputs:
19
+ version:
20
+ description: "The version to install"
21
+ required: false
22
+ default: "0.14.3"
23
+
24
+ runs:
25
+ using: composite
26
+ steps:
27
+ - name: Install kubectl-kueue
28
+ shell: bash
29
+ run: |
30
+ curl -Lo ./kubectl-kueue https://github.com/kubernetes-sigs/kueue/releases/download/v${{inputs.version}}/kubectl-kueue-linux-amd64
31
+ chmod +x ./kubectl-kueue
32
+ mv ./kubectl-kueue /usr/local/bin/kubectl-kueue
@@ -0,0 +1,53 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ name: "Setup test environment"
16
+ description: "Sets up integration test environment"
17
+
18
+ inputs:
19
+ credentials_json:
20
+ description: "GCP Service Account Key"
21
+ required: true
22
+
23
+ runs:
24
+ using: composite
25
+ steps:
26
+ - uses: actions/setup-python@v5
27
+ with:
28
+ python-version: "3.10"
29
+ - uses: actions/download-artifact@v4
30
+ with:
31
+ name: python-package-distributions
32
+ path: dist/
33
+ - name: Set permissions
34
+ run: chmod +x ./backoff_retry.sh
35
+ shell: bash
36
+ - uses: "google-github-actions/auth@v2"
37
+ with:
38
+ credentials_json: "${{ inputs.credentials_json }}"
39
+ - uses: google-github-actions/setup-gcloud@v2
40
+ with:
41
+ version: ">= 363.0.0"
42
+ install_components: "beta,gke-gcloud-auth-plugin"
43
+ - name: Authenticate Docker
44
+ run: gcloud auth configure-docker --quiet
45
+ shell: bash
46
+ - uses: ./.github/actions/install-kueue
47
+ - uses: ./.github/actions/install-kjob
48
+ - name: Install XPK
49
+ run: pip install dist/xpk-*.whl
50
+ shell: bash
51
+ - name: Install expect package
52
+ run: sudo apt-get install expect
53
+ shell: bash
@@ -0,0 +1,25 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+ # For more info, see:
15
+ # https://docs.github.com/en/repositories/releasing-projects-on-github/automatically-generated-release-notes#configuration-options
16
+
17
+
18
+ changelog:
19
+ categories:
20
+ - title: New Features
21
+ labels: [release-features]
22
+ - title: Improvments
23
+ labels: [release-improvments]
24
+ - title: Bug fixes
25
+ labels: [release-bugfix]
@@ -0,0 +1,30 @@
1
+ <!--
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ # Integration Test Workflows
18
+ The following tests are currently implemented through GitHub Actions:
19
+ * Create an XPK Cluster with zero node pools
20
+ * Delete the cluster created
21
+ * Create a Private XPK Cluster with 2x v4-8 nodepools
22
+ * Delete the cluster created
23
+ * Create an XPK Cluster with 2x v4-8 nodepools
24
+ * Delete the cluster created
25
+
26
+ ## Nightly Tests:
27
+ A cron job is scheduled to run daily at 12 AM PST. The jobs it runs are defined in `xpk/.github/workflows/nightly_tests.yaml`.
28
+
29
+ ## Integration Tests:
30
+ Integration tests are run on a push to the `main` branch and on an approved PR. The details of the jobs run are in `xpk/.github/workflows/build_tests.yaml`
@@ -0,0 +1,71 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ name: Build Tests
16
+
17
+ on:
18
+ pull_request:
19
+ merge_group:
20
+ types: [checks_requested]
21
+
22
+ permissions:
23
+ contents: read
24
+
25
+ jobs:
26
+ install-dependencies:
27
+ runs-on: ubuntu-22.04
28
+ strategy:
29
+ matrix:
30
+ python-version: ["3.10", "3.11"]
31
+ steps:
32
+ - uses: actions/checkout@v4
33
+ - uses: google-github-actions/setup-gcloud@v2
34
+ with:
35
+ version: '>= 363.0.0'
36
+ install_components: 'beta, gke-gcloud-auth-plugin'
37
+ - uses: actions/setup-python@v5
38
+ with:
39
+ python-version: ${{ matrix.python-version }}
40
+ - name: Check if cache exists
41
+ id: check-cache
42
+ uses: actions/cache@v3
43
+ with:
44
+ path: |
45
+ usr/local/bin/
46
+ ~/.cache/pip
47
+ ${{env.pythonLocation}}
48
+ key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}
49
+ lookup-only: true
50
+ - name: install dependencies
51
+ if : steps.check-cache.outputs.cache-hit != 'true'
52
+ run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
53
+ - name: Cache dependencies
54
+ if : steps.check-cache.outputs.cache-hit != 'true'
55
+ uses: actions/cache/save@v3
56
+ with:
57
+ path: |
58
+ /usr/local/bin/kubectl-kueue
59
+ /usr/local/bin/kubectl-kjob
60
+ ~/.cache/pip
61
+ ${{env.pythonLocation}}
62
+ key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}
63
+ linter:
64
+ needs: [install-dependencies]
65
+ uses: ./.github/workflows/reusable_lint_and_format.yml
66
+ verify-goldens:
67
+ needs: [install-dependencies]
68
+ uses: ./.github/workflows/reusable_goldens.yaml
69
+ run-unit-tests:
70
+ needs: [install-dependencies]
71
+ uses: ./.github/workflows/reusable_unit_tests.yaml
@@ -0,0 +1,71 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ # Note: This file must be named build_wheels.yaml, because that is the workflow filename specified in the PyPI publisher configuration.
16
+ # If you rename this file, update the PyPI publisher configuration to match:
17
+ # https://docs.pypi.org/trusted-publishers/adding-a-publisher/
18
+
19
+ name: PyPi releases
20
+
21
+ permissions:
22
+ contents: read
23
+
24
+ on:
25
+ push:
26
+ tags:
27
+ - "v[0-9]+.[0-9]+.[0-9]+"
28
+
29
+ jobs:
30
+ build_wheel:
31
+ uses: ./.github/workflows/reusable_build_wheel.yaml
32
+ approval:
33
+ name: Wait for approval
34
+ needs: [build_wheel]
35
+ runs-on: ubuntu-latest
36
+ environment:
37
+ name: release
38
+ steps:
39
+ - run: echo "Deployment approved!"
40
+ github_release:
41
+ name: Create GitHub release
42
+ runs-on: ubuntu-latest
43
+ needs: [approval]
44
+ permissions:
45
+ contents: write
46
+ steps:
47
+ - name: Checkout code
48
+ uses: actions/checkout@v4
49
+ - name: Create Release
50
+ env:
51
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
52
+ run: gh release create ${{ github.ref_name }} --generate-notes --draft=false --prerelease=false
53
+ publish-to-pypi:
54
+ name: Publish Python distribution to PyPI
55
+ needs: [approval]
56
+ runs-on: ubuntu-latest
57
+ permissions:
58
+ id-token: write
59
+ environment:
60
+ # We should configure trusted publishing as specified here:
61
+ # https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#configuring-trusted-publishing
62
+ name: pypi
63
+ url: https://pypi.org/p/xpk # PyPI project page for the xpk package
64
+ steps:
65
+ - name: Download all the dists
66
+ uses: actions/download-artifact@v4
67
+ with:
68
+ name: python-package-distributions
69
+ path: dist/
70
+ - name: Publish distribution to PyPI
71
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,45 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ name: Cleanup caches by a branch
16
+
17
+ permissions:
18
+ contents: read
19
+
20
+ on:
21
+ pull_request:
22
+ types:
23
+ - closed
24
+
25
+ jobs:
26
+ cleanup:
27
+ runs-on: ubuntu-latest
28
+ steps:
29
+ - name: Cleanup
30
+ run: |
31
+ echo "Fetching list of cache key"
32
+ cacheKeysForPR=$(gh cache list --ref $BRANCH --limit 100 --json id --jq '.[].id')
33
+
34
+ ## Disable exit-on-error so that a failed cache deletion does not fail the workflow.
35
+ set +e
36
+ echo "Deleting caches..."
37
+ for cacheKey in $cacheKeysForPR
38
+ do
39
+ gh cache delete $cacheKey
40
+ done
41
+ echo "Done"
42
+ env:
43
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
44
+ GH_REPO: ${{ github.repository }}
45
+ BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge