xpk 1.0.0__tar.gz → 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274) hide show
  1. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/integration_basic_cluster_create.yaml +3 -35
  2. xpk-1.1.1/.github/workflows/integration_gpu_cluster_create.yaml +78 -0
  3. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/nightly_tests.yaml +7 -6
  4. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_goldens.yaml +1 -1
  5. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/stale.yaml +4 -4
  6. {xpk-1.0.0 → xpk-1.1.1}/Makefile +2 -10
  7. {xpk-1.0.0/src/xpk.egg-info → xpk-1.1.1}/PKG-INFO +37 -21
  8. {xpk-1.0.0 → xpk-1.1.1}/README.md +36 -20
  9. {xpk-1.0.0 → xpk-1.1.1}/docs/testing.md +37 -16
  10. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/clusters.md +2 -2
  11. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +0 -4
  12. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +0 -4
  13. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/workloads.md +3 -0
  14. xpk-1.1.1/recipes/Basic_cluster_adapt.md +143 -0
  15. xpk-1.0.0/goldens/Basic_cluster_create.txt → xpk-1.1.1/recipes/Basic_cluster_create.md +14 -5
  16. xpk-1.1.1/recipes/Cluster_create_RayCluster.md +288 -0
  17. xpk-1.0.0/goldens/Cluster_create_for_multi-host_nodepool.txt → xpk-1.1.1/recipes/Cluster_create_for_multi-host_nodepool.md +15 -6
  18. xpk-1.1.1/recipes/Cluster_create_for_single-host_nodepool.md +275 -0
  19. xpk-1.0.0/goldens/Cluster_create_private.txt → xpk-1.1.1/recipes/Cluster_create_private.md +15 -6
  20. xpk-1.0.0/goldens/Cluster_create_sub-slicing.txt → xpk-1.1.1/recipes/Cluster_create_sub-slicing.md +14 -5
  21. xpk-1.0.0/goldens/Cluster_create_super-slicing.txt → xpk-1.1.1/recipes/Cluster_create_super-slicing.md +18 -9
  22. xpk-1.0.0/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt → xpk-1.1.1/recipes/Cluster_create_with_CPU_and_memory_limits_above_capacity.md +14 -5
  23. xpk-1.0.0/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt → xpk-1.1.1/recipes/Cluster_create_with_CPU_and_memory_limits_below_capacity.md +14 -5
  24. xpk-1.0.0/goldens/Cluster_create_with_Managed_Lustre_driver.txt → xpk-1.1.1/recipes/Cluster_create_with_Managed_Lustre_driver.md +14 -5
  25. xpk-1.0.0/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt → xpk-1.1.1/recipes/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.md +14 -5
  26. xpk-1.0.0/goldens/Cluster_create_with_gb200-4.txt → xpk-1.1.1/recipes/Cluster_create_with_gb200-4.md +15 -6
  27. xpk-1.0.0/goldens/Cluster_create_with_shared_reservation.txt → xpk-1.1.1/recipes/Cluster_create_with_shared_reservation.md +14 -5
  28. xpk-1.0.0/goldens/Cluster_delete.txt → xpk-1.1.1/recipes/Cluster_delete.md +10 -1
  29. xpk-1.0.0/goldens/Cluster_delete_force.txt → xpk-1.1.1/recipes/Cluster_delete_force.md +10 -1
  30. xpk-1.0.0/goldens/NAP_cluster-create.txt → xpk-1.1.1/recipes/NAP_cluster-create.md +14 -5
  31. xpk-1.0.0/goldens/NAP_cluster-create_with_pathways.txt → xpk-1.1.1/recipes/NAP_cluster-create_with_pathways.md +14 -5
  32. xpk-1.0.0/goldens/Storage_list.txt → xpk-1.1.1/recipes/Storage_list.md +10 -1
  33. xpk-1.0.0/goldens/Workload_create_with_output-manifest-file.txt → xpk-1.1.1/recipes/Workload_create.md +15 -9
  34. xpk-1.0.0/goldens/Workload_create_pathways.txt → xpk-1.1.1/recipes/Workload_create_pathways.md +13 -6
  35. xpk-1.0.0/goldens/Workload_create_sub-slicing.txt → xpk-1.1.1/recipes/Workload_create_sub-slicing.md +15 -8
  36. xpk-1.0.0/goldens/Workload_create_super-slicing.txt → xpk-1.1.1/recipes/Workload_create_super-slicing.md +16 -9
  37. xpk-1.0.0/goldens/Workload_create.txt → xpk-1.1.1/recipes/Workload_create_with_output-manifest-file.md +16 -8
  38. xpk-1.0.0/goldens/Workload_delete.txt → xpk-1.1.1/recipes/Workload_delete.md +10 -1
  39. xpk-1.0.0/goldens/Workload_list.txt → xpk-1.1.1/recipes/Workload_list.md +10 -1
  40. xpk-1.1.1/recipes/comprehensive-demo.md +83 -0
  41. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/cluster.py +29 -30
  42. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/cluster_gcluster.py +19 -14
  43. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/cluster_test.py +1 -21
  44. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/common.py +39 -6
  45. xpk-1.1.1/src/xpk/commands/common_test.py +170 -0
  46. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/info.py +9 -5
  47. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/inspector.py +33 -4
  48. xpk-1.1.1/src/xpk/commands/inspector_test.py +142 -0
  49. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/workload.py +35 -17
  50. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/workload_test.py +70 -3
  51. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/blueprint_generator.py +19 -8
  52. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
  53. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a4.yaml +3 -1
  54. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/capacity.py +37 -17
  55. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/capacity_test.py +66 -1
  56. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/cluster.py +10 -10
  57. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/cluster_private.py +3 -3
  58. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/cluster_test.py +29 -2
  59. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/docker_container.py +55 -30
  60. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/docker_manager.py +4 -4
  61. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/docker_resources.py +4 -1
  62. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/kueue_manager.py +6 -8
  63. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/kueue_manager_test.py +4 -5
  64. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/nap.py +14 -3
  65. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/nodepool.py +46 -13
  66. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/nodepool_test.py +143 -8
  67. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/pathways.py +4 -8
  68. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/remote_state/fuse_remote_state.py +1 -1
  69. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/scheduling.py +16 -13
  70. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/scheduling_test.py +15 -7
  71. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/system_characteristics.py +6 -0
  72. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/telemetry.py +11 -1
  73. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/telemetry_test.py +39 -0
  74. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/testing/commands_tester.py +26 -0
  75. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/testing/commands_tester_test.py +20 -1
  76. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/rdma_decorator.py +9 -0
  77. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/cluster.py +11 -1
  78. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/cluster_test.py +59 -1
  79. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/common.py +11 -0
  80. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/storage.py +3 -3
  81. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/console.py +1 -1
  82. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/feature_flags.py +7 -3
  83. {xpk-1.0.0 → xpk-1.1.1/src/xpk.egg-info}/PKG-INFO +37 -21
  84. {xpk-1.0.0 → xpk-1.1.1}/src/xpk.egg-info/SOURCES.txt +32 -39
  85. xpk-1.1.1/src/xpk.egg-info/top_level.txt +1 -0
  86. {xpk-1.0.0 → xpk-1.1.1}/tools/install-xpk.sh +1 -1
  87. xpk-1.1.1/tools/recipes.py +235 -0
  88. xpk-1.0.0/.github/workflows/integration_legacy_tests.yaml +0 -66
  89. xpk-1.0.0/.github/workflows/reusable_integration_tests.yaml +0 -61
  90. xpk-1.0.0/docs/usage/tpu7x/clusters.md +0 -329
  91. xpk-1.0.0/docs/usage/tpu7x/workloads.md +0 -269
  92. xpk-1.0.0/golden_buddy.sh +0 -150
  93. xpk-1.0.0/goldens.yaml +0 -47
  94. xpk-1.0.0/src/integration/README.md +0 -19
  95. xpk-1.0.0/src/integration/docker_manager_test.py +0 -102
  96. xpk-1.0.0/src/integration/gcluster_a3mega_test.py +0 -215
  97. xpk-1.0.0/src/integration/gcluster_a3ultra_test.py +0 -187
  98. xpk-1.0.0/src/integration/gcluster_a4_test.py +0 -187
  99. xpk-1.0.0/src/integration/gcluster_test.py +0 -107
  100. xpk-1.0.0/src/xpk/utils/__init__.py +0 -15
  101. xpk-1.0.0/src/xpk/utils/user_input.py +0 -48
  102. xpk-1.0.0/src/xpk/utils/user_input_test.py +0 -92
  103. xpk-1.0.0/src/xpk.egg-info/top_level.txt +0 -2
  104. {xpk-1.0.0 → xpk-1.1.1}/.dockerignore +0 -0
  105. {xpk-1.0.0 → xpk-1.1.1}/.github/CODEOWNERS +0 -0
  106. {xpk-1.0.0 → xpk-1.1.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  107. {xpk-1.0.0 → xpk-1.1.1}/.github/actions/install-kueue/action.yml +0 -0
  108. {xpk-1.0.0 → xpk-1.1.1}/.github/actions/setup-test-env/action.yml +0 -0
  109. {xpk-1.0.0 → xpk-1.1.1}/.github/release.yaml +0 -0
  110. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/README.md +0 -0
  111. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/build_tests.yaml +0 -0
  112. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/build_wheels.yaml +0 -0
  113. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/cleanup.yaml +0 -0
  114. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/gemini-dispatch.yml +0 -0
  115. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/gemini-invoke.yml +0 -0
  116. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/gemini-review.yml +0 -0
  117. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/gemini-scheduled-triage.yml +0 -0
  118. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/gemini-triage.yml +0 -0
  119. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/integration_pathways_cluster_create.yaml +0 -0
  120. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/integration_ray_cluster_create.yaml +0 -0
  121. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/integration_storage_tests.yaml +0 -0
  122. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/label-validation.yaml +0 -0
  123. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/periodic_release.yaml +0 -0
  124. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/release_branch_versioning.yaml +0 -0
  125. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_build_scripts.yaml +0 -0
  126. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_build_wheel.yaml +0 -0
  127. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_lint_and_format.yml +0 -0
  128. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_storage_create.yaml +0 -0
  129. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_storage_delete.yaml +0 -0
  130. {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_unit_tests.yaml +0 -0
  131. {xpk-1.0.0 → xpk-1.1.1}/.gitignore +0 -0
  132. {xpk-1.0.0 → xpk-1.1.1}/.pre-commit-config.yaml +0 -0
  133. {xpk-1.0.0 → xpk-1.1.1}/LICENSE +0 -0
  134. {xpk-1.0.0 → xpk-1.1.1}/backoff_retry.sh +0 -0
  135. {xpk-1.0.0 → xpk-1.1.1}/data/Dockerfile +0 -0
  136. {xpk-1.0.0 → xpk-1.1.1}/docs/code-of-conduct.md +0 -0
  137. {xpk-1.0.0 → xpk-1.1.1}/docs/contributing.md +0 -0
  138. {xpk-1.0.0 → xpk-1.1.1}/docs/installation.md +0 -0
  139. {xpk-1.0.0 → xpk-1.1.1}/docs/permissions.md +0 -0
  140. {xpk-1.0.0 → xpk-1.1.1}/docs/troubleshooting.md +0 -0
  141. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/advanced.md +0 -0
  142. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/autoprovisioning.md +0 -0
  143. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/cpu.md +0 -0
  144. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/docker.md +0 -0
  145. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/gpu.md +0 -0
  146. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/inspector.md +0 -0
  147. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/storage.md +0 -0
  148. {xpk-1.0.0 → xpk-1.1.1}/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +0 -0
  149. {xpk-1.0.0 → xpk-1.1.1}/examples/fake_training.py +0 -0
  150. {xpk-1.0.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/check_cuda.sh +0 -0
  151. {xpk-1.0.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/requirements.txt +0 -0
  152. {xpk-1.0.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/train.py +0 -0
  153. {xpk-1.0.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/train.slurm +0 -0
  154. {xpk-1.0.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/training_data.jsonl +0 -0
  155. {xpk-1.0.0 → xpk-1.1.1}/examples/nccl/nccl-a3mega.sh +0 -0
  156. {xpk-1.0.0 → xpk-1.1.1}/examples/nccl/nccl-a3ultra.sh +0 -0
  157. {xpk-1.0.0 → xpk-1.1.1}/examples/nccl/nccl.md +0 -0
  158. {xpk-1.0.0 → xpk-1.1.1}/examples/storage/filestore-manifest-attach.yaml +0 -0
  159. {xpk-1.0.0 → xpk-1.1.1}/examples/storage/gcsfuse-manifest.yaml +0 -0
  160. {xpk-1.0.0 → xpk-1.1.1}/examples/storage/lustre-manifest-attach.yaml +0 -0
  161. {xpk-1.0.0 → xpk-1.1.1}/examples/storage/parallelstore-manifest-attach.yaml +0 -0
  162. {xpk-1.0.0 → xpk-1.1.1}/examples/storage/pd-manifest-attach.yaml +0 -0
  163. {xpk-1.0.0 → xpk-1.1.1}/pylintrc +0 -0
  164. {xpk-1.0.0 → xpk-1.1.1}/pyproject.toml +0 -0
  165. {xpk-1.0.0 → xpk-1.1.1}/setup.cfg +0 -0
  166. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/__init__.py +0 -0
  167. {xpk-1.0.0/src/integration → xpk-1.1.1/src/xpk/api}/__init__.py +0 -0
  168. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/api/storage_crd.yaml +0 -0
  169. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +0 -0
  170. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3mega/storage_crd.yaml +0 -0
  171. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +0 -0
  172. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +0 -0
  173. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/nccl-installer.yaml +0 -0
  174. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/storage_crd.yaml +0 -0
  175. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a4/config-map.yaml.tftpl +0 -0
  176. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +0 -0
  177. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a4/storage_crd.yaml +0 -0
  178. {xpk-1.0.0/src/xpk/api → xpk-1.1.1/src/xpk/commands}/__init__.py +0 -0
  179. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/cluster_gcluster_test.py +0 -0
  180. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/config.py +0 -0
  181. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/managed_ml_diagnostics.py +0 -0
  182. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/managed_ml_diagnostics_test.py +0 -0
  183. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/storage.py +0 -0
  184. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/version.py +0 -0
  185. {xpk-1.0.0/src/xpk/commands → xpk-1.1.1/src/xpk/core}/__init__.py +0 -0
  186. {xpk-1.0.0/src/xpk/core → xpk-1.1.1/src/xpk/core/blueprint}/__init__.py +0 -0
  187. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  188. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/blueprint_test.py +0 -0
  189. {xpk-1.0.0/src/xpk/core/blueprint → xpk-1.1.1/src/xpk/core/blueprint/testing}/__init__.py +0 -0
  190. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a3_mega.yaml +0 -0
  191. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +0 -0
  192. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/commands.py +0 -0
  193. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/config.py +0 -0
  194. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/config_test.py +0 -0
  195. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/docker_image.py +0 -0
  196. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/filestore.py +0 -0
  197. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/gcloud_context.py +0 -0
  198. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/gcloud_context_test.py +0 -0
  199. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/gcluster_manager.py +0 -0
  200. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/gcsfuse.py +0 -0
  201. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/jobset.py +0 -0
  202. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/monitoring.py +0 -0
  203. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/mtc.py +0 -0
  204. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/network.py +0 -0
  205. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/pathways_test.py +0 -0
  206. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/ray.py +0 -0
  207. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/remote_state/__init__.py +0 -0
  208. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  209. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/resources.py +0 -0
  210. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/storage.py +0 -0
  211. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/system_characteristics_test.py +0 -0
  212. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/testing/__init__.py +0 -0
  213. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/updates.py +0 -0
  214. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/updates_test.py +0 -0
  215. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/vertex.py +0 -0
  216. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload.py +0 -0
  217. {xpk-1.0.0/src/xpk/core/blueprint/testing → xpk-1.1.1/src/xpk/core/workload_decorators}/__init__.py +0 -0
  218. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  219. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
  220. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
  221. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
  222. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_test.py +0 -0
  223. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/main.py +0 -0
  224. {xpk-1.0.0/src/xpk/core/workload_decorators → xpk-1.1.1/src/xpk/parser}/__init__.py +0 -0
  225. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/common_test.py +0 -0
  226. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/config.py +0 -0
  227. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/core.py +0 -0
  228. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/info.py +0 -0
  229. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/inspector.py +0 -0
  230. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/storage_test.py +0 -0
  231. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/validators.py +0 -0
  232. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/version.py +0 -0
  233. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/workload.py +0 -0
  234. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/workload_test.py +0 -0
  235. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/telemetry_uploader.py +0 -0
  236. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/__init__.py +0 -0
  237. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +0 -0
  238. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
  239. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/filestore-pv.yaml +0 -0
  240. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/filestore-pvc.yaml +0 -0
  241. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/filestore-sc.yaml +0 -0
  242. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/fuse-pv.yaml +0 -0
  243. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/fuse-pvc.yaml +0 -0
  244. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/kueue_config.yaml.j2 +0 -0
  245. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
  246. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
  247. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/kueue_super_slicing_topology.yaml.j2 +0 -0
  248. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/mtc-cpc.yaml +0 -0
  249. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/storage.yaml +0 -0
  250. {xpk-1.0.0/src/xpk/parser → xpk-1.1.1/src/xpk/utils}/__init__.py +0 -0
  251. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/console_test.py +0 -0
  252. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/execution_context.py +0 -0
  253. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/file.py +0 -0
  254. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/gcs_utils.py +0 -0
  255. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/kubectl.py +0 -0
  256. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/kueue.py +0 -0
  257. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/network.py +0 -0
  258. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/objects.py +0 -0
  259. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/templates.py +0 -0
  260. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/topology.py +0 -0
  261. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/topology_test.py +0 -0
  262. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/user_agent.py +0 -0
  263. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/user_agent_test.py +0 -0
  264. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/validation.py +0 -0
  265. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/validation_test.py +0 -0
  266. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/versions.py +0 -0
  267. {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/yaml.py +0 -0
  268. {xpk-1.0.0 → xpk-1.1.1}/src/xpk.egg-info/dependency_links.txt +0 -0
  269. {xpk-1.0.0 → xpk-1.1.1}/src/xpk.egg-info/entry_points.txt +0 -0
  270. {xpk-1.0.0 → xpk-1.1.1}/src/xpk.egg-info/requires.txt +0 -0
  271. {xpk-1.0.0 → xpk-1.1.1}/tools/install-gke-auth-plugin.sh +0 -0
  272. {xpk-1.0.0 → xpk-1.1.1}/xpk-large-scale-guide.sh +0 -0
  273. {xpk-1.0.0 → xpk-1.1.1}/xpk-notebooks.md +0 -0
  274. {xpk-1.0.0 → xpk-1.1.1}/xpk.py +0 -0
@@ -31,7 +31,7 @@ jobs:
31
31
  group: nightly-test-cluster-group-empty
32
32
  cancel-in-progress: false
33
33
  env:
34
- EMPTY_CLUSTER_NAME: nightly-xpk-zero-nodepools
34
+ EMPTY_CLUSTER_NAME: nightly-xpk-zero
35
35
  steps:
36
36
  - uses: actions/download-artifact@v4
37
37
  with:
@@ -59,7 +59,7 @@ jobs:
59
59
  group: nightly-test-cluster-group-private
60
60
  cancel-in-progress: false
61
61
  env:
62
- PRIVATE_CLUSTER_NAME: nightly-xpk-private-2-v4-8-nodepools
62
+ PRIVATE_CLUSTER_NAME: nightly-xpk-private-2-v4-8
63
63
  steps:
64
64
  - uses: actions/download-artifact@v4
65
65
  with:
@@ -83,38 +83,6 @@ jobs:
83
83
  with:
84
84
  name: empty-private-cluster-nodepool-log-${{github.run_id}}
85
85
  path: /tmp/NodepoolCreate-${{ env.PRIVATE_CLUSTER_NAME }}-np-*
86
- dws_flex_cluster:
87
- runs-on: [ubuntu-22.04]
88
- concurrency: # We support one build test to run at a time currently.
89
- group: nightly-test-cluster-group-flex
90
- cancel-in-progress: false
91
- env:
92
- DWS_FLEX_CLUSTER_NAME: xpk-dws-nightly-test-2-v4-8
93
- steps:
94
- - uses: actions/download-artifact@v4
95
- with:
96
- name: custom-scripts
97
- - name: Setup environment
98
- uses: ./.github/actions/setup-test-env
99
- with:
100
- credentials_json: "${{ secrets.GCP_SA_KEY }}"
101
- - name: Check xpk installation
102
- run: xpk version
103
- - name: Create a DWS flex queued xpk cluster
104
- run: xpk cluster create --cluster ${DWS_FLEX_CLUSTER_NAME} --tpu-type=v5p-8 --num-slices=1 --zone=us-east5-a --default-pool-cpu-num-nodes=2 --flex --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS_DWS}"
105
- - name: Run dws flex queued TPU workload
106
- run: xpk workload create --workload xpktest-build-${{ github.run_attempt }}-dws --cluster ${DWS_FLEX_CLUSTER_NAME} --zone=us-east5-a --tpu-type=v5p-8 --flex --command "echo foo" --num-slices=1
107
- - name: Wait for workload completion and confirm it succeeded
108
- run: xpk workload list --cluster ${DWS_FLEX_CLUSTER_NAME} --zone=us-east5-a --wait-for-job-completion xpktest-build-${{ github.run_attempt }}-dws --timeout 1000
109
- - name: Delete the DWS flex queued cluster
110
- if: always()
111
- run: xpk cluster delete --cluster ${DWS_FLEX_CLUSTER_NAME} --zone=us-east5-a --force
112
- - name: Upload DWS cluster nodepool creation log
113
- if: always()
114
- uses: actions/upload-artifact@v4
115
- with:
116
- name: empty-dws-cluster-nodepool-log-${{github.run_id}}
117
- path: /tmp/NodepoolCreate-${{ env.DWS_FLEX_CLUSTER_NAME }}-np-*
118
86
 
119
87
  cluster-create-and-delete:
120
88
  runs-on: [ubuntu-22.04]
@@ -122,7 +90,7 @@ jobs:
122
90
  group: nightly-test-cluster-group-tpu
123
91
  cancel-in-progress: false
124
92
  env:
125
- TPU_CLUSTER_NAME: nightly-xpk-2-v5p-8-nodepools
93
+ TPU_CLUSTER_NAME: nightly-xpk-2-v5p-8
126
94
  WORKLOAD_NAME: xpktest-nightly-${{ github.run_attempt }}
127
95
  steps:
128
96
  - uses: actions/download-artifact@v4
@@ -0,0 +1,78 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ name: Basic GPU cluster create
16
+
17
+ on:
18
+ workflow_call:
19
+
20
+ permissions:
21
+ contents: read
22
+
23
+ jobs:
24
+ gpu-cluster-create-and-delete:
25
+ runs-on: [ubuntu-22.04]
26
+ concurrency:
27
+ group: nightly-test-cluster-group-gpu
28
+ cancel-in-progress: false
29
+ env:
30
+ GPU_CLUSTER_NAME: nightly-xpk-b200
31
+ WORKLOAD_NAME: xpktest-gpu-nightly-${{ github.run_attempt }}
32
+ steps:
33
+ - uses: actions/download-artifact@v4
34
+ with:
35
+ name: custom-scripts
36
+ - name: Setup environment
37
+ uses: ./.github/actions/setup-test-env
38
+ with:
39
+ credentials_json: "${{ secrets.GCP_SA_KEY }}"
40
+ - name: Check xpk installation
41
+ run: xpk version
42
+ - name: 'Setup Service Account for XPK'
43
+ run: |
44
+ # 1. Clear any existing WIF configurations to avoid conflicts
45
+ rm -rf $HOME/.config/gcloud
46
+ mkdir -p $HOME/.config/gcloud
47
+
48
+ # 2. Write the Key File
49
+ echo '${{ secrets.GCP_SA_KEY }}' > $HOME/.config/gcloud/application_default_credentials.json
50
+
51
+ # 3. Activate the Service Account
52
+ # This updates the internal config files to point to the key file.
53
+ # When Docker mounts the directory, it will now see "Active Account: Service Account"
54
+ gcloud auth activate-service-account --key-file=$HOME/.config/gcloud/application_default_credentials.json --project=cloud-tpu-multipod-dev
55
+
56
+ # 4. Set Env Var for the host (GitHub Runner)
57
+ echo "GOOGLE_APPLICATION_CREDENTIALS=$HOME/.config/gcloud/application_default_credentials.json" >> $GITHUB_ENV
58
+ - name: Create an XPK Cluster with 1 x b200 GPU
59
+ run: xpk cluster create --cluster $GPU_CLUSTER_NAME --device-type=b200-8 --zone=asia-northeast1-b --default-pool-cpu-machine-type=n1-standard-16 --spot
60
+ - name: Authenticate Docker
61
+ run: gcloud auth configure-docker --quiet
62
+ - name: Run a base-docker-image workload
63
+ run: xpk workload create --cluster $GPU_CLUSTER_NAME --workload $WORKLOAD_NAME --docker-image='nvidia/cuda:12.1.0-base-ubuntu22.04' --command "nvidia-smi" --zone=asia-northeast1-b --device-type=b200-8
64
+ - name: List out the workloads on the cluster
65
+ run: xpk workload list --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b
66
+ - name: Wait for workload completion and confirm it succeeded
67
+ run: xpk workload list --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b --wait-for-job-completion $WORKLOAD_NAME --timeout 600
68
+ - name: Delete the workload on the cluster
69
+ run: xpk workload delete --workload $WORKLOAD_NAME --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b
70
+ - name: Delete the cluster created
71
+ if: always()
72
+ run: xpk cluster delete --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b --force
73
+ - name: Upload cluster nodepool creation log
74
+ if: always()
75
+ uses: actions/upload-artifact@v4
76
+ with:
77
+ name: gpu-cluster-nodepool-log-${{github.run_id}}
78
+ path: /tmp/NodepoolCreate-${{ env.GPU_CLUSTER_NAME }}-np-*
@@ -16,8 +16,8 @@ name: Nightly Tests
16
16
 
17
17
  on:
18
18
  workflow_dispatch:
19
- schedule: # Schedule the job run at 12AM PST daily.
20
- - cron: "0 8 * * *"
19
+ schedule: # Schedule the job run at 6AM UTC daily.
20
+ - cron: "0 6 * * *"
21
21
 
22
22
  permissions:
23
23
  contents: read
@@ -32,6 +32,11 @@ jobs:
32
32
  uses: ./.github/workflows/integration_basic_cluster_create.yaml
33
33
  secrets: inherit
34
34
 
35
+ gpu_cluster_create:
36
+ needs: [build_actions, build_wheel]
37
+ uses: ./.github/workflows/integration_gpu_cluster_create.yaml
38
+ secrets: inherit
39
+
35
40
  pathways_cluster_create:
36
41
  needs: [build_actions, build_wheel]
37
42
  uses: ./.github/workflows/integration_pathways_cluster_create.yaml
@@ -41,10 +46,6 @@ jobs:
41
46
  needs: [build_actions, build_wheel]
42
47
  uses: ./.github/workflows/integration_ray_cluster_create.yaml
43
48
  secrets: inherit
44
- legacy_integration:
45
- needs: [build_actions, build_wheel]
46
- uses: ./.github/workflows/integration_legacy_tests.yaml
47
- secrets: inherit
48
49
  storage-tests:
49
50
  needs: [build_actions, build_wheel]
50
51
  uses: ./.github/workflows/integration_storage_tests.yaml
@@ -38,7 +38,7 @@ jobs:
38
38
  key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}}
39
39
  restore-keys: xpk-deps-3.10-
40
40
  - name: Verify goldens
41
- run: ./golden_buddy.sh verify goldens.yaml goldens
41
+ run: python3 tools/recipes.py golden recipes/*.md
42
42
  env:
43
43
  UPDATE_GOLDEN_COMMAND: make goldens
44
44
  XPK_VERSION_OVERRIDE: v0.0.0
@@ -12,11 +12,10 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License
14
14
 
15
-
16
- name: 'Close stale issues and PRs'
15
+ name: "Close stale issues and PRs"
17
16
  on:
18
17
  schedule:
19
- - cron: '30 1 * * *'
18
+ - cron: "30 1 * * *"
20
19
 
21
20
  jobs:
22
21
  stale:
@@ -24,7 +23,8 @@ jobs:
24
23
  steps:
25
24
  - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
26
25
  with:
27
- stale-pr-message: 'This pull request is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
26
+ days-before-issue-stale: -1
27
+ stale-pr-message: "This pull request is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 7 days."
28
28
  days-before-pr-stale: 30
29
29
  days-before-pr-close: 7
30
30
  operations-per-run: 100
@@ -1,12 +1,7 @@
1
- KUEUE_REPO=https://github.com/kubernetes-sigs/kueue.git
2
-
3
- KUBECTL_VERSION := $(shell curl -L -s https://dl.k8s.io/release/stable.txt)
4
- KUEUE_VERSION=v0.14.3
5
-
6
1
  OS := $(shell uname -s | tr A-Z a-z)
7
2
  PLATFORM := $(shell uname -m | sed -e 's/aarch64/arm64/' | sed -e 's/x86_64/amd64/')
8
3
 
9
- KUBECTL_URL = "https://dl.k8s.io/release/$(KUBECTL_VERSION)/bin/$(OS)/$(PLATFORM)/kubectl"
4
+ KUEUE_VERSION=v0.15.2
10
5
  KUEUECTL_URL = "https://github.com/kubernetes-sigs/kueue/releases/download/$(KUEUE_VERSION)/kubectl-kueue-$(OS)-$(PLATFORM)"
11
6
 
12
7
  PROJECT_DIR := $(realpath $(shell dirname $(firstword $(MAKEFILE_LIST))))
@@ -34,12 +29,9 @@ install-pytest:
34
29
  run-unittests:
35
30
  XPK_TESTER=false XPK_VERSION_OVERRIDE=v0.0.0 pytest -vv src/xpk/
36
31
 
37
- run-integrationtests:
38
- XPK_TESTER=false XPK_VERSION_OVERRIDE=v0.0.0 pytest src/integration/
39
-
40
32
  .PHONY: goldens
41
33
  goldens:
42
- XPK_TESTER=false XPK_VERSION_OVERRIDE=v0.0.0 ./golden_buddy.sh update goldens.yaml goldens
34
+ XPK_TESTER=false XPK_VERSION_OVERRIDE=v0.0.0 python3 tools/recipes.py update recipes/*.md
43
35
 
44
36
  .PHONY: mkdir-bin
45
37
  mkdir-bin:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 1.0.0
3
+ Version: 1.1.1
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -93,28 +93,41 @@ XPK supports a variety of hardware accelerators.
93
93
 
94
94
  XPK also supports the following [Google Cloud Storage solutions](./docs/usage/storage.md):
95
95
 
96
- | Storage Type | Documentation |
97
- |--------------------------------------------|------------------------------------------------------------------------------------------|
98
- | Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse) |
99
- | Filestore | [docs](./docs/usage/storage.md#filestore) |
100
- | Parallelstore | [docs](./docs/usage/storage.md#parallelstore) |
101
- | Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk) |
96
+ | Storage Type | Documentation |
97
+ | ------------------------------------------ | ----------------------------------------------------------------------- |
98
+ | Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse) |
99
+ | Filestore | [docs](./docs/usage/storage.md#filestore) |
100
+ | Parallelstore | [docs](./docs/usage/storage.md#parallelstore) |
101
+ | Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk) |
102
102
 
103
103
  # Documentation
104
104
 
105
- * [Permissions](./docs/permissions.md)
106
- * [Installation](./docs/installation.md)
107
- * Usage:
108
- * [Clusters](./docs/usage/clusters.md)
109
- * [GPU](./docs/usage/gpu.md)
110
- * [CPU](./docs/usage/cpu.md)
111
- * [Autoprovisioning](./docs/usage/autoprovisioning.md)
112
- * [Workloads](./docs/usage/workloads.md)
113
- * [Docker](./docs/usage/docker.md)
114
- * [Storage](./docs/usage/storage.md)
115
- * [Advanced](./docs/usage/advanced.md)
116
- * [Inspector](./docs/usage/inspector.md)
117
- * [Troubleshooting](./docs/troubleshooting.md)
105
+ - [Permissions](./docs/permissions.md)
106
+ - [Installation](./docs/installation.md)
107
+ - Usage:
108
+ - [Clusters](./docs/usage/clusters.md)
109
+ - [GPU](./docs/usage/gpu.md)
110
+ - [CPU](./docs/usage/cpu.md)
111
+ - [Autoprovisioning](./docs/usage/autoprovisioning.md)
112
+ - [Workloads](./docs/usage/workloads.md)
113
+ - [Docker](./docs/usage/docker.md)
114
+ - [Storage](./docs/usage/storage.md)
115
+ - [Advanced](./docs/usage/advanced.md)
116
+ - [Inspector](./docs/usage/inspector.md)
117
+ - [Troubleshooting](./docs/troubleshooting.md)
118
+
119
+ # Dependencies
120
+
121
+ | Dependency | When used |
122
+ | ------------------------------------------------------------------------------------------------------------ | --------------------------- |
123
+ | [Google Cloud SDK (gcloud)](https://cloud.google.com/sdk/docs/install) | _always_ |
124
+ | [kubectl](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) | _always_ |
125
+ | [ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit) | Provisioning GPU clusters |
126
+ | [Kueue](https://github.com/kubernetes-sigs/kueue) | Scheduling workloads |
127
+ | [JobSet](https://github.com/kubernetes-sigs/jobset) | Workload creation |
128
+ | [Docker](https://docs.docker.com/engine/install/) | Building workload container |
129
+ | [CoreDNS](https://github.com/coredns/deployment/tree/master/kubernetes) | Cluster setup |
130
+ | [PathwaysJob](https://github.com/google/pathways-job) | Running Pathways workloads |
118
131
 
119
132
  # Privacy notice
120
133
 
@@ -129,11 +142,14 @@ XPK telemetry overall is handled in accordance with the [Google Privacy Policy](
129
142
  you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
130
143
  [Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
131
144
 
132
-
133
145
  # Contributing
134
146
 
135
147
  Please read [`contributing.md`](./docs/contributing.md) for details on our code of conduct, and the process for submitting pull requests to us.
136
148
 
149
+ # Get involved
150
+
151
+ We'd love to hear from you! If you have questions or want to discuss ideas, join us on [GitHub Discussions](https://github.com/AI-Hypercomputer/xpk/discussions). Found a bug or have a feature request? Please let us know on [GitHub Issues](https://github.com/AI-Hypercomputer/xpk/issues).
152
+
137
153
  # License
138
154
 
139
155
  This project is licensed under the Apache License 2.0 - see the [`LICENSE`](./LICENSE) file for details.
@@ -52,28 +52,41 @@ XPK supports a variety of hardware accelerators.
52
52
 
53
53
  XPK also supports the following [Google Cloud Storage solutions](./docs/usage/storage.md):
54
54
 
55
- | Storage Type | Documentation |
56
- |--------------------------------------------|------------------------------------------------------------------------------------------|
57
- | Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse) |
58
- | Filestore | [docs](./docs/usage/storage.md#filestore) |
59
- | Parallelstore | [docs](./docs/usage/storage.md#parallelstore) |
60
- | Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk) |
55
+ | Storage Type | Documentation |
56
+ | ------------------------------------------ | ----------------------------------------------------------------------- |
57
+ | Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse) |
58
+ | Filestore | [docs](./docs/usage/storage.md#filestore) |
59
+ | Parallelstore | [docs](./docs/usage/storage.md#parallelstore) |
60
+ | Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk) |
61
61
 
62
62
  # Documentation
63
63
 
64
- * [Permissions](./docs/permissions.md)
65
- * [Installation](./docs/installation.md)
66
- * Usage:
67
- * [Clusters](./docs/usage/clusters.md)
68
- * [GPU](./docs/usage/gpu.md)
69
- * [CPU](./docs/usage/cpu.md)
70
- * [Autoprovisioning](./docs/usage/autoprovisioning.md)
71
- * [Workloads](./docs/usage/workloads.md)
72
- * [Docker](./docs/usage/docker.md)
73
- * [Storage](./docs/usage/storage.md)
74
- * [Advanced](./docs/usage/advanced.md)
75
- * [Inspector](./docs/usage/inspector.md)
76
- * [Troubleshooting](./docs/troubleshooting.md)
64
+ - [Permissions](./docs/permissions.md)
65
+ - [Installation](./docs/installation.md)
66
+ - Usage:
67
+ - [Clusters](./docs/usage/clusters.md)
68
+ - [GPU](./docs/usage/gpu.md)
69
+ - [CPU](./docs/usage/cpu.md)
70
+ - [Autoprovisioning](./docs/usage/autoprovisioning.md)
71
+ - [Workloads](./docs/usage/workloads.md)
72
+ - [Docker](./docs/usage/docker.md)
73
+ - [Storage](./docs/usage/storage.md)
74
+ - [Advanced](./docs/usage/advanced.md)
75
+ - [Inspector](./docs/usage/inspector.md)
76
+ - [Troubleshooting](./docs/troubleshooting.md)
77
+
78
+ # Dependencies
79
+
80
+ | Dependency | When used |
81
+ | ------------------------------------------------------------------------------------------------------------ | --------------------------- |
82
+ | [Google Cloud SDK (gcloud)](https://cloud.google.com/sdk/docs/install) | _always_ |
83
+ | [kubectl](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) | _always_ |
84
+ | [ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit) | Provisioning GPU clusters |
85
+ | [Kueue](https://github.com/kubernetes-sigs/kueue) | Scheduling workloads |
86
+ | [JobSet](https://github.com/kubernetes-sigs/jobset) | Workload creation |
87
+ | [Docker](https://docs.docker.com/engine/install/) | Building workload container |
88
+ | [CoreDNS](https://github.com/coredns/deployment/tree/master/kubernetes) | Cluster setup |
89
+ | [PathwaysJob](https://github.com/google/pathways-job) | Running Pathways workloads |
77
90
 
78
91
  # Privacy notice
79
92
 
@@ -88,11 +101,14 @@ XPK telemetry overall is handled in accordance with the [Google Privacy Policy](
88
101
  you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
89
102
  [Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
90
103
 
91
-
92
104
  # Contributing
93
105
 
94
106
  Please read [`contributing.md`](./docs/contributing.md) for details on our code of conduct, and the process for submitting pull requests to us.
95
107
 
108
+ # Get involved
109
+
110
+ We'd love to hear from you! If you have questions or want to discuss ideas, join us on [GitHub Discussions](https://github.com/AI-Hypercomputer/xpk/discussions). Found a bug or have a feature request? Please let us know on [GitHub Issues](https://github.com/AI-Hypercomputer/xpk/issues).
111
+
96
112
  # License
97
113
 
98
114
  This project is licensed under the Apache License 2.0 - see the [`LICENSE`](./LICENSE) file for details.
@@ -47,36 +47,57 @@ A crucial aspect of effective unit testing is isolation. A unit test should only
47
47
 
48
48
  A good, state-of-the-art sample of [code](https://github.com/AI-Hypercomputer/xpk/blob/0434cf6a023069522f90d5846c6d980b68382b66/src/xpk/core/nodepool.py#L614) that has been correctly covered with unit tests can be found [here](https://github.com/AI-Hypercomputer/xpk/blob/8464ce26cd0fd24c681e346b2c915ad918724e53/src/xpk/core/nodepool_test.py#L26). This provided example serves as a practical guide and "source of truth" for developers, demonstrating best practices in unit test structure such as naming. Another sample, leveraging mocks, can be found [here](https://github.com/AI-Hypercomputer/xpk/blob/8464ce26cd0fd24c681e346b2c915ad918724e53/src/xpk/core/nodepool_test.py#L86).
49
49
 
50
- ## Golden Test
50
+ ## Golden Recipes
51
51
 
52
- Golden tests encompass a broad scope within XPK, effectively covering the entire execution of a command from a user's perspective. Their primary objective is to highlight the blast radius of a change by making developers aware of all user journeys that might be affected by the change. These tests are executed on feature branches and serve as the main tool for raising awareness, enabling developers to thoroughly double-check changes across various scenarios and understand their potential impact.
52
+ Golden recipes encompass a broad scope within XPK, effectively covering entire user journeys. Their primary objective is to orchestrate multiple commands to achieve a high-level goal, simulating a real user interacting with the system. They also serve as regression tests by asserting on the output of each step, ensuring that the user experience remains consistent. These tests are executed on feature branches and serve as the main tool for raising awareness, enabling developers to thoroughly double-check changes across various complex scenarios and understand their potential impact.
53
53
 
54
54
  ### Naming Conventions
55
55
 
56
- Each Golden test name should refer to a potential use case or persona utilizing the system, explicitly including the command that is executed. This approach ensures that the test names clearly communicate the real-world scenarios and user interactions they validate, focusing on the actions taken. A good Golden test name should typically convey:
56
+ Each Golden recipe file in the `recipes` directory corresponds to a specific use case or persona utilizing the system. The filename should clearly indicate the scenario, for example `NAP_cluster-create_with_pathways.md` or `Cluster_create_with_Managed_Lustre_driver.md`.
57
57
 
58
- * **Command name that is executed:** cluster create, cluster create-pathways, or workload list.
59
- * **Use case it is covering:** nap cluster creation, tpu cluster creation, workload status listing
58
+ ### Developer guide to Golden Recipes
59
+
60
+ All golden recipes are located in the `recipes` directory. Each recipe is a Markdown file that describes the user journey and contains the sequence of commands to be executed.
61
+
62
+ A sample structure of a recipe file is:
60
63
 
61
- For example, a good golden test name could be: "NAP cluster-create with pathways".
64
+ ```markdown
65
+ # Recipe Title
62
66
 
63
- ### Developer guide to Golden Tests
64
- All golden tests are registered in the `goldens.yaml` file in the root directory. Their reference output is stored in text files located in goldens directory in the root directory.
67
+ Description of the recipe.
65
68
 
66
- A sample structure of `goldens.yaml` file is defined as:
69
+ ## Step 1: Create Cluster
70
+ \`\`\`shell #golden
71
+ xpk cluster create ...
72
+ \`\`\`
73
+ <!--
74
+ Expected output block
75
+ -->
67
76
 
68
- ```yaml
69
- goldens:
70
- "NAP cluster-create with pathways":
71
- command: xpk cluster create-pathways --enable-autoprovisioning
72
- description: "" # optional description allowing to better understand use-case
77
+ ## Step 2: Submit Workload
78
+ \`\`\`shell #golden
79
+ xpk workload create ...
80
+ \`\`\`
81
+ <!--
82
+ Expected output block
83
+ -->
73
84
  ```
74
85
 
75
- Goldens after change in the code, or registering a new one can be re-generated using `make goldens` command.
86
+ Recipe files are self-contained, storing both the commands and their expected golden outputs in comment blocks. The recipe executor runs these commands in order, maintaining state between them (e.g. environment variables).
76
87
 
77
88
  ### Underlying execution mechanisms
78
89
 
79
- These tests are executed through the GoldenBuddy testing script located in the `golden_buddy.sh` file of the repository. The framework executes all registered commands in `dry_run` mode, then compares diffs between them with the reference output located in goldens directory.
90
+ These tests are executed through the `tools/recipes.py` script. The framework executes the sequence of commands in `dry_run` mode (by injecting a mock `xpk` function) and compares the output of each step with the expected output stored in the recipe file.
91
+
92
+ **Usage:**
93
+
94
+ * **Regenerate Goldens:** `make goldens`
95
+ * This is the primary command for developers. It executes all recipes in `update` mode, regenerating the golden outputs. Run this after making changes to the code or adding new recipes.
96
+
97
+ * **Advanced:** *(These commands are primarily used by CI/CD pipelines or for debugging specific scenarios.)*
98
+ * **Verification:** `python3 tools/recipes.py golden <file or files>` (Verifies outputs match without updating)
99
+ * **Integration Run:** `python3 tools/recipes.py run <file or files>` (Executes commands for real)
100
+ * **Selective Update:** `python3 tools/recipes.py update <file or files>` (Updates a specific recipe)
80
101
 
81
102
  ## Integration Test
82
103
  Integration tests sit at the apex of the testing pyramid, being the most expensive and slowest to execute. This is primarily because they rely on actual Google Cloud Platform (GCP) infrastructure, which introduces potential flakiness due to external factors and makes it challenging to write given capacity constraints. Consequently, these tests should be reserved for ultimate verification before release, ensuring all of XPK's components function seamlessly together within a real GCP environment. They are not run on feature branches; instead, they are executed on the mainline (`main`) branch nightly after code merges, and right before a release to validate a new XPK release candidate. This strategic placement ensures a final, comprehensive check of the entire system's functionality in its production-like setting.
@@ -51,7 +51,7 @@ all zones.
51
51
  --num-slices=4 --on-demand
52
52
  ```
53
53
 
54
- * Cluster Create (provision spot / preemptable capacity):
54
+ * Cluster Create (provision spot / preemptible capacity):
55
55
 
56
56
  ```shell
57
57
  xpk cluster create \
@@ -274,7 +274,7 @@ xpk cluster create-pathways \
274
274
  --managed-mldiagnostics
275
275
  ```
276
276
 
277
- * Cluster Create (provision spot / preemptable capacity) with flag **--managed-mldiagnostics**:
277
+ * Cluster Create (provision spot / preemptible capacity) with flag **--managed-mldiagnostics**:
278
278
 
279
279
  ```shell
280
280
  xpk cluster create \
@@ -29,14 +29,10 @@ Before you start, complete the following steps:
29
29
 
30
30
  ### Create a single-NIC, single slice cluster
31
31
 
32
- Currently flex start provisioning for Ironwood works only in single slice and multi-host or multi-slice and single host setups. More options will be added soon
33
-
34
32
  1. Set the following environment variables:
35
33
 
36
34
  > **NOTE:** For multi-host provisioning use an ACCELERATOR_TYPE with any topology that results in more than 8 chips, e.g. `tpu7x-2x2x2` or `tpu7x-16`. For single-host provisioning use an ACCELERATOR_TYPE with any topology that results in 8 or fewer chips, e.g. `tpu7x-2x2x1` or `tpu7x-8`.
37
35
 
38
- > **NOTE:** Single-host provisioning is not supported for single-slice. If you want to create a single-host cluster, you need to set `--num-slices` to 2 or higher on the `xpk cluster create` command.
39
-
40
36
  ```shell
41
37
  export PROJECT_ID=<project_id> # Your GCP project name
42
38
  export ZONE=<zone> # Example: us-central1-c
@@ -29,14 +29,10 @@ Before you start, complete the following steps:
29
29
 
30
30
  ### Create a single-NIC, single slice cluster
31
31
 
32
- Currently flex start provisioning for Ironwood works only in single slice and multi-host or multi-slice and single host setups. More options will be added soon
33
-
34
32
  1. Set the following environment variables:
35
33
 
36
34
  > **NOTE:** For multi-host provisioning use an ACCELERATOR_TYPE with any topology that results in more than 8 chips, e.g. `tpu7x-2x2x2` or `tpu7x-16`. For single-host provisioning use an ACCELERATOR_TYPE with any topology that results in 8 or fewer chips, e.g. `tpu7x-2x2x1` or `tpu7x-8`.
37
35
 
38
- > **NOTE:** Single-host provisioning is not supported for single-slice. If you want to create a single-host cluster, you need to set `--num-slices` to 2 or higher on the `xpk cluster create` command.
39
-
40
36
  ```shell
41
37
  export PROJECT_ID=<project_id> # Your GCP project name
42
38
  export ZONE=<zone> # Example: us-central1-c
@@ -15,6 +15,9 @@
15
15
  -->
16
16
 
17
17
  ## Workload Create
18
+
19
+ _Note: `xpk workload create` works only on clusters created through XPK. See [docs](./clusters.md) on how to create a cluster via XPK._
20
+
18
21
  * Workload Create (submit training job):
19
22
 
20
23
  ```shell