xpk 1.1.1__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260)
  1. {xpk-1.1.1 → xpk-1.2.0}/Makefile +3 -2
  2. {xpk-1.1.1/src/xpk.egg-info → xpk-1.2.0}/PKG-INFO +1 -1
  3. {xpk-1.1.1 → xpk-1.2.0}/recipes/Basic_cluster_adapt.md +2 -2
  4. {xpk-1.1.1 → xpk-1.2.0}/recipes/Basic_cluster_create.md +2 -2
  5. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_RayCluster.md +2 -2
  6. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_for_multi-host_nodepool.md +2 -2
  7. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_for_single-host_nodepool.md +2 -2
  8. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_private.md +2 -2
  9. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_sub-slicing.md +2 -2
  10. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_super-slicing.md +6 -5
  11. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_CPU_and_memory_limits_above_capacity.md +2 -2
  12. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_CPU_and_memory_limits_below_capacity.md +2 -2
  13. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_Managed_Lustre_driver.md +8 -2
  14. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.md +8 -2
  15. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_gb200-4.md +2 -2
  16. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_shared_reservation.md +2 -2
  17. {xpk-1.1.1 → xpk-1.2.0}/recipes/NAP_cluster-create.md +2 -2
  18. {xpk-1.1.1 → xpk-1.2.0}/recipes/NAP_cluster-create_with_pathways.md +2 -2
  19. {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create_super-slicing.md +3 -3
  20. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/cluster.py +15 -6
  21. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/cluster_test.py +16 -1
  22. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/cluster.py +31 -1
  23. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/cluster_test.py +61 -4
  24. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/docker_container.py +3 -1
  25. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/docker_resources.py +5 -5
  26. xpk-1.2.0/src/xpk/core/kubectl_common.py +77 -0
  27. xpk-1.2.0/src/xpk/core/kubectl_common_test.py +174 -0
  28. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/kueue_manager.py +26 -26
  29. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/kueue_manager_test.py +52 -12
  30. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/nodepool.py +34 -0
  31. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/nodepool_test.py +104 -0
  32. {xpk-1.1.1 → xpk-1.2.0/src/xpk.egg-info}/PKG-INFO +1 -1
  33. {xpk-1.1.1 → xpk-1.2.0}/src/xpk.egg-info/SOURCES.txt +2 -0
  34. {xpk-1.1.1 → xpk-1.2.0}/.dockerignore +0 -0
  35. {xpk-1.1.1 → xpk-1.2.0}/.github/CODEOWNERS +0 -0
  36. {xpk-1.1.1 → xpk-1.2.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  37. {xpk-1.1.1 → xpk-1.2.0}/.github/actions/install-kueue/action.yml +0 -0
  38. {xpk-1.1.1 → xpk-1.2.0}/.github/actions/setup-test-env/action.yml +0 -0
  39. {xpk-1.1.1 → xpk-1.2.0}/.github/release.yaml +0 -0
  40. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/README.md +0 -0
  41. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/build_tests.yaml +0 -0
  42. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/build_wheels.yaml +0 -0
  43. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/cleanup.yaml +0 -0
  44. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/gemini-dispatch.yml +0 -0
  45. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/gemini-invoke.yml +0 -0
  46. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/gemini-review.yml +0 -0
  47. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/gemini-scheduled-triage.yml +0 -0
  48. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/gemini-triage.yml +0 -0
  49. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/integration_basic_cluster_create.yaml +0 -0
  50. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/integration_gpu_cluster_create.yaml +0 -0
  51. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/integration_pathways_cluster_create.yaml +0 -0
  52. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/integration_ray_cluster_create.yaml +0 -0
  53. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/integration_storage_tests.yaml +0 -0
  54. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/label-validation.yaml +0 -0
  55. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/nightly_tests.yaml +0 -0
  56. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/periodic_release.yaml +0 -0
  57. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/release_branch_versioning.yaml +0 -0
  58. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_build_scripts.yaml +0 -0
  59. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_build_wheel.yaml +0 -0
  60. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_goldens.yaml +0 -0
  61. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_lint_and_format.yml +0 -0
  62. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_storage_create.yaml +0 -0
  63. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_storage_delete.yaml +0 -0
  64. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_unit_tests.yaml +0 -0
  65. {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/stale.yaml +0 -0
  66. {xpk-1.1.1 → xpk-1.2.0}/.gitignore +0 -0
  67. {xpk-1.1.1 → xpk-1.2.0}/.pre-commit-config.yaml +0 -0
  68. {xpk-1.1.1 → xpk-1.2.0}/LICENSE +0 -0
  69. {xpk-1.1.1 → xpk-1.2.0}/README.md +0 -0
  70. {xpk-1.1.1 → xpk-1.2.0}/backoff_retry.sh +0 -0
  71. {xpk-1.1.1 → xpk-1.2.0}/data/Dockerfile +0 -0
  72. {xpk-1.1.1 → xpk-1.2.0}/docs/code-of-conduct.md +0 -0
  73. {xpk-1.1.1 → xpk-1.2.0}/docs/contributing.md +0 -0
  74. {xpk-1.1.1 → xpk-1.2.0}/docs/installation.md +0 -0
  75. {xpk-1.1.1 → xpk-1.2.0}/docs/permissions.md +0 -0
  76. {xpk-1.1.1 → xpk-1.2.0}/docs/testing.md +0 -0
  77. {xpk-1.1.1 → xpk-1.2.0}/docs/troubleshooting.md +0 -0
  78. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/advanced.md +0 -0
  79. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/autoprovisioning.md +0 -0
  80. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/clusters.md +0 -0
  81. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/cpu.md +0 -0
  82. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/docker.md +0 -0
  83. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/gpu.md +0 -0
  84. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/inspector.md +0 -0
  85. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/storage.md +0 -0
  86. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +0 -0
  87. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +0 -0
  88. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +0 -0
  89. {xpk-1.1.1 → xpk-1.2.0}/docs/usage/workloads.md +0 -0
  90. {xpk-1.1.1 → xpk-1.2.0}/examples/fake_training.py +0 -0
  91. {xpk-1.1.1 → xpk-1.2.0}/examples/llama-3.1-finetuning/check_cuda.sh +0 -0
  92. {xpk-1.1.1 → xpk-1.2.0}/examples/llama-3.1-finetuning/requirements.txt +0 -0
  93. {xpk-1.1.1 → xpk-1.2.0}/examples/llama-3.1-finetuning/train.py +0 -0
  94. {xpk-1.1.1 → xpk-1.2.0}/examples/llama-3.1-finetuning/train.slurm +0 -0
  95. {xpk-1.1.1 → xpk-1.2.0}/examples/llama-3.1-finetuning/training_data.jsonl +0 -0
  96. {xpk-1.1.1 → xpk-1.2.0}/examples/nccl/nccl-a3mega.sh +0 -0
  97. {xpk-1.1.1 → xpk-1.2.0}/examples/nccl/nccl-a3ultra.sh +0 -0
  98. {xpk-1.1.1 → xpk-1.2.0}/examples/nccl/nccl.md +0 -0
  99. {xpk-1.1.1 → xpk-1.2.0}/examples/storage/filestore-manifest-attach.yaml +0 -0
  100. {xpk-1.1.1 → xpk-1.2.0}/examples/storage/gcsfuse-manifest.yaml +0 -0
  101. {xpk-1.1.1 → xpk-1.2.0}/examples/storage/lustre-manifest-attach.yaml +0 -0
  102. {xpk-1.1.1 → xpk-1.2.0}/examples/storage/parallelstore-manifest-attach.yaml +0 -0
  103. {xpk-1.1.1 → xpk-1.2.0}/examples/storage/pd-manifest-attach.yaml +0 -0
  104. {xpk-1.1.1 → xpk-1.2.0}/pylintrc +0 -0
  105. {xpk-1.1.1 → xpk-1.2.0}/pyproject.toml +0 -0
  106. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_delete.md +0 -0
  107. {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_delete_force.md +0 -0
  108. {xpk-1.1.1 → xpk-1.2.0}/recipes/Storage_list.md +0 -0
  109. {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create.md +0 -0
  110. {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create_pathways.md +0 -0
  111. {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create_sub-slicing.md +0 -0
  112. {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create_with_output-manifest-file.md +0 -0
  113. {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_delete.md +0 -0
  114. {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_list.md +0 -0
  115. {xpk-1.1.1 → xpk-1.2.0}/recipes/comprehensive-demo.md +0 -0
  116. {xpk-1.1.1 → xpk-1.2.0}/setup.cfg +0 -0
  117. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/__init__.py +0 -0
  118. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/api/__init__.py +0 -0
  119. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/api/storage_crd.yaml +0 -0
  120. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +0 -0
  121. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3mega/storage_crd.yaml +0 -0
  122. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +0 -0
  123. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +0 -0
  124. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3ultra/nccl-installer.yaml +0 -0
  125. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3ultra/storage_crd.yaml +0 -0
  126. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a4/config-map.yaml.tftpl +0 -0
  127. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +0 -0
  128. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a4/storage_crd.yaml +0 -0
  129. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/__init__.py +0 -0
  130. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/cluster_gcluster.py +0 -0
  131. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/cluster_gcluster_test.py +0 -0
  132. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/common.py +0 -0
  133. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/common_test.py +0 -0
  134. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/config.py +0 -0
  135. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/info.py +0 -0
  136. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/inspector.py +0 -0
  137. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/inspector_test.py +0 -0
  138. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/managed_ml_diagnostics.py +0 -0
  139. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/managed_ml_diagnostics_test.py +0 -0
  140. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/storage.py +0 -0
  141. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/version.py +0 -0
  142. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/workload.py +0 -0
  143. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/workload_test.py +0 -0
  144. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/__init__.py +0 -0
  145. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/__init__.py +0 -0
  146. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  147. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
  148. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/blueprint_test.py +0 -0
  149. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/testing/__init__.py +0 -0
  150. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/testing/data/a3_mega.yaml +0 -0
  151. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +0 -0
  152. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +0 -0
  153. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/testing/data/a4.yaml +0 -0
  154. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/capacity.py +0 -0
  155. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/capacity_test.py +0 -0
  156. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/cluster_private.py +0 -0
  157. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/commands.py +0 -0
  158. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/config.py +0 -0
  159. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/config_test.py +0 -0
  160. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/docker_image.py +0 -0
  161. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/docker_manager.py +0 -0
  162. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/filestore.py +0 -0
  163. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/gcloud_context.py +0 -0
  164. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/gcloud_context_test.py +0 -0
  165. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/gcluster_manager.py +0 -0
  166. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/gcsfuse.py +0 -0
  167. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/jobset.py +0 -0
  168. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/monitoring.py +0 -0
  169. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/mtc.py +0 -0
  170. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/nap.py +0 -0
  171. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/network.py +0 -0
  172. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/pathways.py +0 -0
  173. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/pathways_test.py +0 -0
  174. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/ray.py +0 -0
  175. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/remote_state/__init__.py +0 -0
  176. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  177. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  178. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/resources.py +0 -0
  179. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/scheduling.py +0 -0
  180. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/scheduling_test.py +0 -0
  181. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/storage.py +0 -0
  182. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/system_characteristics.py +0 -0
  183. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/system_characteristics_test.py +0 -0
  184. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/telemetry.py +0 -0
  185. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/telemetry_test.py +0 -0
  186. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/testing/__init__.py +0 -0
  187. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/testing/commands_tester.py +0 -0
  188. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/testing/commands_tester_test.py +0 -0
  189. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/updates.py +0 -0
  190. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/updates_test.py +0 -0
  191. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/vertex.py +0 -0
  192. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload.py +0 -0
  193. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
  194. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
  195. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  196. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
  197. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
  198. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
  199. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_test.py +0 -0
  200. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/main.py +0 -0
  201. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/__init__.py +0 -0
  202. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/cluster.py +0 -0
  203. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/cluster_test.py +0 -0
  204. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/common.py +0 -0
  205. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/common_test.py +0 -0
  206. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/config.py +0 -0
  207. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/core.py +0 -0
  208. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/info.py +0 -0
  209. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/inspector.py +0 -0
  210. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/storage.py +0 -0
  211. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/storage_test.py +0 -0
  212. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/validators.py +0 -0
  213. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/version.py +0 -0
  214. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/workload.py +0 -0
  215. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/workload_test.py +0 -0
  216. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/telemetry_uploader.py +0 -0
  217. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/__init__.py +0 -0
  218. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +0 -0
  219. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
  220. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/filestore-pv.yaml +0 -0
  221. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/filestore-pvc.yaml +0 -0
  222. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/filestore-sc.yaml +0 -0
  223. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/fuse-pv.yaml +0 -0
  224. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/fuse-pvc.yaml +0 -0
  225. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/kueue_config.yaml.j2 +0 -0
  226. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
  227. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
  228. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/kueue_super_slicing_topology.yaml.j2 +0 -0
  229. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/mtc-cpc.yaml +0 -0
  230. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/storage.yaml +0 -0
  231. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/__init__.py +0 -0
  232. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/console.py +0 -0
  233. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/console_test.py +0 -0
  234. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/execution_context.py +0 -0
  235. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/feature_flags.py +0 -0
  236. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/file.py +0 -0
  237. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/gcs_utils.py +0 -0
  238. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/kubectl.py +0 -0
  239. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/kueue.py +0 -0
  240. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/network.py +0 -0
  241. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/objects.py +0 -0
  242. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/templates.py +0 -0
  243. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/topology.py +0 -0
  244. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/topology_test.py +0 -0
  245. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/user_agent.py +0 -0
  246. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/user_agent_test.py +0 -0
  247. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/validation.py +0 -0
  248. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/validation_test.py +0 -0
  249. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/versions.py +0 -0
  250. {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/yaml.py +0 -0
  251. {xpk-1.1.1 → xpk-1.2.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  252. {xpk-1.1.1 → xpk-1.2.0}/src/xpk.egg-info/entry_points.txt +0 -0
  253. {xpk-1.1.1 → xpk-1.2.0}/src/xpk.egg-info/requires.txt +0 -0
  254. {xpk-1.1.1 → xpk-1.2.0}/src/xpk.egg-info/top_level.txt +0 -0
  255. {xpk-1.1.1 → xpk-1.2.0}/tools/install-gke-auth-plugin.sh +0 -0
  256. {xpk-1.1.1 → xpk-1.2.0}/tools/install-xpk.sh +0 -0
  257. {xpk-1.1.1 → xpk-1.2.0}/tools/recipes.py +0 -0
  258. {xpk-1.1.1 → xpk-1.2.0}/xpk-large-scale-guide.sh +0 -0
  259. {xpk-1.1.1 → xpk-1.2.0}/xpk-notebooks.md +0 -0
  260. {xpk-1.1.1 → xpk-1.2.0}/xpk.py +0 -0
{xpk-1.1.1 → xpk-1.2.0}/Makefile
@@ -6,6 +6,7 @@ KUEUECTL_URL = "https://github.com/kubernetes-sigs/kueue/releases/download/$(KUE
 
 PROJECT_DIR := $(realpath $(shell dirname $(firstword $(MAKEFILE_LIST))))
 BIN_PATH=$(PROJECT_DIR)/bin
+PIP_OPTS ?=
 
 .PHONY: install
 install: check-python check-gcloud install-gcloud-auth-plugin install-kueuectl pip-install
@@ -15,11 +16,11 @@ install-dev: check-python check-gcloud mkdir-bin install-kueuectl pip-install pi
 
 .PHONY: pip-install-dev
 pip-install-dev:
-pip install -e ".[dev]"
+pip install $(PIP_OPTS) -e ".[dev]"
 
 .PHONY: pip-install
 pip-install:
-pip install -e .
+pip install $(PIP_OPTS) -e .
 
 .PHONY: install-pytest
 install-pytest:
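The new `PIP_OPTS` variable is an empty-by-default hook spliced into both `pip install` recipes, so callers can forward extra flags to `pip` without editing the Makefile; for example (an illustrative invocation, not taken from the package): `make pip-install PIP_OPTS="--no-cache-dir"`.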
{xpk-1.1.1/src/xpk.egg-info → xpk-1.2.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 1.1.1
+Version: 1.2.0
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0
{xpk-1.1.1 → xpk-1.2.0}/recipes/Basic_cluster_adapt.md
@@ -134,8 +134,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Basic_cluster_create.md
@@ -266,8 +266,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_RayCluster.md
@@ -273,8 +273,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] Try 1: Deleting old RayCluster
 [XPK] Task: `Deleting old RayCluster` is implemented by the following command not running since it is a dry run.
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_for_multi-host_nodepool.md
@@ -268,8 +268,8 @@ description: "Very High"
 kubectl apply -f b58f50dd88cb1211d51276b9b445f6bca02f0e97fa984656d47992aecd9322cc
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_for_single-host_nodepool.md
@@ -266,8 +266,8 @@ description: "Very High"
 kubectl apply -f f228edecda8022002fe1876e83ebf4c0c280eb4aeb0f72da3a5d746b5dfb1c91
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_private.md
@@ -285,8 +285,8 @@ description: "Very High"
 kubectl apply -f 2e0015f210b664c3b767ae4e11af51387b01d4d6b36e20fecbdee137d3d2700b
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster-private/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_sub-slicing.md
@@ -290,8 +290,8 @@ spec:
 kubectl apply -f 2f2b4591858b4bc50348c575cd2cc048c79d1e4ffb67e0a6d6e1eafad21c5002
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_super-slicing.md
@@ -98,6 +98,9 @@ data:
 [XPK] Try 1: Install Jobset on golden-cluster
 [XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run.
 kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
+kubectl patch deployment jobset-controller-manager -n jobset-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"requests": {"cpu": "4", "memory": "16Gi"}, "limits": {"cpu": "4", "memory": "16Gi"}}}]}}}}'
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
 [XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content:
@@ -299,11 +302,9 @@ spec:
 - nodeLabel: kubernetes.io/hostname
 [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
 kubectl apply -f 6df31e8df3d8970d7ed3bf3aa948ae7cea9487c15ed6cfb1577ca6c948cf5525
-[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
-kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
-kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
+kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"replicas": 3, "template": {"spec": {"containers": [{"name": "manager", "resources": {"requests": {"cpu": "16", "memory": "64Gi"}, "limits": {"cpu": "16", "memory": "64Gi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
 [XPK] Exiting XPK cleanly
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_CPU_and_memory_limits_above_capacity.md
@@ -270,8 +270,8 @@ description: "Very High"
 kubectl apply -f 1ce6c42efe0834ff0519978ad09539c725a5d6f22267c5f1b41b6e458668e45f
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_CPU_and_memory_limits_below_capacity.md
@@ -270,8 +270,8 @@ description: "Very High"
 kubectl apply -f 1ce6c42efe0834ff0519978ad09539c725a5d6f22267c5f1b41b6e458668e45f
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_Managed_Lustre_driver.md
@@ -47,6 +47,12 @@ gcloud container clusters describe golden-cluster --project=golden-project --loc
 [XPK] Updating GKE cluster to enable Lustre CSI driver, may take a while!
 [XPK] Task: `GKE Cluster Update to enable Lustre CSI driver` is implemented by the following command not running since it is a dry run.
 gcloud container clusters update golden-cluster --project=golden-project --location=us-central1 --quiet --update-addons=LustreCsiDriver=ENABLED
+[XPK] Recreating existing nodes (if any) to complete the Lustre CSI driver installation.
+[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
+gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
+[XPK] To complete NodesRecreate-0 we are executing gcloud container clusters upgrade golden-cluster --project=golden-project --node-pool=0 --location=us-central1 --quiet
+[XPK] Breaking up a total of 1 commands into 1 batches
+[XPK] Pretending all the jobs succeeded
 [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
 gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
 [XPK] Creating 1 node pool or pools of tpu7x-8
@@ -271,8 +277,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.md
@@ -47,6 +47,12 @@ gcloud container clusters describe golden-cluster --project=golden-project --loc
 [XPK] Updating GKE cluster to enable Lustre CSI driver, may take a while!
 [XPK] Task: `GKE Cluster Update to enable Lustre CSI driver` is implemented by the following command not running since it is a dry run.
 gcloud container clusters update golden-cluster --project=golden-project --location=us-central1 --quiet --enable-legacy-lustre-port
+[XPK] Recreating existing nodes (if any) to complete the Lustre CSI driver installation.
+[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
+gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
+[XPK] To complete NodesRecreate-0 we are executing gcloud container clusters upgrade golden-cluster --project=golden-project --node-pool=0 --location=us-central1 --quiet
+[XPK] Breaking up a total of 1 commands into 1 batches
+[XPK] Pretending all the jobs succeeded
 [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
 gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
 [XPK] Creating 1 node pool or pools of tpu7x-8
@@ -271,8 +277,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_gb200-4.md
@@ -287,8 +287,8 @@ spec:
 kubectl apply -f c177e643775bb8e3462648245162a984934b0e09a13b0e3bfb62adf8585442b0
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] Installing NCCL Plugin for cluster
 [XPK] Task: `Install NCCL Plugin On Cluster` is implemented by the following command not running since it is a dry run.
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_shared_reservation.md
@@ -273,8 +273,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/NAP_cluster-create.md
@@ -307,8 +307,8 @@ description: "Very High"
 kubectl apply -f ff0e8bb58b2038c4b29f1bce1aabe9f02ac0757ae2e80ad3657f704542371839
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/NAP_cluster-create_with_pathways.md
@@ -317,8 +317,8 @@ description: "Very High"
 kubectl apply -f fc46093b5c0d291fe7c53c15aebd624b485d767cabf99a73500e95952c70b6f6
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating Kueue Controller Manager resources
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
{xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create_super-slicing.md
@@ -47,7 +47,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94
 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
 [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
 docker push gcr.io/golden-project/dry-run-runner:prefix-current
-[XPK] Temp file (2eed164577b237a7b764f6adf46b9c249551c2200e440ef0f696c53dfb2d1cb5) content:
+[XPK] Temp file (2c5ab381c0d643f8512a07d296d411413080ec652c15e8c676fd58435de5a327) content:
 apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
 metadata:
@@ -136,7 +136,7 @@ spec:
 exit $EXIT_CODE
 resources:
 limits:
-google.com/tpu: 2
+google.com/tpu: 4
 
 volumeMounts:
 - mountPath: /dev/shm
@@ -156,7 +156,7 @@ spec:
 
 
 [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
-kubectl apply -f 2eed164577b237a7b764f6adf46b9c249551c2200e440ef0f696c53dfb2d1cb5
+kubectl apply -f 2c5ab381c0d643f8512a07d296d411413080ec652c15e8c676fd58435de5a327
 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
 gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
 [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.
{xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/cluster.py
@@ -369,7 +369,7 @@ def cluster_create(args) -> None:
 
 get_cluster_credentials(args)
 
-update_coredns_command_code = update_coredns_if_necessary()
+update_coredns_command_code = update_coredns_if_necessary(args)
 if update_coredns_command_code != 0:
 xpk_exit(update_coredns_command_code)
 
@@ -927,7 +927,7 @@ def scale_down_deployment(deployment_name: str, namespace: str = 'kube-system'):
 xpk_print(f'{deployment_name} has been scaled down.')
 
 
-def scale_up_coredns(replicas: int = 15, namespace: str = 'kube-system'):
+def scale_up_coredns(replicas: int, namespace: str = 'kube-system'):
 """Scales up the CoreDNS deployment to a specified number of replicas."""
 command_coredns_scale = (
 f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
@@ -1008,7 +1008,14 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
 xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
 
 
-def update_coredns() -> int:
+def _get_coredns_replica_count(args) -> int:
+# XPK large scale guide recommends 15 coreDNS replicas for clusters with 5000 VMs.
+# Otherwise, limit the replica count to the desired number of default pool nodes.
+default_pool_node_count: int = args.default_pool_cpu_num_nodes
+return min(15, default_pool_node_count)
+
+
+def update_coredns(args) -> int:
 """Updates and deploys CoreDNS within a cluster.
 
 Returns:
@@ -1018,6 +1025,8 @@ def update_coredns() -> int:
 coredns_repo_dir_name = 'deployment'
 coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
 coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
+coredns_replica_count = _get_coredns_replica_count(args)
+
 # 1. Install jq
 install_jq()
 
@@ -1034,7 +1043,7 @@ def update_coredns() -> int:
 scale_down_deployment('kube-dns')
 
 # 6. Scale up coredns and verify readiness
-scale_up_coredns(replicas=15)
+scale_up_coredns(coredns_replica_count)
 verify_coredns_readiness()
 
 xpk_print('The CoreDNS setup process has been completed.')
@@ -1074,7 +1083,7 @@ def coredns_deployment_exists(namespace: str = 'kube-system') -> bool:
 return False
 
 
-def update_coredns_if_necessary() -> int:
+def update_coredns_if_necessary(args) -> int:
 """Updates and deploys CoreDNS within the cluster if it's not already present.
 
 This function checks for the existence of the CoreDNS deployment.
@@ -1089,7 +1098,7 @@ def update_coredns_if_necessary() -> int:
 return 0
 else:
 xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
-return update_coredns()
+return update_coredns(args)
 
 
 def create_cluster_if_necessary(
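Taken together, the CoreDNS hunks above change the scale-up from a hard-coded 15 replicas to a count derived from the requested default CPU pool size. A minimal standalone sketch of that sizing rule, with illustrative node counts that are not taken from the package:

```python
# Sketch of the CoreDNS replica sizing rule introduced in 1.2.0: cap at the
# 15 replicas recommended by the XPK large-scale guide, otherwise match the
# number of default-pool CPU nodes.
def coredns_replicas(default_pool_cpu_num_nodes: int) -> int:
  return min(15, default_pool_cpu_num_nodes)


if __name__ == "__main__":
  for nodes in (2, 7, 20):  # illustrative node counts only
    print(f"{nodes} default-pool nodes -> {coredns_replicas(nodes)} CoreDNS replicas")
```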
{xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/cluster_test.py
@@ -22,7 +22,7 @@ from unittest.mock import MagicMock, patch
 import pytest
 
 from xpk.core.telemetry import MetricsCollector
-from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command, cluster_create, _log_cluster_create_telemetry
+from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, _get_coredns_replica_count, run_gke_cluster_create_command, cluster_create, _log_cluster_create_telemetry
 from xpk.core.capacity import CapacityType
 from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
 from xpk.core.testing.commands_tester import CommandsTester
@@ -787,3 +787,18 @@ def test_validate_cluster_create_args_sets_correct_num_slices(
 _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
 
 assert args.num_slices == expected
+
+
+def test_get_coredns_replica_count_lower_limit_is_number_of_nodes():
+args = construct_args(
+default_pool_cpu_num_nodes=7,
+)
+
+assert _get_coredns_replica_count(args) == 7
+
+
+def test_get_coredns_replica_count_upper_limit_is_15():
+args = construct_args(
+default_pool_cpu_num_nodes=20,
+)
+assert _get_coredns_replica_count(args) == 15
{xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/cluster.py
@@ -21,6 +21,8 @@ from kubernetes import client as k8s_client
 from kubernetes import config
 from kubernetes.client.exceptions import ApiException
 
+from .kubectl_common import PatchResources, patch_controller_manager_resources
+from ..utils.feature_flags import FeatureFlags
 from ..utils.console import xpk_exit, xpk_print
 from .capacity import H200_DEVICE_TYPE
 from .commands import (
@@ -33,6 +35,7 @@ from .gcloud_context import (
 get_cluster_location,
 zone_to_region,
 )
+from .nodepool import recreate_nodes_in_existing_node_pools
 from .resources import get_cluster_system_characteristics
 from .system_characteristics import INSTALLER_NCCL_TCPXO, SystemCharacteristics
 
@@ -72,7 +75,21 @@ def set_jobset_on_cluster(args) -> int:
 ' https://github.com/google/xpk/blob/main/README.md#troubleshooting for'
 ' instructions on how to fix these permissions.'
 )
-return return_code
+return return_code
+
+if FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing:
+return patch_controller_manager_resources(
+name='jobset-controller-manager',
+namespace='jobset-system',
+patch_resources=PatchResources(
+cpu_request=4,
+cpu_limit=4,
+memory_request='16Gi',
+memory_limit='16Gi',
+),
+)
+
+return 0
 
 
 def set_pathways_job_on_cluster(args) -> int:
@@ -605,6 +622,19 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
 if return_code != 0:
 xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
 return 1
+
+xpk_print(
+'Recreating existing nodes (if any) to complete the Lustre CSI driver'
+' installation.'
+)
+return_code = recreate_nodes_in_existing_node_pools(args)
+if return_code != 0:
+xpk_print(
+f'Node recreation failed with ERROR {return_code}. You must recreate'
+' the nodes manually in order to access Lustre storage from your'
+' workloads.'
+)
+return 1
 return 0
 
 
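`PatchResources` and `patch_controller_manager_resources` come from the new `src/xpk/core/kubectl_common.py`, whose contents this diff does not display. Based only on the call site above and the `kubectl patch` command shown in the super-slicing recipe, a hypothetical sketch of such a helper might look like the following; the names, structure, and use of `subprocess` here are assumptions, not the actual implementation:

```python
# Hypothetical sketch only; the real helper lives in the new
# src/xpk/core/kubectl_common.py, which this diff does not show.
import json
import subprocess
from dataclasses import dataclass


@dataclass
class PatchResources:
  cpu_request: int
  cpu_limit: int
  memory_request: str
  memory_limit: str


def patch_controller_manager_resources(
    name: str, namespace: str, patch_resources: PatchResources
) -> int:
  """Raises the manager container's resources via a strategic-merge patch."""
  patch = {
      "spec": {
          "template": {
              "spec": {
                  "containers": [{
                      "name": "manager",
                      "resources": {
                          "requests": {
                              "cpu": str(patch_resources.cpu_request),
                              "memory": patch_resources.memory_request,
                          },
                          "limits": {
                              "cpu": str(patch_resources.cpu_limit),
                              "memory": patch_resources.memory_limit,
                          },
                      },
                  }]
              }
          }
      }
  }
  cmd = [
      "kubectl", "patch", "deployment", name, "-n", namespace,
      "--type=strategic", f"--patch={json.dumps(patch)}",
  ]
  # Return the command's exit code, mirroring xpk's int return convention.
  return subprocess.run(cmd, check=False).returncode
```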
{xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/cluster_test.py
@@ -14,10 +14,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+from unittest.mock import MagicMock
 import pytest
 from .testing.commands_tester import CommandsTester
-from .cluster import get_cluster_credentials, update_gke_cluster_with_lustre_driver_enabled, update_cluster_with_lustre_driver_if_necessary
+from .cluster import get_cluster_credentials, update_gke_cluster_with_lustre_driver_enabled, update_cluster_with_lustre_driver_if_necessary, set_jobset_on_cluster
 from pytest_mock import MockerFixture
+from ..utils.feature_flags import FeatureFlags
 
 
 @pytest.fixture(autouse=True)
@@ -26,6 +28,9 @@ def commands_tester(mocker: MockerFixture) -> CommandsTester:
 mocker=mocker,
 run_command_for_value_path="xpk.core.cluster.run_command_for_value",
 run_command_with_updates_path="xpk.core.cluster.run_command_with_updates",
+run_command_with_updates_retry_path=(
+"xpk.core.cluster.run_command_with_updates_retry"
+),
 )
 
 
@@ -38,7 +43,17 @@ def mock_location(mocker: MockerFixture):
 
 @pytest.fixture(autouse=True)
 def command_args(mocker: MockerFixture):
-return mocker.Mock(cluster="cluster", project="project", zone="zone")
+return mocker.Mock(
+cluster="cluster", project="project", zone="zone", super_slicing=False
+)
+
+
+@pytest.fixture(autouse=True)
+def mock_patch_controller_manager_resources(mocker: MockerFixture) -> MagicMock:
+return mocker.patch(
+"xpk.core.cluster.patch_controller_manager_resources",
+return_value=0,
+)
 
 
 def test_get_cluster_credentials_returns_1_when_retrieval_commands_fail(
@@ -166,11 +181,14 @@ def test_update_cluster_with_lustre_driver_if_necessary_with_legacy_port_runs_co
 
 
 def test_update_gke_cluster_with_lustre_driver_enabled_default_port(
-commands_tester: CommandsTester, command_args
+commands_tester: CommandsTester, command_args, mocker: MockerFixture
 ):
 commands_tester.set_result_for_command(
 (0, ""), "gcloud container clusters update"
 )
+mocker.patch(
+"xpk.core.cluster.recreate_nodes_in_existing_node_pools", return_value=0
+)
 command_args.enable_legacy_lustre_port = None
 update_gke_cluster_with_lustre_driver_enabled(command_args)
 
@@ -181,12 +199,30 @@ def test_update_gke_cluster_with_lustre_driver_enabled_default_port(
 ]
 
 
+def test_update_gke_cluster_with_lustre_driver_enabled_fails_if_node_recreation_failed(
+commands_tester: CommandsTester, command_args, mocker: MockerFixture
+):
+commands_tester.set_result_for_command(
+(0, ""), "gcloud container clusters update"
+)
+mocker.patch(
+"xpk.core.cluster.recreate_nodes_in_existing_node_pools", return_value=123
+)
+command_args.enable_legacy_lustre_port = None
+return_code = update_gke_cluster_with_lustre_driver_enabled(command_args)
+
+assert return_code != 0
+
+
 def test_update_gke_cluster_with_lustre_driver_enabled_legacy_port(
-commands_tester: CommandsTester, command_args
+commands_tester: CommandsTester, command_args, mocker: MockerFixture
 ):
 commands_tester.set_result_for_command(
 (0, ""), "gcloud container clusters update"
 )
+mocker.patch(
+"xpk.core.cluster.recreate_nodes_in_existing_node_pools", return_value=0
+)
 command_args.enable_legacy_lustre_port = True
 update_gke_cluster_with_lustre_driver_enabled(command_args)
 
@@ -195,3 +231,24 @@ def test_update_gke_cluster_with_lustre_driver_enabled_legacy_port(
 "gcloud container clusters update cluster --project=project"
 " --location=us-central1 --quiet --enable-legacy-lustre-port"
 ]
+
+
+def test_set_jobset_on_cluster_not_setting_resources_by_default(
+mock_patch_controller_manager_resources: MagicMock, command_args
+):
+result = set_jobset_on_cluster(command_args)
+
+assert result == 0
+mock_patch_controller_manager_resources.assert_not_called()
+
+
+def test_set_jobset_on_cluster_super_slicing_resources(
+mock_patch_controller_manager_resources: MagicMock, command_args
+):
+FeatureFlags.SUPER_SLICING_ENABLED = True
+command_args.super_slicing = True
+
+result = set_jobset_on_cluster(command_args)
+
+assert result == 0
+mock_patch_controller_manager_resources.assert_called()
{xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/docker_container.py
@@ -181,7 +181,9 @@ def get_main_container(
 tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command,
 gpu_workload_terminate_command=gpu_workload_terminate_command,
 xpk_internal_commands=xpk_internal_commands,
-resources=get_main_container_resources(args, system, resource_type),
+resources=get_main_container_resources(
+args, system, resource_type, parallel_containers
+),
 volume_mounts=volume_mounts,
 )
 )
{xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/docker_resources.py
@@ -23,7 +23,10 @@ from ..utils.execution_context import is_dry_run
 
 
 def get_main_container_resources(
-args, system: SystemCharacteristics, resource_type
+args,
+system: SystemCharacteristics,
+resource_type: str,
+parallel_containers: int,
 ) -> str:
 """Resources for the main container.
 Args:
@@ -53,10 +56,7 @@
 offset_vCPUs = int(system.chips_per_vm) * 0.95
 return f'{resource_type}: {offset_vCPUs}'
 
-return (
-f'{resource_type}:'
-f' {int(system.chips_per_vm / system.parallel_containers)}'
-)
+return f'{resource_type}: {int(system.chips_per_vm / parallel_containers)}'
 
 
 def get_env_container(args, system: SystemCharacteristics) -> str:
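The signature change above moves the container count out of `SystemCharacteristics` and into an explicit `parallel_containers` argument, which is one plausible explanation for the `google.com/tpu: 2` → `google.com/tpu: 4` shift in the super-slicing workload recipe. A simplified sketch of just that final branch; the function name and example values below are illustrative, not taken from the package:

```python
# Simplified sketch of the accelerator-limit branch after this change: the
# per-container chip count is chips_per_vm divided by an explicitly passed
# parallel_containers value rather than by a field on SystemCharacteristics.
def main_container_resource_limit(
    resource_type: str, chips_per_vm: int, parallel_containers: int
) -> str:
  return f"{resource_type}: {int(chips_per_vm / parallel_containers)}"


# Illustrative values: with 4 chips per VM and one container per VM, the
# generated manifest requests all 4 chips.
print(main_container_resource_limit("google.com/tpu", 4, 1))  # google.com/tpu: 4
```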