xpk 0.17.3__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278) hide show
  1. {xpk-0.17.3 → xpk-1.0.0}/.github/actions/setup-test-env/action.yml +0 -1
  2. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/build_tests.yaml +1 -2
  3. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/integration_basic_cluster_create.yaml +0 -56
  4. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/integration_legacy_tests.yaml +1 -2
  5. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/label-validation.yaml +2 -2
  6. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/nightly_tests.yaml +5 -7
  7. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/reusable_goldens.yaml +0 -1
  8. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/reusable_integration_tests.yaml +0 -1
  9. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/reusable_lint_and_format.yml +0 -1
  10. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/reusable_storage_create.yaml +0 -41
  11. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/reusable_storage_delete.yaml +0 -3
  12. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/reusable_unit_tests.yaml +0 -1
  13. {xpk-0.17.3 → xpk-1.0.0}/Makefile +2 -16
  14. {xpk-0.17.3/src/xpk.egg-info → xpk-1.0.0}/PKG-INFO +15 -4
  15. {xpk-0.17.3 → xpk-1.0.0}/README.md +14 -3
  16. {xpk-0.17.3 → xpk-1.0.0}/docs/installation.md +0 -1
  17. {xpk-0.17.3 → xpk-1.0.0}/docs/troubleshooting.md +1 -1
  18. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/clusters.md +29 -0
  19. {xpk-0.17.3 → xpk-1.0.0}/goldens/Basic_cluster_create.txt +3 -3
  20. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_create_for_multi-host_nodepool.txt +3 -3
  21. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_create_private.txt +5 -3
  22. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_create_sub-slicing.txt +6 -4
  23. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_create_super-slicing.txt +5 -3
  24. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt +3 -3
  25. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt +3 -3
  26. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_create_with_Managed_Lustre_driver.txt +3 -3
  27. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt +3 -3
  28. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_create_with_gb200-4.txt +36 -34
  29. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_create_with_shared_reservation.txt +5 -3
  30. {xpk-0.17.3 → xpk-1.0.0}/goldens/NAP_cluster-create.txt +3 -3
  31. {xpk-0.17.3 → xpk-1.0.0}/goldens/NAP_cluster-create_with_pathways.txt +3 -3
  32. {xpk-0.17.3 → xpk-1.0.0}/goldens/Workload_create_super-slicing.txt +3 -3
  33. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/cluster.py +4 -13
  34. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/cluster_gcluster_test.py +2 -0
  35. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/workload.py +10 -3
  36. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/workload_test.py +1 -0
  37. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/cluster.py +10 -9
  38. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/config.py +5 -2
  39. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/kueue_manager_test.py +2 -0
  40. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/nodepool.py +6 -0
  41. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/nodepool_test.py +4 -0
  42. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/scheduling.py +28 -3
  43. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/scheduling_test.py +38 -1
  44. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/system_characteristics.py +38 -0
  45. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/system_characteristics_test.py +11 -0
  46. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/common.py +0 -17
  47. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/core.py +0 -8
  48. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/storage.py +0 -11
  49. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/feature_flags.py +1 -1
  50. {xpk-0.17.3 → xpk-1.0.0/src/xpk.egg-info}/PKG-INFO +15 -4
  51. {xpk-0.17.3 → xpk-1.0.0}/src/xpk.egg-info/SOURCES.txt +0 -13
  52. {xpk-0.17.3 → xpk-1.0.0}/tools/install-xpk.sh +0 -4
  53. xpk-0.17.3/.github/actions/install-kjob/action.yml +0 -35
  54. xpk-0.17.3/.github/workflows/reusable_build_kjob.yaml +0 -23
  55. xpk-0.17.3/docs/local_testing.md +0 -61
  56. xpk-0.17.3/docs/usage/job.md +0 -41
  57. xpk-0.17.3/docs/usage/run.md +0 -44
  58. xpk-0.17.3/examples/batch.md +0 -24
  59. xpk-0.17.3/examples/job.sh +0 -12
  60. xpk-0.17.3/goldens/Cluster_create_for_single-host_single-slice_TPU.txt +0 -199
  61. xpk-0.17.3/src/xpk/commands/kind.py +0 -265
  62. xpk-0.17.3/src/xpk/parser/kind.py +0 -95
  63. xpk-0.17.3/tools/Dockerfile-kjob +0 -33
  64. xpk-0.17.3/tools/build-kjob.sh +0 -9
  65. xpk-0.17.3/xpk-slurm-commands.md +0 -382
  66. {xpk-0.17.3 → xpk-1.0.0}/.dockerignore +0 -0
  67. {xpk-0.17.3 → xpk-1.0.0}/.github/CODEOWNERS +0 -0
  68. {xpk-0.17.3 → xpk-1.0.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  69. {xpk-0.17.3 → xpk-1.0.0}/.github/actions/install-kueue/action.yml +0 -0
  70. {xpk-0.17.3 → xpk-1.0.0}/.github/release.yaml +0 -0
  71. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/README.md +0 -0
  72. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/build_wheels.yaml +0 -0
  73. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/cleanup.yaml +0 -0
  74. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/gemini-dispatch.yml +0 -0
  75. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/gemini-invoke.yml +0 -0
  76. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/gemini-review.yml +0 -0
  77. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/gemini-scheduled-triage.yml +0 -0
  78. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/gemini-triage.yml +0 -0
  79. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/integration_pathways_cluster_create.yaml +0 -0
  80. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/integration_ray_cluster_create.yaml +0 -0
  81. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/integration_storage_tests.yaml +0 -0
  82. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/periodic_release.yaml +0 -0
  83. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/release_branch_versioning.yaml +0 -0
  84. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/reusable_build_scripts.yaml +0 -0
  85. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/reusable_build_wheel.yaml +0 -0
  86. {xpk-0.17.3 → xpk-1.0.0}/.github/workflows/stale.yaml +0 -0
  87. {xpk-0.17.3 → xpk-1.0.0}/.gitignore +0 -0
  88. {xpk-0.17.3 → xpk-1.0.0}/.pre-commit-config.yaml +0 -0
  89. {xpk-0.17.3 → xpk-1.0.0}/LICENSE +0 -0
  90. {xpk-0.17.3 → xpk-1.0.0}/backoff_retry.sh +0 -0
  91. {xpk-0.17.3 → xpk-1.0.0}/data/Dockerfile +0 -0
  92. {xpk-0.17.3 → xpk-1.0.0}/docs/code-of-conduct.md +0 -0
  93. {xpk-0.17.3 → xpk-1.0.0}/docs/contributing.md +0 -0
  94. {xpk-0.17.3 → xpk-1.0.0}/docs/permissions.md +0 -0
  95. {xpk-0.17.3 → xpk-1.0.0}/docs/testing.md +0 -0
  96. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/advanced.md +0 -0
  97. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/autoprovisioning.md +0 -0
  98. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/cpu.md +0 -0
  99. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/docker.md +0 -0
  100. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/gpu.md +0 -0
  101. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/inspector.md +0 -0
  102. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/storage.md +0 -0
  103. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/tpu7x/clusters.md +0 -0
  104. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +0 -0
  105. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +0 -0
  106. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +0 -0
  107. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/tpu7x/workloads.md +0 -0
  108. {xpk-0.17.3 → xpk-1.0.0}/docs/usage/workloads.md +0 -0
  109. {xpk-0.17.3 → xpk-1.0.0}/examples/fake_training.py +0 -0
  110. {xpk-0.17.3 → xpk-1.0.0}/examples/llama-3.1-finetuning/check_cuda.sh +0 -0
  111. {xpk-0.17.3 → xpk-1.0.0}/examples/llama-3.1-finetuning/requirements.txt +0 -0
  112. {xpk-0.17.3 → xpk-1.0.0}/examples/llama-3.1-finetuning/train.py +0 -0
  113. {xpk-0.17.3 → xpk-1.0.0}/examples/llama-3.1-finetuning/train.slurm +0 -0
  114. {xpk-0.17.3 → xpk-1.0.0}/examples/llama-3.1-finetuning/training_data.jsonl +0 -0
  115. {xpk-0.17.3 → xpk-1.0.0}/examples/nccl/nccl-a3mega.sh +0 -0
  116. {xpk-0.17.3 → xpk-1.0.0}/examples/nccl/nccl-a3ultra.sh +0 -0
  117. {xpk-0.17.3 → xpk-1.0.0}/examples/nccl/nccl.md +0 -0
  118. {xpk-0.17.3 → xpk-1.0.0}/examples/storage/filestore-manifest-attach.yaml +0 -0
  119. {xpk-0.17.3 → xpk-1.0.0}/examples/storage/gcsfuse-manifest.yaml +0 -0
  120. {xpk-0.17.3 → xpk-1.0.0}/examples/storage/lustre-manifest-attach.yaml +0 -0
  121. {xpk-0.17.3 → xpk-1.0.0}/examples/storage/parallelstore-manifest-attach.yaml +0 -0
  122. {xpk-0.17.3 → xpk-1.0.0}/examples/storage/pd-manifest-attach.yaml +0 -0
  123. {xpk-0.17.3 → xpk-1.0.0}/golden_buddy.sh +0 -0
  124. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_delete.txt +0 -0
  125. {xpk-0.17.3 → xpk-1.0.0}/goldens/Cluster_delete_force.txt +0 -0
  126. {xpk-0.17.3 → xpk-1.0.0}/goldens/Storage_list.txt +0 -0
  127. {xpk-0.17.3 → xpk-1.0.0}/goldens/Workload_create.txt +0 -0
  128. {xpk-0.17.3 → xpk-1.0.0}/goldens/Workload_create_pathways.txt +0 -0
  129. {xpk-0.17.3 → xpk-1.0.0}/goldens/Workload_create_sub-slicing.txt +0 -0
  130. {xpk-0.17.3 → xpk-1.0.0}/goldens/Workload_create_with_output-manifest-file.txt +0 -0
  131. {xpk-0.17.3 → xpk-1.0.0}/goldens/Workload_delete.txt +0 -0
  132. {xpk-0.17.3 → xpk-1.0.0}/goldens/Workload_list.txt +0 -0
  133. {xpk-0.17.3 → xpk-1.0.0}/goldens.yaml +0 -0
  134. {xpk-0.17.3 → xpk-1.0.0}/pylintrc +0 -0
  135. {xpk-0.17.3 → xpk-1.0.0}/pyproject.toml +0 -0
  136. {xpk-0.17.3 → xpk-1.0.0}/setup.cfg +0 -0
  137. {xpk-0.17.3 → xpk-1.0.0}/src/integration/README.md +0 -0
  138. {xpk-0.17.3 → xpk-1.0.0}/src/integration/__init__.py +0 -0
  139. {xpk-0.17.3 → xpk-1.0.0}/src/integration/docker_manager_test.py +0 -0
  140. {xpk-0.17.3 → xpk-1.0.0}/src/integration/gcluster_a3mega_test.py +0 -0
  141. {xpk-0.17.3 → xpk-1.0.0}/src/integration/gcluster_a3ultra_test.py +0 -0
  142. {xpk-0.17.3 → xpk-1.0.0}/src/integration/gcluster_a4_test.py +0 -0
  143. {xpk-0.17.3 → xpk-1.0.0}/src/integration/gcluster_test.py +0 -0
  144. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/__init__.py +0 -0
  145. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/api/__init__.py +0 -0
  146. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/api/storage_crd.yaml +0 -0
  147. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +0 -0
  148. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/blueprints/a3mega/storage_crd.yaml +0 -0
  149. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +0 -0
  150. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +0 -0
  151. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/nccl-installer.yaml +0 -0
  152. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/storage_crd.yaml +0 -0
  153. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/blueprints/a4/config-map.yaml.tftpl +0 -0
  154. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +0 -0
  155. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/blueprints/a4/storage_crd.yaml +0 -0
  156. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/__init__.py +0 -0
  157. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/cluster_gcluster.py +0 -0
  158. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/cluster_test.py +0 -0
  159. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/common.py +0 -0
  160. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/config.py +0 -0
  161. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/info.py +0 -0
  162. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/inspector.py +0 -0
  163. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/managed_ml_diagnostics.py +0 -0
  164. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/managed_ml_diagnostics_test.py +0 -0
  165. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/storage.py +0 -0
  166. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/commands/version.py +0 -0
  167. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/__init__.py +0 -0
  168. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/blueprint/__init__.py +0 -0
  169. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  170. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
  171. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/blueprint/blueprint_test.py +0 -0
  172. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/blueprint/testing/__init__.py +0 -0
  173. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a3_mega.yaml +0 -0
  174. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +0 -0
  175. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +0 -0
  176. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a4.yaml +0 -0
  177. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/capacity.py +0 -0
  178. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/capacity_test.py +0 -0
  179. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/cluster_private.py +0 -0
  180. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/cluster_test.py +0 -0
  181. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/commands.py +0 -0
  182. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/config_test.py +0 -0
  183. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/docker_container.py +0 -0
  184. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/docker_image.py +0 -0
  185. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/docker_manager.py +0 -0
  186. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/docker_resources.py +0 -0
  187. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/filestore.py +0 -0
  188. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/gcloud_context.py +0 -0
  189. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/gcloud_context_test.py +0 -0
  190. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/gcluster_manager.py +0 -0
  191. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/gcsfuse.py +0 -0
  192. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/jobset.py +0 -0
  193. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/kueue_manager.py +0 -0
  194. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/monitoring.py +0 -0
  195. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/mtc.py +0 -0
  196. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/nap.py +0 -0
  197. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/network.py +0 -0
  198. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/pathways.py +0 -0
  199. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/pathways_test.py +0 -0
  200. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/ray.py +0 -0
  201. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/remote_state/__init__.py +0 -0
  202. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  203. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  204. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/resources.py +0 -0
  205. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/storage.py +0 -0
  206. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/telemetry.py +0 -0
  207. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/telemetry_test.py +0 -0
  208. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/testing/__init__.py +0 -0
  209. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/testing/commands_tester.py +0 -0
  210. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/testing/commands_tester_test.py +0 -0
  211. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/updates.py +0 -0
  212. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/updates_test.py +0 -0
  213. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/vertex.py +0 -0
  214. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/workload.py +0 -0
  215. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
  216. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
  217. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  218. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
  219. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
  220. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
  221. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/core/workload_test.py +0 -0
  222. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/main.py +0 -0
  223. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/__init__.py +0 -0
  224. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/cluster.py +0 -0
  225. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/cluster_test.py +0 -0
  226. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/common_test.py +0 -0
  227. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/config.py +0 -0
  228. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/info.py +0 -0
  229. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/inspector.py +0 -0
  230. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/storage_test.py +0 -0
  231. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/validators.py +0 -0
  232. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/version.py +0 -0
  233. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/workload.py +0 -0
  234. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/parser/workload_test.py +0 -0
  235. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/telemetry_uploader.py +0 -0
  236. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/__init__.py +0 -0
  237. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +0 -0
  238. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
  239. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/filestore-pv.yaml +0 -0
  240. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/filestore-pvc.yaml +0 -0
  241. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/filestore-sc.yaml +0 -0
  242. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/fuse-pv.yaml +0 -0
  243. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/fuse-pvc.yaml +0 -0
  244. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/kueue_config.yaml.j2 +0 -0
  245. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
  246. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
  247. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/kueue_super_slicing_topology.yaml.j2 +0 -0
  248. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/mtc-cpc.yaml +0 -0
  249. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/templates/storage.yaml +0 -0
  250. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/__init__.py +0 -0
  251. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/console.py +0 -0
  252. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/console_test.py +0 -0
  253. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/execution_context.py +0 -0
  254. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/file.py +0 -0
  255. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/gcs_utils.py +0 -0
  256. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/kubectl.py +0 -0
  257. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/kueue.py +0 -0
  258. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/network.py +0 -0
  259. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/objects.py +0 -0
  260. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/templates.py +0 -0
  261. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/topology.py +0 -0
  262. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/topology_test.py +0 -0
  263. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/user_agent.py +0 -0
  264. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/user_agent_test.py +0 -0
  265. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/user_input.py +0 -0
  266. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/user_input_test.py +0 -0
  267. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/validation.py +0 -0
  268. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/validation_test.py +0 -0
  269. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/versions.py +0 -0
  270. {xpk-0.17.3 → xpk-1.0.0}/src/xpk/utils/yaml.py +0 -0
  271. {xpk-0.17.3 → xpk-1.0.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  272. {xpk-0.17.3 → xpk-1.0.0}/src/xpk.egg-info/entry_points.txt +0 -0
  273. {xpk-0.17.3 → xpk-1.0.0}/src/xpk.egg-info/requires.txt +0 -0
  274. {xpk-0.17.3 → xpk-1.0.0}/src/xpk.egg-info/top_level.txt +0 -0
  275. {xpk-0.17.3 → xpk-1.0.0}/tools/install-gke-auth-plugin.sh +0 -0
  276. {xpk-0.17.3 → xpk-1.0.0}/xpk-large-scale-guide.sh +0 -0
  277. {xpk-0.17.3 → xpk-1.0.0}/xpk-notebooks.md +0 -0
  278. {xpk-0.17.3 → xpk-1.0.0}/xpk.py +0 -0
@@ -44,7 +44,6 @@ runs:
44
44
  run: gcloud auth configure-docker --quiet
45
45
  shell: bash
46
46
  - uses: ./.github/actions/install-kueue
47
- - uses: ./.github/actions/install-kjob
48
47
  - name: Install XPK
49
48
  run: pip install dist/xpk-*.whl
50
49
  shell: bash
@@ -49,14 +49,13 @@ jobs:
49
49
  lookup-only: true
50
50
  - name: install dependencies
51
51
  if : steps.check-cache.outputs.cache-hit != 'true'
52
- run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
52
+ run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue
53
53
  - name: Cache dependencies
54
54
  if : steps.check-cache.outputs.cache-hit != 'true'
55
55
  uses: actions/cache/save@v3
56
56
  with:
57
57
  path: |
58
58
  /usr/local/bin/kubectl-kueue
59
- /usr/local/bin/kubectl-kjob
60
59
  ~/.cache/pip
61
60
  ${{env.pythonLocation}}
62
61
  key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}
@@ -152,62 +152,6 @@ jobs:
152
152
  run: xpk info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
153
153
  - name: Delete the workload on the cluster
154
154
  run: xpk workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
155
- - name: Create test script to execute in batch
156
- run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
157
- - name: Run a batch job on the cluster
158
- run: xpk batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3
159
- - name: List out the jobs on the cluster
160
- run: xpk job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-'
161
- - name: Get created job name
162
- run: |
163
- JOB_NAME=$(xpk job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | grep 'multislice-queue' | head -1 | awk '{print $1}')
164
- echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
165
- - name: Check job spec
166
- run: |
167
- job_spec=$(kubectl get job ${JOB_NAME} -o jsonpath='{.spec}')
168
- echo "$job_spec" | grep '"completions":2'
169
- echo "$job_spec" | grep '"parallelism":2'
170
- echo "$job_spec" | jq '.template.spec.containers | length' | grep 3
171
- - name: Get job info for the last job created on the cluster
172
- run: xpk job info ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
173
- - name: Cancel the batch job on the cluster
174
- run: xpk job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted"
175
- - name: Create shell and exit it immediately
176
- run: |
177
- cat <<EOF > create-shell.exp
178
- #!/usr/bin/expect
179
- set timeout 180
180
- spawn sh -c "xpk shell --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee shell.log"
181
- send "\n"
182
- expect {
183
- "/ # " {
184
- send "exit\n"
185
- # Wait for EOF after exit
186
- expect eof
187
- exit 0
188
- }
189
- timeout {
190
- puts "Timed out waiting for pod to be running"
191
- exit 1
192
- }
193
- eof {
194
- puts "Unexpected EOF before getting prompt"
195
- exit 1
196
- }
197
- }
198
- EOF
199
- chmod +x ./create-shell.exp
200
- expect ./create-shell.exp
201
- - name: Check if shell exists and is running
202
- run: |
203
- pod_name=$(grep 'waiting for pod' shell.log | awk -F'"' '{print $2}')
204
- kubectl wait --for='jsonpath={.status.conditions[?(@.type=="Ready")].status}=True' --timeout=1m pod/${pod_name}
205
- - name: Stop the shell
206
- run: xpk shell stop --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
207
- - name: Delete create-shell.exp file
208
- run: rm create-shell.exp
209
- - name: Delete shell.log file
210
- run: rm shell.log
211
155
  - name: Delete the cluster created
212
156
  if: always()
213
157
  run: xpk cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force
@@ -47,14 +47,13 @@ jobs:
47
47
  lookup-only: true
48
48
  - name: install dependencies
49
49
  if: steps.check-cache.outputs.cache-hit != 'true'
50
- run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
50
+ run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue
51
51
  - name: Cache dependencies
52
52
  if: steps.check-cache.outputs.cache-hit != 'true'
53
53
  uses: actions/cache/save@v3
54
54
  with:
55
55
  path: |
56
56
  /usr/local/bin/kubectl-kueue
57
- /usr/local/bin/kubectl-kjob
58
57
  ~/.cache/pip
59
58
  ${{env.pythonLocation}}
60
59
  key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}
@@ -36,8 +36,8 @@ jobs:
36
36
  with:
37
37
  mode: minimum
38
38
  count: 1
39
- labels: "release-improvements, release-bugfix, release-features"
40
- message: "This PR is being prevented from merging because it is not labeled. Please add a label to this PR. Accepted labels: release-improvements, release-bugfix, release-features"
39
+ labels: "release-improvements, release-bugfix, release-features, release-breaking"
40
+ message: "This PR is being prevented from merging because it is not labeled. Please add a label to this PR. Accepted labels: release-improvements, release-bugfix, release-features, release-breaking"
41
41
  - id: do-not-merge
42
42
  uses: mheap/github-action-required-labels@v5
43
43
  with:
@@ -23,31 +23,29 @@ permissions:
23
23
  contents: read
24
24
 
25
25
  jobs:
26
- build_kjob:
27
- uses: ./.github/workflows/reusable_build_kjob.yaml
28
26
  build_wheel:
29
27
  uses: ./.github/workflows/reusable_build_wheel.yaml
30
28
  build_actions:
31
29
  uses: ./.github/workflows/reusable_build_scripts.yaml
32
30
  basic_cluster_create:
33
- needs: [build_kjob, build_actions, build_wheel]
31
+ needs: [build_actions, build_wheel]
34
32
  uses: ./.github/workflows/integration_basic_cluster_create.yaml
35
33
  secrets: inherit
36
34
 
37
35
  pathways_cluster_create:
38
- needs: [build_kjob, build_actions, build_wheel]
36
+ needs: [build_actions, build_wheel]
39
37
  uses: ./.github/workflows/integration_pathways_cluster_create.yaml
40
38
  secrets: inherit
41
39
 
42
40
  ray_cluster_create:
43
- needs: [build_kjob, build_actions, build_wheel]
41
+ needs: [build_actions, build_wheel]
44
42
  uses: ./.github/workflows/integration_ray_cluster_create.yaml
45
43
  secrets: inherit
46
44
  legacy_integration:
47
- needs: [build_kjob, build_actions, build_wheel]
45
+ needs: [build_actions, build_wheel]
48
46
  uses: ./.github/workflows/integration_legacy_tests.yaml
49
47
  secrets: inherit
50
48
  storage-tests:
51
- needs: [build_kjob, build_actions, build_wheel]
49
+ needs: [build_actions, build_wheel]
52
50
  uses: ./.github/workflows/integration_storage_tests.yaml
53
51
  secrets: inherit
@@ -33,7 +33,6 @@ jobs:
33
33
  with:
34
34
  path: |
35
35
  /usr/local/bin/kubectl-kueue
36
- /usr/local/bin/kubectl-kjob
37
36
  ~/.cache/pip
38
37
  ${{env.pythonLocation}}
39
38
  key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}}
@@ -42,7 +42,6 @@ jobs:
42
42
  with:
43
43
  path: |
44
44
  /usr/local/bin/kubectl-kueue
45
- /usr/local/bin/kubectl-kjob
46
45
  ~/.cache/pip
47
46
  ${{env.pythonLocation}}
48
47
  key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}}
@@ -39,7 +39,6 @@ jobs:
39
39
  with:
40
40
  path: |
41
41
  /usr/local/bin/kubectl-kueue
42
- /usr/local/bin/kubectl-kjob
43
42
  ~/.cache/pip
44
43
  ${{env.pythonLocation}}
45
44
  key: xpk-deps-${{matrix.python-version}}-${{github.run_id}}-${{github.run_attempt}}
@@ -92,8 +92,6 @@ jobs:
92
92
  --auto-mount=true --vol=vol1 --mount-point='/${{inputs.storage-type}}-test-mount-point' --readonly=false
93
93
  - name: List and verify existing Storages
94
94
  run: xpk storage list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | tee output.txt | grep ${{inputs.storage-name}} || (echo 'No storage found' && exit 143)
95
- - name: Verify VolumeBundle created
96
- run: kubectl get volumebundle ${{inputs.storage-name}} -o jsonpath='{.spec.containerVolumeMounts[0].mountPath}' | grep '/${{inputs.storage-type}}-test-mount-point'
97
95
  - name: Verify Persistent Volume mount options
98
96
  if: inputs.storage-command == 'attach' && inputs.storage-type == 'gcsfuse'
99
97
  run: kubectl get pv ${{inputs.storage-name}}-pv -oyaml | grep rename-dir-limit=10000 || (echo 'Invalid storage mount options' && exit 143)
@@ -114,45 +112,6 @@ jobs:
114
112
  run: xpk workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $STORAGE_READ_WORKLOAD --timeout 300
115
113
  - name: Delete the reader workload on the cluster
116
114
  run: xpk workload delete --workload $STORAGE_READ_WORKLOAD --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
117
- - name: Create batch-read.sh script
118
- run: |
119
- cat <<EOF > batch-read.sh
120
- #!/bin/bash
121
- grep 'Test text message' /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)
122
- EOF
123
- - name: Run a batch-read job on the cluster
124
- run: xpk batch --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} batch-read.sh | tee batch-read.log
125
- - name: Get job name
126
- run: |
127
- cat batch-read.log | grep 'xpk-def-app-profile-slurm-'
128
- READ_JOB_NAME=$(grep 'Job name: xpk-def-app-profile-slurm-' batch-read.log | awk -F': ' '{print $2}')
129
- echo "READ_JOB_NAME=${READ_JOB_NAME}" >> $GITHUB_ENV
130
- - name: Wait for the batch-read job to finish
131
- run: kubectl wait job.batch/$READ_JOB_NAME --for=condition=Complete --timeout=1m
132
- - name: Cancel the batch-read job
133
- run: xpk job cancel $READ_JOB_NAME --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | grep "job.batch/$READ_JOB_NAME deleted"
134
- - name: Delete batch-read.log file
135
- run: rm batch-read.log
136
- - name: Run a run-read job on the cluster
137
- run: xpk run --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} batch-read.sh --timeout 60
138
- - name: Delete batch-read.sh file
139
- run: rm batch-read.sh
140
- - name: Create shell and exit it immediately
141
- run: |
142
- cat <<EOF >> create-shell.exp
143
- ##!/usr/bin/expect
144
- spawn xpk shell --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
145
- expect "/ # "
146
- send "cat /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt\n"
147
- expect "Test text message"
148
- send "exit\n"
149
- EOF
150
- chmod +x ./create-shell.exp
151
- expect ./create-shell.exp
152
- - name: Stop the shell
153
- run: xpk shell stop --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
154
- - name: Delete create-shell.exp file
155
- run: rm create-shell.exp
156
115
  - name: Run workload to delete file on filestore
157
116
  run : xpk workload create --workload $STORAGE_DELETE_WORKLOAD --command "rm -rf /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || exit 143" --num-slices=1 --cluster ${{inputs.cluster-name}} --device-type=${{inputs.device-type}} --zone ${{inputs.zone}}
158
117
  - name: Wait for delete workload completion and confirm it succeeded
@@ -61,9 +61,6 @@ jobs:
61
61
  - name: Detach storage volumes
62
62
  if: always()
63
63
  run: xpk storage detach ${{inputs.storage-name}} --cluster=${{inputs.cluster-name}} --zone=${{inputs.zone}}
64
- - name: Verify VolumeBundle deleted
65
- run: |
66
- ! kubectl get volumebundle | grep ${{inputs.storage-name}}
67
64
  - name: Delete GCP Filestore Storage instance
68
65
  if: always() && inputs.storage-command == 'delete'
69
66
  run: xpk storage delete ${{inputs.storage-name}} --cluster=${{inputs.cluster-name}} --zone=${{inputs.zone}}
@@ -33,7 +33,6 @@ jobs:
33
33
  with:
34
34
  path: |
35
35
  /usr/local/bin/kubectl-kueue
36
- /usr/local/bin/kubectl-kjob
37
36
  ~/.cache/pip
38
37
  ${{env.pythonLocation}}
39
38
  key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}}
@@ -2,25 +2,21 @@ KUEUE_REPO=https://github.com/kubernetes-sigs/kueue.git
2
2
 
3
3
  KUBECTL_VERSION := $(shell curl -L -s https://dl.k8s.io/release/stable.txt)
4
4
  KUEUE_VERSION=v0.14.3
5
- KJOB_VERSION=v0.1.0
6
5
 
7
6
  OS := $(shell uname -s | tr A-Z a-z)
8
7
  PLATFORM := $(shell uname -m | sed -e 's/aarch64/arm64/' | sed -e 's/x86_64/amd64/')
9
8
 
10
9
  KUBECTL_URL = "https://dl.k8s.io/release/$(KUBECTL_VERSION)/bin/$(OS)/$(PLATFORM)/kubectl"
11
10
  KUEUECTL_URL = "https://github.com/kubernetes-sigs/kueue/releases/download/$(KUEUE_VERSION)/kubectl-kueue-$(OS)-$(PLATFORM)"
12
- KJOBCTL_URL = "https://github.com/kubernetes-sigs/kjob/releases/download/$(KJOB_VERSION)/kubectl-kjob-$(OS)-$(PLATFORM)"
13
11
 
14
12
  PROJECT_DIR := $(realpath $(shell dirname $(firstword $(MAKEFILE_LIST))))
15
- KJOB_DOCKER_IMG := xpk_kjob
16
- KJOB_DOCKER_CONTAINER := xpk_kjob_container
17
13
  BIN_PATH=$(PROJECT_DIR)/bin
18
14
 
19
15
  .PHONY: install
20
- install: check-python check-gcloud install-gcloud-auth-plugin install-kueuectl install-kjobctl pip-install
16
+ install: check-python check-gcloud install-gcloud-auth-plugin install-kueuectl pip-install
21
17
 
22
18
  .PHONY: install-dev
23
- install-dev: check-python check-gcloud mkdir-bin install-kueuectl install-kjobctl pip-install pip-install-dev install-pytest install-lint
19
+ install-dev: check-python check-gcloud mkdir-bin install-kueuectl pip-install pip-install-dev install-pytest install-lint
24
20
 
25
21
  .PHONY: pip-install-dev
26
22
  pip-install-dev:
@@ -54,16 +50,6 @@ install-kueuectl: mkdir-bin
54
50
  curl -Lo $(BIN_PATH)/kubectl-kueue $(KUEUECTL_URL);
55
51
  chmod +x $(BIN_PATH)/kubectl-kueue;
56
52
 
57
- .PHONY: install-kjobctl
58
- install-kjobctl: mkdir-bin
59
- #curl -Lo $(BIN_PATH)/kubectl-kjob $(KJOBCTL_URL)
60
- #chmod +x $(BIN_PATH)/kubectl-kjob
61
- # TODO: Switch to kjob release-based installation once version >=0.2.0 is available.
62
- chmod +x tools/build-kjob.sh
63
- ./tools/build-kjob.sh
64
- mv kubectl-kjob $(BIN_PATH)/kubectl-kjob
65
- chmod +x $(BIN_PATH)/kubectl-kjob
66
-
67
53
  .PHONY: install-gcloud-auth-plugin
68
54
  install-gcloud-auth-plugin:
69
55
  chmod +x tools/install-gke-auth-plugin.sh
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 0.17.3
3
+ Version: 1.0.0
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -114,10 +114,21 @@ XPK also supports the following [Google Cloud Storage solutions](./docs/usage/st
114
114
  * [Storage](./docs/usage/storage.md)
115
115
  * [Advanced](./docs/usage/advanced.md)
116
116
  * [Inspector](./docs/usage/inspector.md)
117
- * [Run](./docs/usage/run.md)
118
- * [Job](./docs/usage/job.md)
119
117
  * [Troubleshooting](./docs/troubleshooting.md)
120
- * [Local Testing](./docs/local_testing.md)
118
+
119
+ # Privacy notice
120
+
121
+ To help improve XPK, feature usage statistics are collected and sent to Google. You can opt out at any time by executing
122
+ the following shell command:
123
+
124
+ ```shell
125
+ xpk config set send-telemetry <true/false>
126
+ ```
127
+
128
+ XPK telemetry overall is handled in accordance with the [Google Privacy Policy](https://policies.google.com/privacy). When
129
+ you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
130
+ [Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
131
+
121
132
 
122
133
  # Contributing
123
134
 
@@ -73,10 +73,21 @@ XPK also supports the following [Google Cloud Storage solutions](./docs/usage/st
73
73
  * [Storage](./docs/usage/storage.md)
74
74
  * [Advanced](./docs/usage/advanced.md)
75
75
  * [Inspector](./docs/usage/inspector.md)
76
- * [Run](./docs/usage/run.md)
77
- * [Job](./docs/usage/job.md)
78
76
  * [Troubleshooting](./docs/troubleshooting.md)
79
- * [Local Testing](./docs/local_testing.md)
77
+
78
+ # Privacy notice
79
+
80
+ To help improve XPK, feature usage statistics are collected and sent to Google. You can opt out at any time by executing
81
+ the following shell command:
82
+
83
+ ```shell
84
+ xpk config set send-telemetry <true/false>
85
+ ```
86
+
87
+ XPK telemetry overall is handled in accordance with the [Google Privacy Policy](https://policies.google.com/privacy). When
88
+ you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
89
+ [Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
90
+
80
91
 
81
92
  # Contributing
82
93
 
@@ -44,7 +44,6 @@ Depending on your chosen installation method, you may need these additional tool
44
44
  | Install Method | Tool | Notes |
45
45
  | :--- | :--- | :--- |
46
46
  | **Pip** | **kueuectl** | [Installation instructions](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/) |
47
- | **Pip** | **kjob** | [Installation instructions](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md) |
48
47
  | **Source** | **git** | Install via your package manager (e.g., `sudo apt-get install git` on Debian/Ubuntu) |
49
48
  | **Source** | **make** | Install via your package manager (e.g., `sudo apt-get install make` on Debian/Ubuntu) |
50
49
 
@@ -38,7 +38,7 @@ Some XPK cluster configuration might be missing, if workload creation fails with
38
38
 
39
39
  `[XPK] b'error: the server doesn\'t have a resource type "workloads"\n'`
40
40
 
41
- Mitigate this error by re-running your `xpk.py cluster create ...` command, to refresh the cluster configurations.
41
+ Mitigate this error by re-running your `xpk cluster create ...` command, to refresh the cluster configurations.
42
42
 
43
43
  ## Permission Issues: `requires one of ["permission_name"] permission(s)`.
44
44
 
@@ -254,6 +254,35 @@ xpk cluster create \
254
254
 
255
255
  will fail the cluster creation process because Vertex AI Tensorboard is not supported in `us-central2`.
256
256
 
257
+ ### Create Cluster With Google Cloud ML Diagnostics Enabled
258
+
259
+ Google Cloud ML Diagnostics is an end-to-end managed platform for ML Engineers to optimize and diagnose their AI/ML workloads on Google Cloud. The product allows ML Engineers to collect and visualize all their workload metrics, configs and profiles with one single platform, all within the same UI. The current product offering focuses on workloads running on XLA-based frameworks (JAX, Pytorch XLA, Tensorflow/Keras) on Google Cloud TPUs and GPUs. Current support is for JAX on Google Cloud TPUs only.
260
+
261
+ Enabling ML Diagnostics is streamlined and simplified through XPK cluster creation commands.
262
+
263
+ By adding the **--managed-mldiagnostics** flag during the execution of either **xpk cluster create** or **xpk cluster create-pathways**, the ML Diagnostics functionality is enabled. This flag ensures the necessary supporting components (such as the injection-webhook and connection-operator) are automatically configured, allowing the feature to function seamlessly in both Pathways and non-Pathways execution environments.
264
+
265
+ **Example Usage:**
266
+
267
+ * Cluster Create for Pathways with flag **--managed-mldiagnostics**:
268
+
269
+ ```shell
270
+ xpk cluster create-pathways \
271
+ --cluster xpk-pw-test \
272
+ --num-slices=4 --spot \
273
+ --tpu-type=v5litepod-16 \
274
+ --managed-mldiagnostics
275
+ ```
276
+
277
+ * Cluster Create (provision spot / preemptible capacity) with flag **--managed-mldiagnostics**:
278
+
279
+ ```shell
280
+ xpk cluster create \
281
+ --cluster xpk-test --tpu-type=v5litepod-16 \
282
+ --num-slices=4 --spot \
283
+ --managed-mldiagnostics
284
+ ```
285
+
257
286
  ## Cluster Delete
258
287
  * Cluster Delete (deprovision capacity):
259
288
 
@@ -9,7 +9,7 @@ gcloud container get-server-config --project=golden-project --region=us-central1
9
9
  [XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
10
10
  gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
11
11
  [XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
12
- gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --location-policy=BALANCED --scopes=storage-full,gke-default
12
+ gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --enable-ip-alias --enable-dataplane-v2 --enable-multi-networking --location-policy=BALANCED --scopes=storage-full,gke-default
13
13
  [XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
14
14
  gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
15
15
  [XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
@@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-
37
37
  [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
38
38
  gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
39
39
  [XPK] Creating 1 node pool or pools of tpu7x-8
40
- We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
40
+ We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
41
41
  [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
42
42
  gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
43
43
  [XPK] Creating 1 node pool or pools of tpu7x-8
44
- Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
44
+ Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
45
45
  [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
46
46
  gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
47
47
  [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
@@ -9,7 +9,7 @@ gcloud container get-server-config --project=golden-project --region=us-central1
9
9
  [XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
10
10
  gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
11
11
  [XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
12
- gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --location-policy=BALANCED --scopes=storage-full,gke-default
12
+ gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --enable-ip-alias --enable-dataplane-v2 --enable-multi-networking --location-policy=BALANCED --scopes=storage-full,gke-default
13
13
  [XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
14
14
  gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
15
15
  [XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
@@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-
37
37
  [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
38
38
  gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
39
39
  [XPK] Creating 1 node pool or pools of tpu7x-16
40
- We assume that the underlying system is: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
40
+ We assume that the underlying system is: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
41
41
  [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
42
42
  gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
43
43
  [XPK] Creating 1 node pool or pools of tpu7x-16
44
- Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
44
+ Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
45
45
  [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
46
46
  gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
47
47
  [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
@@ -4,6 +4,8 @@ $ xpk cluster create-pathways --project=golden-project --zone=us-central1-a --cl
4
4
  [XPK] Working on golden-project and us-central1-a
5
5
  [XPK] Task: `Retrieve available pathways machine types` is implemented by the following command not running since it is a dry run.
6
6
  gcloud compute machine-types list --filter "guestCpus >= 49 AND memoryMb >= 238592 AND zone = 'us-central1-a'" --format="value(name)" --project=golden-project
7
+ [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
8
+ gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a
7
9
  [XPK] Task: `Determine server supported GKE versions for default gke version` is implemented by the following command not running since it is a dry run.
8
10
  gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
9
11
  [XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.
@@ -11,7 +13,7 @@ gcloud container get-server-config --project=golden-project --region=us-central1
11
13
  [XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
12
14
  gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
13
15
  [XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
14
- gcloud beta container clusters create golden-cluster-private --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=n1-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 4 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --enable-master-authorized-networks --enable-private-nodes --location-policy=BALANCED --scopes=storage-full,gke-default --enable-ip-alias
16
+ gcloud beta container clusters create golden-cluster-private --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=n1-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 4 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --enable-ip-alias --enable-dataplane-v2 --enable-multi-networking --enable-master-authorized-networks --enable-private-nodes --location-policy=BALANCED --scopes=storage-full,gke-default
15
17
  [XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
16
18
  gcloud container clusters list --project=golden-project --filter=name=golden-cluster-private --format="value(location)"
17
19
  [XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
@@ -41,13 +43,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-
41
43
  [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
42
44
  gcloud beta container clusters describe golden-cluster-private --location us-central1 --project golden-project --format="value(currentMasterVersion)"
43
45
  [XPK] Creating 1 node pool or pools of v5p-8
44
- We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
46
+ We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
45
47
  [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
46
48
  gcloud beta container node-pools list --cluster golden-cluster-private --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
47
49
  [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
48
50
  gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a
49
51
  [XPK] Creating 1 node pool or pools of v5p-8
50
- Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
52
+ Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
51
53
  [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
52
54
  gcloud beta container node-pools describe 0 --cluster golden-cluster-private --project=golden-project --location=us-central1 --format="value(locations)"
53
55
  [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
@@ -4,6 +4,8 @@ $ SUB_SLICING_ENABLED=true xpk cluster create --project=golden-project --zone=us
4
4
  [XPK] Working on golden-project and us-central1-a
5
5
  [XPK] Task: `Get reservation deployment type` is implemented by the following command not running since it is a dry run.
6
6
  gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="value(deploymentType)"
7
+ [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
8
+ gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a
7
9
  [XPK] Task: `Determine server supported GKE versions for default gke version` is implemented by the following command not running since it is a dry run.
8
10
  gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
9
11
  [XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.
@@ -11,7 +13,7 @@ gcloud container get-server-config --project=golden-project --region=us-central1
11
13
  [XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
12
14
  gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
13
15
  [XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
14
- gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --location-policy=BALANCED --scopes=storage-full,gke-default
16
+ gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --enable-ip-alias --enable-dataplane-v2 --enable-multi-networking --location-policy=BALANCED --scopes=storage-full,gke-default
15
17
  [XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
16
18
  gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
17
19
  [XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
@@ -39,19 +41,19 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-
39
41
  [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
40
42
  gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
41
43
  [XPK] Creating 1 node pool or pools of v6e-4x4
42
- We assume that the underlying system is: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
44
+ We assume that the underlying system is: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
43
45
  [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
44
46
  gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
45
47
  [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
46
48
  gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a
47
49
  [XPK] Creating 1 node pool or pools of v6e-16
48
- Underlyingly, we assume that means: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
50
+ Underlyingly, we assume that means: SystemCharacteristics(topology='4x4', vms_per_slice=4, gke_accelerator='tpu-v6e-slice', gce_machine_type='ct6e-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v6e-16', supports_sub_slicing=True, supports_super_slicing=False, supports_accelerator_network_profile=True, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
49
51
  [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
50
52
  gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
51
53
  [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
52
54
  kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
53
55
  [XPK] Existing node pool names ['0']
54
- [XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=ct6e-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=4 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --tpu-topology=4x4 --max-pods-per-node 15
56
+ [XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=ct6e-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --accelerator-network-profile=auto --node-labels=cloud.google.com/gke-networking-dra-driver=true --node-version=0 --num-nodes=4 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --tpu-topology=4x4 --max-pods-per-node 15
55
57
  [XPK] Breaking up a total of 1 commands into 1 batches
56
58
  [XPK] Pretending all the jobs succeeded
57
59
  [XPK] Create or delete node pool request complete.
@@ -4,6 +4,8 @@ $ SUPER_SLICING_ENABLED=true xpk cluster create --project=golden-project --zone=
4
4
  [XPK] Working on golden-project and us-central1-a
5
5
  [XPK] Task: `Get reservation deployment type` is implemented by the following command not running since it is a dry run.
6
6
  gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="value(deploymentType)"
7
+ [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
8
+ gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a
7
9
  [XPK] Task: `Determine server supported GKE versions for default gke version` is implemented by the following command not running since it is a dry run.
8
10
  gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
9
11
  [XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.
@@ -11,7 +13,7 @@ gcloud container get-server-config --project=golden-project --region=us-central1
11
13
  [XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
12
14
  gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
13
15
  [XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
14
- gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --location-policy=BALANCED --scopes=storage-full,gke-default --enable-slice-controller
16
+ gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --enable-ip-alias --enable-dataplane-v2 --enable-multi-networking --location-policy=BALANCED --scopes=storage-full,gke-default --enable-slice-controller
15
17
  [XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
16
18
  gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
17
19
  [XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
@@ -39,13 +41,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-
39
41
  [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
40
42
  gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
41
43
  [XPK] Creating 5 node pool or pools of tpu7x-4x4x4
42
- We assume that the underlying system is: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
44
+ We assume that the underlying system is: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
43
45
  [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
44
46
  gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
45
47
  [XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
46
48
  gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a
47
49
  [XPK] Creating 5 node pool or pools of tpu7x-128
48
- Underlyingly, we assume that means: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
50
+ Underlyingly, we assume that means: SystemCharacteristics(topology='4x4x4', vms_per_slice=16, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-128', supports_sub_slicing=False, supports_super_slicing=True, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
49
51
  [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
50
52
  gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
51
53
  [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.