xpk 0.17.2__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (291)
  1. {xpk-0.17.2 → xpk-1.0.0}/.github/actions/setup-test-env/action.yml +0 -1
  2. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/build_tests.yaml +1 -2
  3. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_basic_cluster_create.yaml +0 -56
  4. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_legacy_tests.yaml +1 -2
  5. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/label-validation.yaml +2 -2
  6. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/nightly_tests.yaml +5 -7
  7. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_goldens.yaml +0 -1
  8. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_integration_tests.yaml +0 -1
  9. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_lint_and_format.yml +0 -1
  10. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_storage_create.yaml +0 -41
  11. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_storage_delete.yaml +0 -3
  12. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_unit_tests.yaml +0 -1
  13. {xpk-0.17.2 → xpk-1.0.0}/Makefile +2 -16
  14. {xpk-0.17.2/src/xpk.egg-info → xpk-1.0.0}/PKG-INFO +15 -4
  15. {xpk-0.17.2 → xpk-1.0.0}/README.md +14 -3
  16. {xpk-0.17.2 → xpk-1.0.0}/docs/installation.md +0 -1
  17. {xpk-0.17.2 → xpk-1.0.0}/docs/troubleshooting.md +1 -1
  18. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/clusters.md +29 -0
  19. {xpk-0.17.2 → xpk-1.0.0}/goldens/Basic_cluster_create.txt +3 -88
  20. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_for_multi-host_nodepool.txt +3 -88
  21. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_private.txt +5 -88
  22. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_sub-slicing.txt +6 -89
  23. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_super-slicing.txt +5 -88
  24. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt +3 -88
  25. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt +3 -88
  26. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_Managed_Lustre_driver.txt +3 -88
  27. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt +3 -88
  28. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_gb200-4.txt +6 -89
  29. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_shared_reservation.txt +5 -88
  30. {xpk-0.17.2 → xpk-1.0.0}/goldens/NAP_cluster-create.txt +3 -88
  31. {xpk-0.17.2 → xpk-1.0.0}/goldens/NAP_cluster-create_with_pathways.txt +3 -88
  32. {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_create_super-slicing.txt +3 -3
  33. {xpk-0.17.2 → xpk-1.0.0}/goldens.yaml +0 -8
  34. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/cluster.py +4 -35
  35. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/cluster_gcluster.py +1 -13
  36. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/cluster_gcluster_test.py +2 -10
  37. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/cluster_test.py +0 -4
  38. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/workload.py +10 -3
  39. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/workload_test.py +1 -0
  40. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/cluster.py +10 -9
  41. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/config.py +5 -17
  42. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/kueue_manager_test.py +2 -0
  43. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/nodepool.py +6 -0
  44. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/nodepool_test.py +4 -0
  45. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/scheduling.py +28 -3
  46. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/scheduling_test.py +38 -1
  47. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/system_characteristics.py +39 -16
  48. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/system_characteristics_test.py +11 -0
  49. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -15
  50. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -8
  51. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -78
  52. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -16
  53. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/common.py +0 -17
  54. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/core.py +0 -39
  55. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/storage.py +0 -11
  56. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/feature_flags.py +1 -1
  57. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/validation.py +0 -8
  58. {xpk-0.17.2 → xpk-1.0.0/src/xpk.egg-info}/PKG-INFO +15 -4
  59. {xpk-0.17.2 → xpk-1.0.0}/src/xpk.egg-info/SOURCES.txt +0 -26
  60. {xpk-0.17.2 → xpk-1.0.0}/tools/install-xpk.sh +0 -4
  61. xpk-0.17.2/.github/actions/install-kjob/action.yml +0 -35
  62. xpk-0.17.2/.github/workflows/reusable_build_kjob.yaml +0 -23
  63. xpk-0.17.2/docs/local_testing.md +0 -61
  64. xpk-0.17.2/docs/usage/job.md +0 -41
  65. xpk-0.17.2/docs/usage/run.md +0 -44
  66. xpk-0.17.2/examples/batch.md +0 -24
  67. xpk-0.17.2/examples/job.sh +0 -12
  68. xpk-0.17.2/goldens/Batch.txt +0 -19
  69. xpk-0.17.2/goldens/Cluster_create_for_single-host_single-slice_TPU.txt +0 -199
  70. xpk-0.17.2/goldens/Job_cancel.txt +0 -14
  71. xpk-0.17.2/goldens/Job_info.txt +0 -21
  72. xpk-0.17.2/goldens/Job_list.txt +0 -14
  73. xpk-0.17.2/src/xpk/commands/batch.py +0 -144
  74. xpk-0.17.2/src/xpk/commands/job.py +0 -244
  75. xpk-0.17.2/src/xpk/commands/kind.py +0 -286
  76. xpk-0.17.2/src/xpk/commands/kjob_common.py +0 -60
  77. xpk-0.17.2/src/xpk/commands/run.py +0 -140
  78. xpk-0.17.2/src/xpk/commands/shell.py +0 -142
  79. xpk-0.17.2/src/xpk/parser/batch.py +0 -43
  80. xpk-0.17.2/src/xpk/parser/job.py +0 -147
  81. xpk-0.17.2/src/xpk/parser/kind.py +0 -95
  82. xpk-0.17.2/src/xpk/parser/run.py +0 -47
  83. xpk-0.17.2/src/xpk/parser/shell.py +0 -59
  84. xpk-0.17.2/tools/Dockerfile-kjob +0 -33
  85. xpk-0.17.2/tools/build-kjob.sh +0 -9
  86. xpk-0.17.2/xpk-slurm-commands.md +0 -382
  87. {xpk-0.17.2 → xpk-1.0.0}/.dockerignore +0 -0
  88. {xpk-0.17.2 → xpk-1.0.0}/.github/CODEOWNERS +0 -0
  89. {xpk-0.17.2 → xpk-1.0.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  90. {xpk-0.17.2 → xpk-1.0.0}/.github/actions/install-kueue/action.yml +0 -0
  91. {xpk-0.17.2 → xpk-1.0.0}/.github/release.yaml +0 -0
  92. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/README.md +0 -0
  93. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/build_wheels.yaml +0 -0
  94. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/cleanup.yaml +0 -0
  95. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/gemini-dispatch.yml +0 -0
  96. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/gemini-invoke.yml +0 -0
  97. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/gemini-review.yml +0 -0
  98. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/gemini-scheduled-triage.yml +0 -0
  99. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/gemini-triage.yml +0 -0
  100. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_pathways_cluster_create.yaml +0 -0
  101. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_ray_cluster_create.yaml +0 -0
  102. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_storage_tests.yaml +0 -0
  103. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/periodic_release.yaml +0 -0
  104. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/release_branch_versioning.yaml +0 -0
  105. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_build_scripts.yaml +0 -0
  106. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_build_wheel.yaml +0 -0
  107. {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/stale.yaml +0 -0
  108. {xpk-0.17.2 → xpk-1.0.0}/.gitignore +0 -0
  109. {xpk-0.17.2 → xpk-1.0.0}/.pre-commit-config.yaml +0 -0
  110. {xpk-0.17.2 → xpk-1.0.0}/LICENSE +0 -0
  111. {xpk-0.17.2 → xpk-1.0.0}/backoff_retry.sh +0 -0
  112. {xpk-0.17.2 → xpk-1.0.0}/data/Dockerfile +0 -0
  113. {xpk-0.17.2 → xpk-1.0.0}/docs/code-of-conduct.md +0 -0
  114. {xpk-0.17.2 → xpk-1.0.0}/docs/contributing.md +0 -0
  115. {xpk-0.17.2 → xpk-1.0.0}/docs/permissions.md +0 -0
  116. {xpk-0.17.2 → xpk-1.0.0}/docs/testing.md +0 -0
  117. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/advanced.md +0 -0
  118. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/autoprovisioning.md +0 -0
  119. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/cpu.md +0 -0
  120. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/docker.md +0 -0
  121. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/gpu.md +0 -0
  122. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/inspector.md +0 -0
  123. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/storage.md +0 -0
  124. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/tpu7x/clusters.md +0 -0
  125. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +0 -0
  126. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +0 -0
  127. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +0 -0
  128. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/tpu7x/workloads.md +0 -0
  129. {xpk-0.17.2 → xpk-1.0.0}/docs/usage/workloads.md +0 -0
  130. {xpk-0.17.2 → xpk-1.0.0}/examples/fake_training.py +0 -0
  131. {xpk-0.17.2 → xpk-1.0.0}/examples/llama-3.1-finetuning/check_cuda.sh +0 -0
  132. {xpk-0.17.2 → xpk-1.0.0}/examples/llama-3.1-finetuning/requirements.txt +0 -0
  133. {xpk-0.17.2 → xpk-1.0.0}/examples/llama-3.1-finetuning/train.py +0 -0
  134. {xpk-0.17.2 → xpk-1.0.0}/examples/llama-3.1-finetuning/train.slurm +0 -0
  135. {xpk-0.17.2 → xpk-1.0.0}/examples/llama-3.1-finetuning/training_data.jsonl +0 -0
  136. {xpk-0.17.2 → xpk-1.0.0}/examples/nccl/nccl-a3mega.sh +0 -0
  137. {xpk-0.17.2 → xpk-1.0.0}/examples/nccl/nccl-a3ultra.sh +0 -0
  138. {xpk-0.17.2 → xpk-1.0.0}/examples/nccl/nccl.md +0 -0
  139. {xpk-0.17.2 → xpk-1.0.0}/examples/storage/filestore-manifest-attach.yaml +0 -0
  140. {xpk-0.17.2 → xpk-1.0.0}/examples/storage/gcsfuse-manifest.yaml +0 -0
  141. {xpk-0.17.2 → xpk-1.0.0}/examples/storage/lustre-manifest-attach.yaml +0 -0
  142. {xpk-0.17.2 → xpk-1.0.0}/examples/storage/parallelstore-manifest-attach.yaml +0 -0
  143. {xpk-0.17.2 → xpk-1.0.0}/examples/storage/pd-manifest-attach.yaml +0 -0
  144. {xpk-0.17.2 → xpk-1.0.0}/golden_buddy.sh +0 -0
  145. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_delete.txt +0 -0
  146. {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_delete_force.txt +0 -0
  147. {xpk-0.17.2 → xpk-1.0.0}/goldens/Storage_list.txt +0 -0
  148. {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_create.txt +0 -0
  149. {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_create_pathways.txt +0 -0
  150. {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_create_sub-slicing.txt +0 -0
  151. {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_create_with_output-manifest-file.txt +0 -0
  152. {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_delete.txt +0 -0
  153. {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_list.txt +0 -0
  154. {xpk-0.17.2 → xpk-1.0.0}/pylintrc +0 -0
  155. {xpk-0.17.2 → xpk-1.0.0}/pyproject.toml +0 -0
  156. {xpk-0.17.2 → xpk-1.0.0}/setup.cfg +0 -0
  157. {xpk-0.17.2 → xpk-1.0.0}/src/integration/README.md +0 -0
  158. {xpk-0.17.2 → xpk-1.0.0}/src/integration/__init__.py +0 -0
  159. {xpk-0.17.2 → xpk-1.0.0}/src/integration/docker_manager_test.py +0 -0
  160. {xpk-0.17.2 → xpk-1.0.0}/src/integration/gcluster_a3mega_test.py +0 -0
  161. {xpk-0.17.2 → xpk-1.0.0}/src/integration/gcluster_a3ultra_test.py +0 -0
  162. {xpk-0.17.2 → xpk-1.0.0}/src/integration/gcluster_a4_test.py +0 -0
  163. {xpk-0.17.2 → xpk-1.0.0}/src/integration/gcluster_test.py +0 -0
  164. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/__init__.py +0 -0
  165. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/api/__init__.py +0 -0
  166. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/api/storage_crd.yaml +0 -0
  167. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +0 -0
  168. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3mega/storage_crd.yaml +0 -0
  169. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +0 -0
  170. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +0 -0
  171. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/nccl-installer.yaml +0 -0
  172. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/storage_crd.yaml +0 -0
  173. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a4/config-map.yaml.tftpl +0 -0
  174. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +0 -0
  175. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a4/storage_crd.yaml +0 -0
  176. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/__init__.py +0 -0
  177. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/common.py +0 -0
  178. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/config.py +0 -0
  179. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/info.py +0 -0
  180. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/inspector.py +0 -0
  181. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/managed_ml_diagnostics.py +0 -0
  182. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/managed_ml_diagnostics_test.py +0 -0
  183. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/storage.py +0 -0
  184. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/version.py +0 -0
  185. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/__init__.py +0 -0
  186. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/__init__.py +0 -0
  187. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  188. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
  189. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/blueprint_test.py +0 -0
  190. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/testing/__init__.py +0 -0
  191. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a3_mega.yaml +0 -0
  192. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +0 -0
  193. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +0 -0
  194. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a4.yaml +0 -0
  195. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/capacity.py +0 -0
  196. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/capacity_test.py +0 -0
  197. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/cluster_private.py +0 -0
  198. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/cluster_test.py +0 -0
  199. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/commands.py +0 -0
  200. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/config_test.py +0 -0
  201. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/docker_container.py +0 -0
  202. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/docker_image.py +0 -0
  203. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/docker_manager.py +0 -0
  204. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/docker_resources.py +0 -0
  205. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/filestore.py +0 -0
  206. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/gcloud_context.py +0 -0
  207. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/gcloud_context_test.py +0 -0
  208. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/gcluster_manager.py +0 -0
  209. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/gcsfuse.py +0 -0
  210. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/jobset.py +0 -0
  211. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/kueue_manager.py +0 -0
  212. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/monitoring.py +0 -0
  213. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/mtc.py +0 -0
  214. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/nap.py +0 -0
  215. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/network.py +0 -0
  216. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/pathways.py +0 -0
  217. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/pathways_test.py +0 -0
  218. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/ray.py +0 -0
  219. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/remote_state/__init__.py +0 -0
  220. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  221. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  222. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/resources.py +0 -0
  223. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/storage.py +0 -0
  224. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/telemetry.py +0 -0
  225. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/telemetry_test.py +0 -0
  226. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/testing/__init__.py +0 -0
  227. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/testing/commands_tester.py +0 -0
  228. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/testing/commands_tester_test.py +0 -0
  229. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/updates.py +0 -0
  230. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/updates_test.py +0 -0
  231. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/vertex.py +0 -0
  232. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload.py +0 -0
  233. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
  234. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  235. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_test.py +0 -0
  236. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/main.py +0 -0
  237. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/__init__.py +0 -0
  238. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/cluster.py +0 -0
  239. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/cluster_test.py +0 -0
  240. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/common_test.py +0 -0
  241. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/config.py +0 -0
  242. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/info.py +0 -0
  243. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/inspector.py +0 -0
  244. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/storage_test.py +0 -0
  245. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/validators.py +0 -0
  246. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/version.py +0 -0
  247. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/workload.py +0 -0
  248. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/workload_test.py +0 -0
  249. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/telemetry_uploader.py +0 -0
  250. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/__init__.py +0 -0
  251. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +0 -0
  252. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
  253. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/filestore-pv.yaml +0 -0
  254. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/filestore-pvc.yaml +0 -0
  255. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/filestore-sc.yaml +0 -0
  256. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/fuse-pv.yaml +0 -0
  257. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/fuse-pvc.yaml +0 -0
  258. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/kueue_config.yaml.j2 +0 -0
  259. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
  260. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
  261. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/kueue_super_slicing_topology.yaml.j2 +0 -0
  262. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/mtc-cpc.yaml +0 -0
  263. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/storage.yaml +0 -0
  264. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/__init__.py +0 -0
  265. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/console.py +0 -0
  266. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/console_test.py +0 -0
  267. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/execution_context.py +0 -0
  268. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/file.py +0 -0
  269. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/gcs_utils.py +0 -0
  270. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/kubectl.py +0 -0
  271. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/kueue.py +0 -0
  272. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/network.py +0 -0
  273. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/objects.py +0 -0
  274. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/templates.py +0 -0
  275. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/topology.py +0 -0
  276. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/topology_test.py +0 -0
  277. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/user_agent.py +0 -0
  278. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/user_agent_test.py +0 -0
  279. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/user_input.py +0 -0
  280. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/user_input_test.py +0 -0
  281. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/validation_test.py +0 -0
  282. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/versions.py +0 -0
  283. {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/yaml.py +0 -0
  284. {xpk-0.17.2 → xpk-1.0.0}/src/xpk.egg-info/dependency_links.txt +0 -0
  285. {xpk-0.17.2 → xpk-1.0.0}/src/xpk.egg-info/entry_points.txt +0 -0
  286. {xpk-0.17.2 → xpk-1.0.0}/src/xpk.egg-info/requires.txt +0 -0
  287. {xpk-0.17.2 → xpk-1.0.0}/src/xpk.egg-info/top_level.txt +0 -0
  288. {xpk-0.17.2 → xpk-1.0.0}/tools/install-gke-auth-plugin.sh +0 -0
  289. {xpk-0.17.2 → xpk-1.0.0}/xpk-large-scale-guide.sh +0 -0
  290. {xpk-0.17.2 → xpk-1.0.0}/xpk-notebooks.md +0 -0
  291. {xpk-0.17.2 → xpk-1.0.0}/xpk.py +0 -0
@@ -44,7 +44,6 @@ runs:
      run: gcloud auth configure-docker --quiet
      shell: bash
    - uses: ./.github/actions/install-kueue
-    - uses: ./.github/actions/install-kjob
    - name: Install XPK
      run: pip install dist/xpk-*.whl
      shell: bash
@@ -49,14 +49,13 @@ jobs:
          lookup-only: true
      - name: install dependencies
        if : steps.check-cache.outputs.cache-hit != 'true'
-        run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
+        run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue
      - name: Cache dependencies
        if : steps.check-cache.outputs.cache-hit != 'true'
        uses: actions/cache/save@v3
        with:
          path: |
            /usr/local/bin/kubectl-kueue
-            /usr/local/bin/kubectl-kjob
            ~/.cache/pip
            ${{env.pythonLocation}}
          key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}
@@ -152,62 +152,6 @@ jobs:
        run: xpk info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
      - name: Delete the workload on the cluster
        run: xpk workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
-      - name: Create test script to execute in batch
-        run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
-      - name: Run a batch job on the cluster
-        run: xpk batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3
-      - name: List out the jobs on the cluster
-        run: xpk job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-'
-      - name: Get created job name
-        run: |
-          JOB_NAME=$(xpk job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | grep 'multislice-queue' | head -1 | awk '{print $1}')
-          echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
-      - name: Check job spec
-        run: |
-          job_spec=$(kubectl get job ${JOB_NAME} -o jsonpath='{.spec}')
-          echo "$job_spec" | grep '"completions":2'
-          echo "$job_spec" | grep '"parallelism":2'
-          echo "$job_spec" | jq '.template.spec.containers | length' | grep 3
-      - name: Get job info for the last job created on the cluster
-        run: xpk job info ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
-      - name: Cancel the batch job on the cluster
-        run: xpk job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted"
-      - name: Create shell and exit it immediately
-        run: |
-          cat <<EOF > create-shell.exp
-          #!/usr/bin/expect
-          set timeout 180
-          spawn sh -c "xpk shell --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee shell.log"
-          send "\n"
-          expect {
-            "/ # " {
-              send "exit\n"
-              # Wait for EOF after exit
-              expect eof
-              exit 0
-            }
-            timeout {
-              puts "Timed out waiting for pod to be running"
-              exit 1
-            }
-            eof {
-              puts "Unexpected EOF before getting prompt"
-              exit 1
-            }
-          }
-          EOF
-          chmod +x ./create-shell.exp
-          expect ./create-shell.exp
-      - name: Check if shell exists and is running
-        run: |
-          pod_name=$(grep 'waiting for pod' shell.log | awk -F'"' '{print $2}')
-          kubectl wait --for='jsonpath={.status.conditions[?(@.type=="Ready")].status}=True' --timeout=1m pod/${pod_name}
-      - name: Stop the shell
-        run: xpk shell stop --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
-      - name: Delete create-shell.exp file
-        run: rm create-shell.exp
-      - name: Delete shell.log file
-        run: rm shell.log
      - name: Delete the cluster created
        if: always()
        run: xpk cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force
@@ -47,14 +47,13 @@ jobs:
          lookup-only: true
      - name: install dependencies
        if: steps.check-cache.outputs.cache-hit != 'true'
-        run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
+        run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue
      - name: Cache dependencies
        if: steps.check-cache.outputs.cache-hit != 'true'
        uses: actions/cache/save@v3
        with:
          path: |
            /usr/local/bin/kubectl-kueue
-            /usr/local/bin/kubectl-kjob
            ~/.cache/pip
            ${{env.pythonLocation}}
          key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}
@@ -36,8 +36,8 @@ jobs:
        with:
          mode: minimum
          count: 1
-          labels: "release-improvements, release-bugfix, release-features"
-          message: "This PR is being prevented from merging because it is not labeled. Please add a label to this PR. Accepted labels: release-improvements, release-bugfix, release-features"
+          labels: "release-improvements, release-bugfix, release-features, release-breaking"
+          message: "This PR is being prevented from merging because it is not labeled. Please add a label to this PR. Accepted labels: release-improvements, release-bugfix, release-features, release-breaking"
      - id: do-not-merge
        uses: mheap/github-action-required-labels@v5
        with:
@@ -23,31 +23,29 @@ permissions:
   contents: read

 jobs:
-  build_kjob:
-    uses: ./.github/workflows/reusable_build_kjob.yaml
   build_wheel:
     uses: ./.github/workflows/reusable_build_wheel.yaml
   build_actions:
     uses: ./.github/workflows/reusable_build_scripts.yaml
   basic_cluster_create:
-    needs: [build_kjob, build_actions, build_wheel]
+    needs: [build_actions, build_wheel]
     uses: ./.github/workflows/integration_basic_cluster_create.yaml
     secrets: inherit

   pathways_cluster_create:
-    needs: [build_kjob, build_actions, build_wheel]
+    needs: [build_actions, build_wheel]
     uses: ./.github/workflows/integration_pathways_cluster_create.yaml
     secrets: inherit

   ray_cluster_create:
-    needs: [build_kjob, build_actions, build_wheel]
+    needs: [build_actions, build_wheel]
     uses: ./.github/workflows/integration_ray_cluster_create.yaml
     secrets: inherit
   legacy_integration:
-    needs: [build_kjob, build_actions, build_wheel]
+    needs: [build_actions, build_wheel]
     uses: ./.github/workflows/integration_legacy_tests.yaml
     secrets: inherit
   storage-tests:
-    needs: [build_kjob, build_actions, build_wheel]
+    needs: [build_actions, build_wheel]
     uses: ./.github/workflows/integration_storage_tests.yaml
     secrets: inherit
@@ -33,7 +33,6 @@ jobs:
        with:
          path: |
            /usr/local/bin/kubectl-kueue
-            /usr/local/bin/kubectl-kjob
            ~/.cache/pip
            ${{env.pythonLocation}}
          key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}}
@@ -42,7 +42,6 @@ jobs:
        with:
          path: |
            /usr/local/bin/kubectl-kueue
-            /usr/local/bin/kubectl-kjob
            ~/.cache/pip
            ${{env.pythonLocation}}
          key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}}
@@ -39,7 +39,6 @@ jobs:
        with:
          path: |
            /usr/local/bin/kubectl-kueue
-            /usr/local/bin/kubectl-kjob
            ~/.cache/pip
            ${{env.pythonLocation}}
          key: xpk-deps-${{matrix.python-version}}-${{github.run_id}}-${{github.run_attempt}}
@@ -92,8 +92,6 @@ jobs:
          --auto-mount=true --vol=vol1 --mount-point='/${{inputs.storage-type}}-test-mount-point' --readonly=false
      - name: List and verify existing Storages
        run: xpk storage list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | tee output.txt | grep ${{inputs.storage-name}} || (echo 'No storage found' && exit 143)
-      - name: Verify VolumeBundle created
-        run: kubectl get volumebundle ${{inputs.storage-name}} -o jsonpath='{.spec.containerVolumeMounts[0].mountPath}' | grep '/${{inputs.storage-type}}-test-mount-point'
      - name: Verify Persistent Volume mount options
        if: inputs.storage-command == 'attach' && inputs.storage-type == 'gcsfuse'
        run: kubectl get pv ${{inputs.storage-name}}-pv -oyaml | grep rename-dir-limit=10000 || (echo 'Invalid storage mount options' && exit 143)
@@ -114,45 +112,6 @@ jobs:
        run: xpk workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $STORAGE_READ_WORKLOAD --timeout 300
      - name: Delete the reader workload on the cluster
        run: xpk workload delete --workload $STORAGE_READ_WORKLOAD --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
-      - name: Create batch-read.sh script
-        run: |
-          cat <<EOF > batch-read.sh
-          #!/bin/bash
-          grep 'Test text message' /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)
-          EOF
-      - name: Run a batch-read job on the cluster
-        run: xpk batch --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} batch-read.sh | tee batch-read.log
-      - name: Get job name
-        run: |
-          cat batch-read.log | grep 'xpk-def-app-profile-slurm-'
-          READ_JOB_NAME=$(grep 'Job name: xpk-def-app-profile-slurm-' batch-read.log | awk -F': ' '{print $2}')
-          echo "READ_JOB_NAME=${READ_JOB_NAME}" >> $GITHUB_ENV
-      - name: Wait for the batch-read job to finish
-        run: kubectl wait job.batch/$READ_JOB_NAME --for=condition=Complete --timeout=1m
-      - name: Cancel the batch-read job
-        run: xpk job cancel $READ_JOB_NAME --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | grep "job.batch/$READ_JOB_NAME deleted"
-      - name: Delete batch-read.log file
-        run: rm batch-read.log
-      - name: Run a run-read job on the cluster
-        run: xpk run --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} batch-read.sh --timeout 60
-      - name: Delete batch-read.sh file
-        run: rm batch-read.sh
-      - name: Create shell and exit it immediately
-        run: |
-          cat <<EOF >> create-shell.exp
-          ##!/usr/bin/expect
-          spawn xpk shell --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
-          expect "/ # "
-          send "cat /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt\n"
-          expect "Test text message"
-          send "exit\n"
-          EOF
-          chmod +x ./create-shell.exp
-          expect ./create-shell.exp
-      - name: Stop the shell
-        run: xpk shell stop --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
-      - name: Delete create-shell.exp file
-        run: rm create-shell.exp
      - name: Run workload to delete file on filestore
        run : xpk workload create --workload $STORAGE_DELETE_WORKLOAD --command "rm -rf /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || exit 143" --num-slices=1 --cluster ${{inputs.cluster-name}} --device-type=${{inputs.device-type}} --zone ${{inputs.zone}}
      - name: Wait for delete workload completion and confirm it succeeded
@@ -61,9 +61,6 @@ jobs:
      - name: Detach storage volumes
        if: always()
        run: xpk storage detach ${{inputs.storage-name}} --cluster=${{inputs.cluster-name}} --zone=${{inputs.zone}}
-      - name: Verify VolumeBundle deleted
-        run: |
-          ! kubectl get volumebundle | grep ${{inputs.storage-name}}
      - name: Delete GCP Filestore Storage instance
        if: always() && inputs.storage-command == 'delete'
        run: xpk storage delete ${{inputs.storage-name}} --cluster=${{inputs.cluster-name}} --zone=${{inputs.zone}}
@@ -33,7 +33,6 @@ jobs:
        with:
          path: |
            /usr/local/bin/kubectl-kueue
-            /usr/local/bin/kubectl-kjob
            ~/.cache/pip
            ${{env.pythonLocation}}
          key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}}
@@ -2,25 +2,21 @@ KUEUE_REPO=https://github.com/kubernetes-sigs/kueue.git

 KUBECTL_VERSION := $(shell curl -L -s https://dl.k8s.io/release/stable.txt)
 KUEUE_VERSION=v0.14.3
-KJOB_VERSION=v0.1.0

 OS := $(shell uname -s | tr A-Z a-z)
 PLATFORM := $(shell uname -m | sed -e 's/aarch64/arm64/' | sed -e 's/x86_64/amd64/')

 KUBECTL_URL = "https://dl.k8s.io/release/$(KUBECTL_VERSION)/bin/$(OS)/$(PLATFORM)/kubectl"
 KUEUECTL_URL = "https://github.com/kubernetes-sigs/kueue/releases/download/$(KUEUE_VERSION)/kubectl-kueue-$(OS)-$(PLATFORM)"
-KJOBCTL_URL = "https://github.com/kubernetes-sigs/kjob/releases/download/$(KJOB_VERSION)/kubectl-kjob-$(OS)-$(PLATFORM)"

 PROJECT_DIR := $(realpath $(shell dirname $(firstword $(MAKEFILE_LIST))))
-KJOB_DOCKER_IMG := xpk_kjob
-KJOB_DOCKER_CONTAINER := xpk_kjob_container
 BIN_PATH=$(PROJECT_DIR)/bin

 .PHONY: install
-install: check-python check-gcloud install-gcloud-auth-plugin install-kueuectl install-kjobctl pip-install
+install: check-python check-gcloud install-gcloud-auth-plugin install-kueuectl pip-install

 .PHONY: install-dev
-install-dev: check-python check-gcloud mkdir-bin install-kueuectl install-kjobctl pip-install pip-install-dev install-pytest install-lint
+install-dev: check-python check-gcloud mkdir-bin install-kueuectl pip-install pip-install-dev install-pytest install-lint

 .PHONY: pip-install-dev
 pip-install-dev:
  pip-install-dev:
@@ -54,16 +50,6 @@ install-kueuectl: mkdir-bin
54
50
  curl -Lo $(BIN_PATH)/kubectl-kueue $(KUEUECTL_URL);
55
51
  chmod +x $(BIN_PATH)/kubectl-kueue;
56
52
 
57
- .PHONY: install-kjobctl
58
- install-kjobctl: mkdir-bin
59
- #curl -Lo $(BIN_PATH)/kubectl-kjob $(KJOBCTL_URL)
60
- #chmod +x $(BIN_PATH)/kubectl-kjob
61
- # TODO: Switch to kjob release-based installation once version >=0.2.0 is available.
62
- chmod +x tools/build-kjob.sh
63
- ./tools/build-kjob.sh
64
- mv kubectl-kjob $(BIN_PATH)/kubectl-kjob
65
- chmod +x $(BIN_PATH)/kubectl-kjob
66
-
67
53
  .PHONY: install-gcloud-auth-plugin
68
54
  install-gcloud-auth-plugin:
69
55
  chmod +x tools/install-gke-auth-plugin.sh
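Reading aid for the two Makefile hunks above: with the kjob steps removed, the install entry points chain only the kueuectl, gcloud and pip targets. A minimal usage sketch, assuming a checkout containing this Makefile:

```shell
# End-user install: runs check-python, check-gcloud,
# install-gcloud-auth-plugin, install-kueuectl and pip-install.
make install

# Contributor setup: swaps in mkdir-bin and adds pip-install-dev,
# install-pytest and install-lint.
make install-dev
```
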
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 0.17.2
+Version: 1.0.0
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0
@@ -114,10 +114,21 @@ XPK also supports the following [Google Cloud Storage solutions](./docs/usage/st
 * [Storage](./docs/usage/storage.md)
 * [Advanced](./docs/usage/advanced.md)
 * [Inspector](./docs/usage/inspector.md)
-* [Run](./docs/usage/run.md)
-* [Job](./docs/usage/job.md)
 * [Troubleshooting](./docs/troubleshooting.md)
-* [Local Testing](./docs/local_testing.md)
+
+# Privacy notice
+
+To help improve XPK, feature usage statistics are collected and sent to Google. You can opt-out at any time by executing
+the following shell command:
+
+```shell
+xpk config set send-telemetry <true/false>
+```
+
+XPK telemetry overall is handled in accordance with the [Google Privacy Policy](https://policies.google.com/privacy). When
+you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
+[Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
+

 # Contributing

@@ -73,10 +73,21 @@ XPK also supports the following [Google Cloud Storage solutions](./docs/usage/st
 * [Storage](./docs/usage/storage.md)
 * [Advanced](./docs/usage/advanced.md)
 * [Inspector](./docs/usage/inspector.md)
-* [Run](./docs/usage/run.md)
-* [Job](./docs/usage/job.md)
 * [Troubleshooting](./docs/troubleshooting.md)
-* [Local Testing](./docs/local_testing.md)
+
+# Privacy notice
+
+To help improve XPK, feature usage statistics are collected and sent to Google. You can opt-out at any time by executing
+the following shell command:
+
+```shell
+xpk config set send-telemetry <true/false>
+```
+
+XPK telemetry overall is handled in accordance with the [Google Privacy Policy](https://policies.google.com/privacy). When
+you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
+[Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
+

 # Contributing

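Reading aid for the privacy notice added above: the `<true/false>` placeholder is a literal boolean, so a concrete opt-out call (a sketch using only the `send-telemetry` key shown in the README) is:

```shell
# Disable the usage-statistics upload; pass "true" to re-enable it.
xpk config set send-telemetry false
```
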
@@ -44,7 +44,6 @@ Depending on your chosen installation method, you may need these additional tool
 | Install Method | Tool | Notes |
 | :--- | :--- | :--- |
 | **Pip** | **kueuectl** | [Installation instructions](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/) |
-| **Pip** | **kjob** | [Installation instructions](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md) |
 | **Source** | **git** | Install via your package manager (e.g., `sudo apt-get install git` on Debian/Ubuntu) |
 | **Source** | **make** | Install via your package manager (e.g., `sudo apt-get install make` on Debian/Ubuntu) |

@@ -38,7 +38,7 @@ Some XPK cluster configuration might be missing, if workload creation fails with

 `[XPK] b'error: the server doesn\'t have a resource type "workloads"\n'`

-Mitigate this error by re-running your `xpk.py cluster create ...` command, to refresh the cluster configurations.
+Mitigate this error by re-running your `xpk cluster create ...` command, to refresh the cluster configurations.

 ## Permission Issues: `requires one of ["permission_name"] permission(s)`.

@@ -254,6 +254,35 @@ xpk cluster create \

 will fail the cluster creation process because Vertex AI Tensorboard is not supported in `us-central2`.

+### Create Cluster With Google Cloud ML Diagnostics Enabled
+
+Google Cloud ML Diagnostics is an end-to-end managed platform for ML Engineers to optimize and diagnose their AI/ML workloads on Google Cloud. The product allows ML Engineers to collect and visualize all their workload metrics, configs and profiles with one single platform, all within the same UI. The current product offering focuses on workloads running on XLA-based frameworks (JAX, Pytorch XLA, Tensorflow/Keras) on Google Cloud TPUs and GPUs. Current support is for JAX on Google Cloud TPUs only.
+
+Enabling ML Diagnostics is streamlined and simplified through XPK cluster creation commands.
+
+By adding the **--managed-mldiagnostics** flag during the execution of either **xpk cluster create** or **xpk cluster create-pathways**, the ML Diagnostics functionality is enabled. This flag ensures the necessary supporting components (such as the injection-webhook and connection-operator) are automatically configured, allowing the feature to function seamlessly in both Pathways and non-Pathways execution environments.
+
+**Example Usage:**
+
+* Cluster Create for Pathways with flag **--managed-mldiagnostics**:
+
+```shell
+xpk cluster create-pathways \
+--cluster xpk-pw-test \
+--num-slices=4 --spot \
+--tpu-type=v5litepod-16 \
+--managed-mldiagnostics
+```
+
+* Cluster Create (provision spot / preemptable capacity) with flag **--managed-mldiagnostics**:
+
+```shell
+xpk cluster create \
+--cluster xpk-test --tpu-type=v5litepod-16 \
+--num-slices=4 --spot \
+--managed-mldiagnostics
+```
+
 ## Cluster Delete
 * Cluster Delete (deprovision capacity):

@@ -9,7 +9,7 @@ gcloud container get-server-config --project=golden-project --region=us-central1
 [XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
 gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
 [XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
-gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --location-policy=BALANCED --scopes=storage-full,gke-default
+gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --enable-ip-alias --enable-dataplane-v2 --enable-multi-networking --location-policy=BALANCED --scopes=storage-full,gke-default
 [XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
 gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
 [XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
@@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-
 [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
 gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
 [XPK] Creating 1 node pool or pools of tpu7x-8
-We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
+We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
 [XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
 gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
 [XPK] Creating 1 node pool or pools of tpu7x-8
-Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
+Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
 [XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
 gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
 [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
@@ -261,91 +261,6 @@ kubectl get node --no-headers | wc -l
 [XPK] Try 1: Updating Kueue Controller Manager resources
 [XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
-[XPK] Verifying kjob installation
-[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run.
-kubectl-kjob help
-[XPK] kjob found
-[XPK] Applying kjob CDRs
-[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run.
-kubectl kjob printcrds | kubectl apply --server-side -f -
-[XPK] Creating kjob CRDs succeeded
-[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
-kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
-[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content:
-
-apiVersion: kjobctl.x-k8s.io/v1alpha1
-kind: JobTemplate
-metadata:
-  name: xpk-def-batch
-  namespace: default
-template:
-  spec:
-    parallelism: 1
-    completions: 1
-    completionMode: Indexed
-    template:
-      spec:
-        dnsPolicy: ClusterFirstWithHostNet
-        tolerations:
-        - operator: "Exists"
-          key: nvidia.com/gpu
-        containers:
-        - name: xpk-batch-container
-          image: ubuntu:22.04
-          workingDir: /
-
-
-        priorityClassName: medium
-        restartPolicy: OnFailure
-        serviceAccountName:
-
-[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run.
-kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61
-[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content:
-
-apiVersion: v1
-kind: PodTemplate
-metadata:
-  name: xpk-def-pod
-  namespace: default
-template:
-  spec:
-    tolerations:
-    - effect: NoSchedule
-      key: components.gke.io/gke-managed-components
-      operator: Equal
-      value: "true"
-    containers:
-    - name: xpk-interactive-container
-      image: busybox:1.28
-      command: [/bin/sh]
-      workingDir: /
-    initContainers:
-    - name: init
-      image: busybox:1.28
-      command: ['/bin/mkdir', '-p', '/']
-    serviceAccountName:
-
-[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run.
-kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8
-[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content:
-
-apiVersion: kjobctl.x-k8s.io/v1alpha1
-kind: ApplicationProfile
-metadata:
-  name: xpk-def-app-profile
-  namespace: default
-spec:
-  supportedModes:
-  - name: Slurm
-    template: xpk-def-batch
-    requiredFlags: []
-  - name: Interactive
-    template: xpk-def-pod
-  volumeBundles: []
-
-[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run.
-kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
 [XPK] Exiting XPK cleanly