xpk 1.1.0__tar.gz → 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. {xpk-1.1.0/src/xpk.egg-info → xpk-1.1.1}/PKG-INFO +1 -1
  2. {xpk-1.1.0 → xpk-1.1.1}/recipes/Workload_create_super-slicing.md +4 -45
  3. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/workload.py +17 -13
  4. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/docker_container.py +30 -12
  5. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/pathways.py +4 -8
  6. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/scheduling_test.py +14 -6
  7. {xpk-1.1.0 → xpk-1.1.1/src/xpk.egg-info}/PKG-INFO +1 -1
  8. {xpk-1.1.0 → xpk-1.1.1}/.dockerignore +0 -0
  9. {xpk-1.1.0 → xpk-1.1.1}/.github/CODEOWNERS +0 -0
  10. {xpk-1.1.0 → xpk-1.1.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  11. {xpk-1.1.0 → xpk-1.1.1}/.github/actions/install-kueue/action.yml +0 -0
  12. {xpk-1.1.0 → xpk-1.1.1}/.github/actions/setup-test-env/action.yml +0 -0
  13. {xpk-1.1.0 → xpk-1.1.1}/.github/release.yaml +0 -0
  14. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/README.md +0 -0
  15. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/build_tests.yaml +0 -0
  16. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/build_wheels.yaml +0 -0
  17. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/cleanup.yaml +0 -0
  18. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/gemini-dispatch.yml +0 -0
  19. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/gemini-invoke.yml +0 -0
  20. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/gemini-review.yml +0 -0
  21. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/gemini-scheduled-triage.yml +0 -0
  22. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/gemini-triage.yml +0 -0
  23. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/integration_basic_cluster_create.yaml +0 -0
  24. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/integration_gpu_cluster_create.yaml +0 -0
  25. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/integration_pathways_cluster_create.yaml +0 -0
  26. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/integration_ray_cluster_create.yaml +0 -0
  27. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/integration_storage_tests.yaml +0 -0
  28. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/label-validation.yaml +0 -0
  29. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/nightly_tests.yaml +0 -0
  30. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/periodic_release.yaml +0 -0
  31. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/release_branch_versioning.yaml +0 -0
  32. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/reusable_build_scripts.yaml +0 -0
  33. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/reusable_build_wheel.yaml +0 -0
  34. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/reusable_goldens.yaml +0 -0
  35. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/reusable_lint_and_format.yml +0 -0
  36. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/reusable_storage_create.yaml +0 -0
  37. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/reusable_storage_delete.yaml +0 -0
  38. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/reusable_unit_tests.yaml +0 -0
  39. {xpk-1.1.0 → xpk-1.1.1}/.github/workflows/stale.yaml +0 -0
  40. {xpk-1.1.0 → xpk-1.1.1}/.gitignore +0 -0
  41. {xpk-1.1.0 → xpk-1.1.1}/.pre-commit-config.yaml +0 -0
  42. {xpk-1.1.0 → xpk-1.1.1}/LICENSE +0 -0
  43. {xpk-1.1.0 → xpk-1.1.1}/Makefile +0 -0
  44. {xpk-1.1.0 → xpk-1.1.1}/README.md +0 -0
  45. {xpk-1.1.0 → xpk-1.1.1}/backoff_retry.sh +0 -0
  46. {xpk-1.1.0 → xpk-1.1.1}/data/Dockerfile +0 -0
  47. {xpk-1.1.0 → xpk-1.1.1}/docs/code-of-conduct.md +0 -0
  48. {xpk-1.1.0 → xpk-1.1.1}/docs/contributing.md +0 -0
  49. {xpk-1.1.0 → xpk-1.1.1}/docs/installation.md +0 -0
  50. {xpk-1.1.0 → xpk-1.1.1}/docs/permissions.md +0 -0
  51. {xpk-1.1.0 → xpk-1.1.1}/docs/testing.md +0 -0
  52. {xpk-1.1.0 → xpk-1.1.1}/docs/troubleshooting.md +0 -0
  53. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/advanced.md +0 -0
  54. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/autoprovisioning.md +0 -0
  55. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/clusters.md +0 -0
  56. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/cpu.md +0 -0
  57. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/docker.md +0 -0
  58. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/gpu.md +0 -0
  59. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/inspector.md +0 -0
  60. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/storage.md +0 -0
  61. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +0 -0
  62. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +0 -0
  63. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +0 -0
  64. {xpk-1.1.0 → xpk-1.1.1}/docs/usage/workloads.md +0 -0
  65. {xpk-1.1.0 → xpk-1.1.1}/examples/fake_training.py +0 -0
  66. {xpk-1.1.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/check_cuda.sh +0 -0
  67. {xpk-1.1.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/requirements.txt +0 -0
  68. {xpk-1.1.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/train.py +0 -0
  69. {xpk-1.1.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/train.slurm +0 -0
  70. {xpk-1.1.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/training_data.jsonl +0 -0
  71. {xpk-1.1.0 → xpk-1.1.1}/examples/nccl/nccl-a3mega.sh +0 -0
  72. {xpk-1.1.0 → xpk-1.1.1}/examples/nccl/nccl-a3ultra.sh +0 -0
  73. {xpk-1.1.0 → xpk-1.1.1}/examples/nccl/nccl.md +0 -0
  74. {xpk-1.1.0 → xpk-1.1.1}/examples/storage/filestore-manifest-attach.yaml +0 -0
  75. {xpk-1.1.0 → xpk-1.1.1}/examples/storage/gcsfuse-manifest.yaml +0 -0
  76. {xpk-1.1.0 → xpk-1.1.1}/examples/storage/lustre-manifest-attach.yaml +0 -0
  77. {xpk-1.1.0 → xpk-1.1.1}/examples/storage/parallelstore-manifest-attach.yaml +0 -0
  78. {xpk-1.1.0 → xpk-1.1.1}/examples/storage/pd-manifest-attach.yaml +0 -0
  79. {xpk-1.1.0 → xpk-1.1.1}/pylintrc +0 -0
  80. {xpk-1.1.0 → xpk-1.1.1}/pyproject.toml +0 -0
  81. {xpk-1.1.0 → xpk-1.1.1}/recipes/Basic_cluster_adapt.md +0 -0
  82. {xpk-1.1.0 → xpk-1.1.1}/recipes/Basic_cluster_create.md +0 -0
  83. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_RayCluster.md +0 -0
  84. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_for_multi-host_nodepool.md +0 -0
  85. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_for_single-host_nodepool.md +0 -0
  86. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_private.md +0 -0
  87. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_sub-slicing.md +0 -0
  88. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_super-slicing.md +0 -0
  89. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_with_CPU_and_memory_limits_above_capacity.md +0 -0
  90. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_with_CPU_and_memory_limits_below_capacity.md +0 -0
  91. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_with_Managed_Lustre_driver.md +0 -0
  92. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.md +0 -0
  93. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_with_gb200-4.md +0 -0
  94. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_create_with_shared_reservation.md +0 -0
  95. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_delete.md +0 -0
  96. {xpk-1.1.0 → xpk-1.1.1}/recipes/Cluster_delete_force.md +0 -0
  97. {xpk-1.1.0 → xpk-1.1.1}/recipes/NAP_cluster-create.md +0 -0
  98. {xpk-1.1.0 → xpk-1.1.1}/recipes/NAP_cluster-create_with_pathways.md +0 -0
  99. {xpk-1.1.0 → xpk-1.1.1}/recipes/Storage_list.md +0 -0
  100. {xpk-1.1.0 → xpk-1.1.1}/recipes/Workload_create.md +0 -0
  101. {xpk-1.1.0 → xpk-1.1.1}/recipes/Workload_create_pathways.md +0 -0
  102. {xpk-1.1.0 → xpk-1.1.1}/recipes/Workload_create_sub-slicing.md +0 -0
  103. {xpk-1.1.0 → xpk-1.1.1}/recipes/Workload_create_with_output-manifest-file.md +0 -0
  104. {xpk-1.1.0 → xpk-1.1.1}/recipes/Workload_delete.md +0 -0
  105. {xpk-1.1.0 → xpk-1.1.1}/recipes/Workload_list.md +0 -0
  106. {xpk-1.1.0 → xpk-1.1.1}/recipes/comprehensive-demo.md +0 -0
  107. {xpk-1.1.0 → xpk-1.1.1}/setup.cfg +0 -0
  108. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/__init__.py +0 -0
  109. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/api/__init__.py +0 -0
  110. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/api/storage_crd.yaml +0 -0
  111. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +0 -0
  112. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/blueprints/a3mega/storage_crd.yaml +0 -0
  113. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +0 -0
  114. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +0 -0
  115. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/nccl-installer.yaml +0 -0
  116. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/storage_crd.yaml +0 -0
  117. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/blueprints/a4/config-map.yaml.tftpl +0 -0
  118. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +0 -0
  119. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/blueprints/a4/storage_crd.yaml +0 -0
  120. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/__init__.py +0 -0
  121. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/cluster.py +0 -0
  122. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/cluster_gcluster.py +0 -0
  123. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/cluster_gcluster_test.py +0 -0
  124. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/cluster_test.py +0 -0
  125. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/common.py +0 -0
  126. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/common_test.py +0 -0
  127. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/config.py +0 -0
  128. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/info.py +0 -0
  129. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/inspector.py +0 -0
  130. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/inspector_test.py +0 -0
  131. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/managed_ml_diagnostics.py +0 -0
  132. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/managed_ml_diagnostics_test.py +0 -0
  133. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/storage.py +0 -0
  134. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/version.py +0 -0
  135. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/commands/workload_test.py +0 -0
  136. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/__init__.py +0 -0
  137. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/blueprint/__init__.py +0 -0
  138. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
  139. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
  140. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/blueprint/blueprint_test.py +0 -0
  141. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/__init__.py +0 -0
  142. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a3_mega.yaml +0 -0
  143. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +0 -0
  144. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +0 -0
  145. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a4.yaml +0 -0
  146. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/capacity.py +0 -0
  147. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/capacity_test.py +0 -0
  148. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/cluster.py +0 -0
  149. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/cluster_private.py +0 -0
  150. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/cluster_test.py +0 -0
  151. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/commands.py +0 -0
  152. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/config.py +0 -0
  153. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/config_test.py +0 -0
  154. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/docker_image.py +0 -0
  155. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/docker_manager.py +0 -0
  156. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/docker_resources.py +0 -0
  157. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/filestore.py +0 -0
  158. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/gcloud_context.py +0 -0
  159. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/gcloud_context_test.py +0 -0
  160. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/gcluster_manager.py +0 -0
  161. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/gcsfuse.py +0 -0
  162. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/jobset.py +0 -0
  163. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/kueue_manager.py +0 -0
  164. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/kueue_manager_test.py +0 -0
  165. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/monitoring.py +0 -0
  166. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/mtc.py +0 -0
  167. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/nap.py +0 -0
  168. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/network.py +0 -0
  169. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/nodepool.py +0 -0
  170. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/nodepool_test.py +0 -0
  171. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/pathways_test.py +0 -0
  172. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/ray.py +0 -0
  173. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/remote_state/__init__.py +0 -0
  174. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
  175. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/remote_state/remote_state_client.py +0 -0
  176. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/resources.py +0 -0
  177. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/scheduling.py +12 -12
  178. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/storage.py +0 -0
  179. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/system_characteristics.py +0 -0
  180. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/system_characteristics_test.py +0 -0
  181. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/telemetry.py +0 -0
  182. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/telemetry_test.py +0 -0
  183. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/testing/__init__.py +0 -0
  184. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/testing/commands_tester.py +0 -0
  185. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/testing/commands_tester_test.py +0 -0
  186. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/updates.py +0 -0
  187. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/updates_test.py +0 -0
  188. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/vertex.py +0 -0
  189. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/workload.py +0 -0
  190. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/__init__.py +0 -0
  191. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
  192. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
  193. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
  194. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
  195. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
  196. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/core/workload_test.py +0 -0
  197. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/main.py +0 -0
  198. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/__init__.py +0 -0
  199. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/cluster.py +0 -0
  200. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/cluster_test.py +0 -0
  201. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/common.py +0 -0
  202. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/common_test.py +0 -0
  203. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/config.py +0 -0
  204. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/core.py +0 -0
  205. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/info.py +0 -0
  206. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/inspector.py +0 -0
  207. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/storage.py +0 -0
  208. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/storage_test.py +0 -0
  209. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/validators.py +0 -0
  210. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/version.py +0 -0
  211. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/workload.py +0 -0
  212. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/parser/workload_test.py +0 -0
  213. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/telemetry_uploader.py +0 -0
  214. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/__init__.py +0 -0
  215. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +0 -0
  216. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
  217. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/filestore-pv.yaml +0 -0
  218. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/filestore-pvc.yaml +0 -0
  219. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/filestore-sc.yaml +0 -0
  220. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/fuse-pv.yaml +0 -0
  221. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/fuse-pvc.yaml +0 -0
  222. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/kueue_config.yaml.j2 +0 -0
  223. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
  224. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
  225. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/kueue_super_slicing_topology.yaml.j2 +0 -0
  226. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/mtc-cpc.yaml +0 -0
  227. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/templates/storage.yaml +0 -0
  228. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/__init__.py +0 -0
  229. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/console.py +0 -0
  230. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/console_test.py +0 -0
  231. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/execution_context.py +0 -0
  232. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/feature_flags.py +0 -0
  233. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/file.py +0 -0
  234. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/gcs_utils.py +0 -0
  235. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/kubectl.py +0 -0
  236. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/kueue.py +0 -0
  237. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/network.py +0 -0
  238. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/objects.py +0 -0
  239. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/templates.py +0 -0
  240. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/topology.py +0 -0
  241. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/topology_test.py +0 -0
  242. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/user_agent.py +0 -0
  243. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/user_agent_test.py +0 -0
  244. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/validation.py +0 -0
  245. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/validation_test.py +0 -0
  246. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/versions.py +0 -0
  247. {xpk-1.1.0 → xpk-1.1.1}/src/xpk/utils/yaml.py +0 -0
  248. {xpk-1.1.0 → xpk-1.1.1}/src/xpk.egg-info/SOURCES.txt +0 -0
  249. {xpk-1.1.0 → xpk-1.1.1}/src/xpk.egg-info/dependency_links.txt +0 -0
  250. {xpk-1.1.0 → xpk-1.1.1}/src/xpk.egg-info/entry_points.txt +0 -0
  251. {xpk-1.1.0 → xpk-1.1.1}/src/xpk.egg-info/requires.txt +0 -0
  252. {xpk-1.1.0 → xpk-1.1.1}/src/xpk.egg-info/top_level.txt +0 -0
  253. {xpk-1.1.0 → xpk-1.1.1}/tools/install-gke-auth-plugin.sh +0 -0
  254. {xpk-1.1.0 → xpk-1.1.1}/tools/install-xpk.sh +0 -0
  255. {xpk-1.1.0 → xpk-1.1.1}/tools/recipes.py +0 -0
  256. {xpk-1.1.0 → xpk-1.1.1}/xpk-large-scale-guide.sh +0 -0
  257. {xpk-1.1.0 → xpk-1.1.1}/xpk-notebooks.md +0 -0
  258. {xpk-1.1.0 → xpk-1.1.1}/xpk.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 1.1.0
3
+ Version: 1.1.1
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -47,7 +47,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94
47
47
  docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
48
48
  [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
49
49
  docker push gcr.io/golden-project/dry-run-runner:prefix-current
50
- [XPK] Temp file (608e1382aabe2b0335855e5e99876a2e67de954453ebfa4cf12eb82c966f85da) content:
50
+ [XPK] Temp file (2eed164577b237a7b764f6adf46b9c249551c2200e440ef0f696c53dfb2d1cb5) content:
51
51
  apiVersion: jobset.x-k8s.io/v1alpha2
52
52
  kind: JobSet
53
53
  metadata:
@@ -80,13 +80,7 @@ spec:
80
80
  - action: FailJob
81
81
  onPodConditions: []
82
82
  onExitCodes:
83
- containerName: jax-tpu-1
84
- operator: NotIn
85
- values: [42,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255]
86
- - action: FailJob
87
- onPodConditions: []
88
- onExitCodes:
89
- containerName: jax-tpu-2
83
+ containerName: jax-tpu
90
84
  operator: NotIn
91
85
  values: [42,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255]
92
86
  template:
@@ -114,42 +108,7 @@ spec:
114
108
  terminationGracePeriodSeconds: 30
115
109
  containers:
116
110
 
117
- - name: jax-tpu-1
118
- image: gcr.io/golden-project/dry-run-runner:prefix-current
119
-
120
- env:
121
- securityContext:
122
- privileged: true
123
- command:
124
- - bash
125
- - -c
126
- - |
127
- echo XPK Start: $(date);
128
- _sigterm() (kill -SIGTERM $! 2>/dev/null;);
129
- trap _sigterm SIGTERM;
130
-
131
- (bash hello) & PID=$!;
132
- while kill -0 $PID 2>/dev/null;
133
- do sleep 5;
134
- done;
135
- wait $PID;
136
- EXIT_CODE=$?;
137
-
138
- echo XPK End: $(date);
139
- echo EXIT_CODE=$EXIT_CODE;
140
-
141
-
142
- exit $EXIT_CODE
143
- resources:
144
- limits:
145
- google.com/tpu: 2
146
-
147
- volumeMounts:
148
- - mountPath: /dev/shm
149
- name: dshm-2
150
-
151
-
152
- - name: jax-tpu-2
111
+ - name: jax-tpu
153
112
  image: gcr.io/golden-project/dry-run-runner:prefix-current
154
113
 
155
114
  env:
@@ -197,7 +156,7 @@ spec:
197
156
 
198
157
 
199
158
  [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
200
- kubectl apply -f 608e1382aabe2b0335855e5e99876a2e67de954453ebfa4cf12eb82c966f85da
159
+ kubectl apply -f 2eed164577b237a7b764f6adf46b9c249551c2200e440ef0f696c53dfb2d1cb5
201
160
  [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
202
161
  gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
203
162
  [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.
@@ -481,6 +481,16 @@ def workload_create(args) -> None:
481
481
  + lustre_storages
482
482
  )
483
483
 
484
+ use_sub_slicing = (
485
+ workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
486
+ )
487
+ use_super_slicing = (
488
+ workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
489
+ )
490
+ parallel_containers = workload_system.parallel_containers
491
+ if args.use_pathways or use_super_slicing:
492
+ parallel_containers = 1
493
+
484
494
  # Currently failure policy rules are supported for Pathways workloads. b/408465881
485
495
  failure_policy_rules = ''
486
496
  pod_failure_policy = ''
@@ -497,10 +507,8 @@ def workload_create(args) -> None:
497
507
  rules:
498
508
  """
499
509
  docker_image = get_main_container_docker_image(args, workload_system)
500
- for i in range(workload_system.parallel_containers):
501
- docker_image_sufix = (
502
- f'-{i + 1}' if workload_system.parallel_containers > 1 else ''
503
- )
510
+ for i in range(parallel_containers):
511
+ docker_image_sufix = f'-{i + 1}' if parallel_containers > 1 else ''
504
512
  pod_failure_policy += f"""
505
513
  - action: FailJob
506
514
  onPodConditions: []
@@ -533,7 +541,7 @@ def workload_create(args) -> None:
533
541
  # Create the workload file based on accelerator type or workload type.
534
542
  if workload_system.accelerator_type == AcceleratorType.GPU:
535
543
  container, debugging_dashboard_id = get_user_workload_container(
536
- args, workload_system
544
+ args, workload_system, parallel_containers=parallel_containers
537
545
  )
538
546
  gpu_scheduler, return_code = get_gpu_scheduler(
539
547
  args, workload_system, autoprovisioning_args
@@ -624,25 +632,21 @@ def workload_create(args) -> None:
624
632
  custom_pathways_server=append_custom_pathways_server(args),
625
633
  custom_pathways_worker=append_custom_pathways_worker(args),
626
634
  colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
627
- user_workload=get_user_workload_for_pathways(args, workload_system),
635
+ user_workload=get_user_workload_for_pathways(
636
+ args, workload_system, parallel_containers
637
+ ),
628
638
  local_queue_name=LOCAL_QUEUE_NAME,
629
639
  autoprovisioning_args=autoprovisioning_args,
630
640
  placement_policy_label=placement_policy_label,
631
641
  )
632
642
  else:
633
- use_sub_slicing = (
634
- workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
635
- )
636
- use_super_slicing = (
637
- workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
638
- )
639
643
  if use_sub_slicing:
640
644
  xpk_print('Workload will be scheduled using the Sub-slicing feature.')
641
645
  if use_super_slicing:
642
646
  xpk_print('Workload will be scheduled using the Super-slicing feature.')
643
647
 
644
648
  container, debugging_dashboard_id = get_user_workload_container(
645
- args, workload_system
649
+ args, workload_system, parallel_containers
646
650
  )
647
651
 
648
652
  machine_label = (
@@ -30,12 +30,18 @@ from .system_characteristics import (
30
30
  )
31
31
 
32
32
 
33
- def get_main_and_sidecar_container(args, system, docker_image) -> str:
33
+ def get_main_and_sidecar_container(
34
+ args,
35
+ system: SystemCharacteristics,
36
+ docker_image: str,
37
+ parallel_containers: int,
38
+ ) -> str:
34
39
  """Generate yaml for main and sidecar container.
35
40
  Args:
36
41
  args: user provided arguments for running the command.
37
42
  system: system characteristics
38
43
  docker_image: docker image
44
+ parallel_containers: number of containers to run per VM.
39
45
 
40
46
  Returns:
41
47
  str:
@@ -44,7 +50,9 @@ def get_main_and_sidecar_container(args, system, docker_image) -> str:
44
50
  resource_type = AcceleratorTypeToAcceleratorCharacteristics[
45
51
  system.accelerator_type
46
52
  ].resource_type
47
- main_container = get_main_container(args, system, docker_image, resource_type)
53
+ main_container = get_main_container(
54
+ args, system, docker_image, resource_type, parallel_containers
55
+ )
48
56
  yaml = """- name: stacktrace-explorer
49
57
  image: busybox:1.28
50
58
  args: [/bin/sh, -c, "check_signal() (while [ ! -f /shared-volume/stacktrace_signal ]; do sleep 1; done; pid=$(pidof 'tail'); kill $pid;); check_signal & while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*; exit 0;"]
@@ -59,13 +67,20 @@ def get_main_and_sidecar_container(args, system, docker_image) -> str:
59
67
  return yaml.format(main_container=main_container)
60
68
 
61
69
 
62
- def get_main_container(args, system, docker_image, resource_type) -> str:
70
+ def get_main_container(
71
+ args,
72
+ system: SystemCharacteristics,
73
+ docker_image: str,
74
+ resource_type,
75
+ parallel_containers: int,
76
+ ) -> str:
63
77
  """Generate yaml for main container including the xpk command.
64
78
  Args:
65
79
  args: user provided arguments for running the command.
66
80
  system: system characteristics
67
81
  docker_image: docker image
68
82
  resource_type: The label to describe the resource type for TPUs/GPUs/CPUs.
83
+ parallel_containers: number of containers to run per VM.
69
84
 
70
85
  Returns:
71
86
  str:
@@ -149,14 +164,10 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
149
164
  volumeMounts:
150
165
  {volume_mounts}
151
166
  """
152
- # pathways job running on 2 parallel containers is not verified yet
153
- if args.use_pathways:
154
- system.parallel_containers = 1
155
-
156
167
  env = get_env_container(args, system)
157
168
  image_pull_policy = add_image_pull_policy_for_pw_or_gpu(args, system)
158
- for i in range(system.parallel_containers):
159
- docker_name_sufix = f'-{i + 1}' if system.parallel_containers > 1 else ''
169
+ for i in range(parallel_containers):
170
+ docker_name_sufix = f'-{i + 1}' if parallel_containers > 1 else ''
160
171
  containers.append(
161
172
  container_yaml.format(
162
173
  args=args,
@@ -177,12 +188,15 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
177
188
  return ''.join(containers)
178
189
 
179
190
 
180
- def get_user_workload_container(args, system: SystemCharacteristics):
191
+ def get_user_workload_container(
192
+ args, system: SystemCharacteristics, parallel_containers: int
193
+ ):
181
194
  """Deploy user workload container
182
195
 
183
196
  Args:
184
197
  args: user provided args.
185
198
  system: system characteristics.
199
+ parallel_containers: number of containers to run per VM.
186
200
 
187
201
  Returns:
188
202
  container: main container
@@ -209,11 +223,15 @@ def get_user_workload_container(args, system: SystemCharacteristics):
209
223
  'Sidecar container to display stack traces for TPU workloads will also'
210
224
  ' be deployed.'
211
225
  )
212
- container = get_main_and_sidecar_container(args, system, docker_image)
226
+ container = get_main_and_sidecar_container(
227
+ args, system, docker_image, parallel_containers
228
+ )
213
229
  # Get GKE debugging dashboard only when sidecar container is deployed for TPU workloads
214
230
  debugging_dashboard_id = get_gke_debugging_dashboard(args)
215
231
  else:
216
- container = get_main_container(args, system, docker_image, resource_type)
232
+ container = get_main_container(
233
+ args, system, docker_image, resource_type, parallel_containers
234
+ )
217
235
  return container, debugging_dashboard_id
218
236
 
219
237
 
@@ -245,18 +245,12 @@ def append_custom_colocated_python_sidecar(args) -> str:
245
245
 
246
246
 
247
247
  def get_user_workload_for_pathways(
248
- args,
249
- system: SystemCharacteristics,
248
+ args, system: SystemCharacteristics, parallel_containers: int
250
249
  ) -> str:
251
250
  """
252
251
  Create a user workload container for Pathways.
253
252
  Don't create one for Pathways headless mode.
254
253
 
255
- Args:
256
- args: user provided args.
257
- system: system characteristics.
258
-
259
-
260
254
  Returns:
261
255
  str:
262
256
  Pathways server port as a YAML string
@@ -280,7 +274,9 @@ def get_user_workload_for_pathways(
280
274
  if args.headless:
281
275
  return ''
282
276
  else:
283
- container, _ = get_user_workload_container(args, system)
277
+ container, _ = get_user_workload_container(
278
+ args, system, parallel_containers
279
+ )
284
280
  return user_workload_yaml.format(
285
281
  args=args,
286
282
  container=container,
@@ -398,15 +398,23 @@ SUPER_SLICING_CASE = SchedulingTestCase(
398
398
  WorkloadScheduling.UNAVAILABLE,
399
399
  ),
400
400
  (
401
- (
402
- 'Super-slicing should be ignored when a given device is already'
403
- ' present in the cluster'
401
+ 'Super-slicing, but one cube',
402
+ dataclasses.replace(
403
+ SUPER_SLICING_CASE,
404
+ workload_system=_get_system_characteristics_or_die('tpu7x-128'),
405
+ cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
406
+ resources_config_map={'tpu7x-128': '16'},
404
407
  ),
408
+ WorkloadScheduling.SUPER_SLICING_AVAILABLE,
409
+ ),
410
+ (
411
+ 'Super-slicing, but one cube and no super-slicing-topology',
405
412
  dataclasses.replace(
406
413
  SUPER_SLICING_CASE,
407
- workload_system=_get_system_characteristics_or_die('tpu7x-64'),
408
- cluster_system=_get_system_characteristics_or_die('tpu7x-64'),
409
- resources_config_map={'tpu7x-64': '16'},
414
+ workload_system=_get_system_characteristics_or_die('tpu7x-128'),
415
+ cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
416
+ resources_config_map={'tpu7x-128': '16'},
417
+ super_slicing_topology_set=False,
410
418
  ),
411
419
  WorkloadScheduling.AVAILABLE,
412
420
  ),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 1.1.0
3
+ Version: 1.1.1
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes