xpk 0.16.1__tar.gz → 0.17.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.16.1 → xpk-0.17.0}/.github/release.yaml +4 -2
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/build_wheels.yaml +20 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/integration_basic_cluster_create.yaml +2 -2
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/integration_storage_tests.yaml +4 -4
- xpk-0.17.0/.github/workflows/periodic_release.yaml +133 -0
- xpk-0.17.0/.github/workflows/release_branch_versioning.yaml +103 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/reusable_build_wheel.yaml +1 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/reusable_storage_create.yaml +6 -6
- {xpk-0.16.1/src/xpk.egg-info → xpk-0.17.0}/PKG-INFO +2 -1
- {xpk-0.16.1 → xpk-0.17.0}/docs/installation.md +3 -1
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Basic_cluster_create.txt +4 -18
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_create_for_multi-host_nodepool.txt +4 -18
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_create_private.txt +4 -20
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_create_sub-slicing.txt +4 -19
- xpk-0.17.0/goldens/Cluster_create_super-slicing.txt +384 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt +4 -18
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt +4 -18
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_create_with_Managed_Lustre_driver.txt +4 -18
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt +4 -18
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_create_with_gb200-4.txt +4 -19
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_create_with_shared_reservation.txt +4 -18
- {xpk-0.16.1 → xpk-0.17.0}/goldens/NAP_cluster-create.txt +4 -18
- {xpk-0.16.1 → xpk-0.17.0}/goldens/NAP_cluster-create_with_pathways.txt +4 -20
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Workload_create.txt +3 -3
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Workload_create_sub-slicing.txt +3 -3
- xpk-0.17.0/goldens/Workload_create_super-slicing.txt +160 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Workload_create_with_output-manifest-file.txt +3 -3
- {xpk-0.16.1 → xpk-0.17.0}/goldens.yaml +4 -0
- {xpk-0.16.1 → xpk-0.17.0}/pyproject.toml +5 -1
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/cluster.py +48 -5
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/cluster_gcluster.py +3 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/cluster_gcluster_test.py +2 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/cluster_test.py +203 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/common.py +6 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/kind.py +2 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/workload.py +35 -16
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/workload_test.py +1 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/capacity.py +83 -46
- xpk-0.17.0/src/xpk/core/capacity_test.py +135 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/commands.py +39 -12
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/kueue_manager.py +42 -11
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/kueue_manager_test.py +83 -3
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/nap.py +5 -4
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/nodepool.py +57 -20
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/nodepool_test.py +152 -23
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/pathways.py +2 -1
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/resources.py +3 -3
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/scheduling.py +54 -10
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/scheduling_test.py +118 -13
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/system_characteristics.py +41 -24
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/system_characteristics_test.py +37 -4
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/telemetry.py +5 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/telemetry_test.py +19 -2
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/updates.py +1 -1
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/main.py +2 -1
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/cluster.py +34 -2
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/cluster_test.py +117 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/common.py +32 -0
- xpk-0.17.0/src/xpk/parser/common_test.py +49 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/kueue_config.yaml.j2 +21 -5
- xpk-0.17.0/src/xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/kueue.py +6 -2
- {xpk-0.16.1 → xpk-0.17.0/src/xpk.egg-info}/PKG-INFO +2 -1
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk.egg-info/SOURCES.txt +6 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk.egg-info/requires.txt +1 -0
- {xpk-0.16.1 → xpk-0.17.0}/xpk.py +22 -1
- xpk-0.16.1/src/xpk/core/capacity_test.py +0 -81
- {xpk-0.16.1 → xpk-0.17.0}/.dockerignore +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/CODEOWNERS +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/actions/install-kjob/action.yml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/actions/install-kueue/action.yml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/actions/setup-test-env/action.yml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/README.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/build_tests.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/cleanup.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/gemini-dispatch.yml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/gemini-invoke.yml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/gemini-review.yml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/gemini-scheduled-triage.yml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/gemini-triage.yml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/integration_legacy_tests.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/integration_pathways_cluster_create.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/integration_ray_cluster_create.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/label-validation.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/nightly_tests.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/reusable_build_kjob.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/reusable_build_scripts.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/reusable_goldens.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/reusable_integration_tests.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/reusable_lint_and_format.yml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/reusable_storage_delete.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/reusable_unit_tests.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.github/workflows/stale.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.gitignore +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/.pre-commit-config.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/LICENSE +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/Makefile +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/README.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/backoff_retry.sh +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/data/Dockerfile +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/code-of-conduct.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/contributing.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/local_testing.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/permissions.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/testing.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/troubleshooting.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/advanced.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/autoprovisioning.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/clusters.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/cpu.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/docker.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/gpu.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/inspector.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/job.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/run.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/storage.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/tpu7x/clusters.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/tpu7x/workloads.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/docs/usage/workloads.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/batch.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/fake_training.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/job.sh +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/llama-3.1-finetuning/check_cuda.sh +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/llama-3.1-finetuning/requirements.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/llama-3.1-finetuning/train.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/llama-3.1-finetuning/train.slurm +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/llama-3.1-finetuning/training_data.jsonl +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/nccl/nccl-a3mega.sh +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/nccl/nccl-a3ultra.sh +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/nccl/nccl.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/storage/filestore-manifest-attach.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/storage/gcsfuse-manifest.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/storage/lustre-manifest-attach.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/storage/parallelstore-manifest-attach.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/examples/storage/pd-manifest-attach.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/golden_buddy.sh +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Batch.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_create_for_single-host_single-slice_TPU.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_delete.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Cluster_delete_force.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Job_cancel.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Job_info.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Job_list.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Storage_list.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Workload_create_pathways.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Workload_delete.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/goldens/Workload_list.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/pylintrc +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/setup.cfg +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/integration/README.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/integration/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/integration/docker_manager_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/integration/gcluster_a3mega_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/integration/gcluster_a3ultra_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/integration/gcluster_a4_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/integration/gcluster_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/api/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/blueprints/a3mega/storage_crd.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/blueprints/a3ultra/nccl-installer.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/blueprints/a3ultra/storage_crd.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/blueprints/a4/config-map.yaml.tftpl +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/blueprints/a4/storage_crd.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/batch.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/config.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/info.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/inspector.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/job.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/kjob_common.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/managed_ml_diagnostics.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/managed_ml_diagnostics_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/run.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/shell.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/storage.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/commands/version.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/blueprint/blueprint_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/blueprint/testing/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/blueprint/testing/data/a3_mega.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/blueprint/testing/data/a4.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/cluster.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/cluster_private.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/cluster_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/config.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/config_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/docker_image.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/docker_manager.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/docker_resources.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/filestore.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/gcloud_context_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/gcsfuse.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/jobset.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/kjob.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/monitoring.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/mtc.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/network.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/pathways_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/ray.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/storage.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/testing/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/testing/commands_tester.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/testing/commands_tester_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/updates_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/vertex.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/workload.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/core/workload_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/batch.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/config.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/core.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/info.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/job.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/kind.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/run.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/shell.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/storage.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/storage_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/validators.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/version.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/workload.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/parser/workload_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/telemetry_uploader.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/filestore-pv.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/filestore-pvc.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/filestore-sc.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/fuse-pv.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/fuse-pvc.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/mtc-cpc.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/templates/volume_bundle.yaml +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/console.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/console_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/execution_context.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/feature_flags.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/file.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/kubectl.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/network.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/objects.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/templates.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/topology.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/topology_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/user_agent.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/user_agent_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/user_input.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/user_input_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/validation.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/validation_test.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/versions.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/src/xpk.egg-info/top_level.txt +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/tools/Dockerfile-kjob +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/tools/build-kjob.sh +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/tools/install-gke-auth-plugin.sh +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/tools/install-xpk.sh +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/xpk-large-scale-guide.sh +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/xpk-notebooks.md +0 -0
- {xpk-0.16.1 → xpk-0.17.0}/xpk-slurm-commands.md +0 -0
|
@@ -17,9 +17,11 @@
|
|
|
17
17
|
|
|
18
18
|
changelog:
|
|
19
19
|
categories:
|
|
20
|
+
- title: Breaking Changes
|
|
21
|
+
labels: [release-breaking]
|
|
20
22
|
- title: New Features
|
|
21
23
|
labels: [release-features]
|
|
22
|
-
- title:
|
|
23
|
-
labels: [release-
|
|
24
|
+
- title: Improvements
|
|
25
|
+
labels: [release-improvements]
|
|
24
26
|
- title: Bug fixes
|
|
25
27
|
labels: [release-bugfix]
|
|
@@ -25,9 +25,29 @@ on:
|
|
|
25
25
|
push:
|
|
26
26
|
tags:
|
|
27
27
|
- "v[0-9]+.[0-9]+.[0-9]+"
|
|
28
|
+
workflow_dispatch:
|
|
28
29
|
|
|
29
30
|
jobs:
|
|
31
|
+
validate_tag:
|
|
32
|
+
name: Validate Tag Pattern
|
|
33
|
+
runs-on: ubuntu-latest
|
|
34
|
+
steps:
|
|
35
|
+
- name: Check ref
|
|
36
|
+
run: |
|
|
37
|
+
# When workflow is dispatched through workflow_dispatch, it could be launched on branches instead of tags.
|
|
38
|
+
# We want to make sure it is launched with the tag that additionally is a valid version tag.
|
|
39
|
+
if [[ "${{ github.ref_type }}" != "tag" ]]; then
|
|
40
|
+
echo "::error::This workflow must be run on a Tag, not a Branch."
|
|
41
|
+
exit 1
|
|
42
|
+
fi
|
|
43
|
+
|
|
44
|
+
if [[ ! "${{ github.ref_name }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
|
|
45
|
+
echo "::error::Tag '${{ github.ref_name }}' does not match the required pattern v[0-9]+.[0-9]+.[0-9]+"
|
|
46
|
+
exit 1
|
|
47
|
+
fi
|
|
48
|
+
|
|
30
49
|
build_wheel:
|
|
50
|
+
needs: [validate_tag]
|
|
31
51
|
uses: ./.github/workflows/reusable_build_wheel.yaml
|
|
32
52
|
approval:
|
|
33
53
|
name: Wait for approval
|
|
@@ -43,7 +43,7 @@ jobs:
|
|
|
43
43
|
- name: Check xpk installation
|
|
44
44
|
run: xpk version
|
|
45
45
|
- name: Create an XPK Cluster with zero node pools
|
|
46
|
-
run: xpk cluster create --cluster $EMPTY_CLUSTER_NAME --
|
|
46
|
+
run: xpk cluster create --cluster $EMPTY_CLUSTER_NAME --device-type=n2-standard-64-1 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --on-demand --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
|
|
47
47
|
- name: Delete the cluster created
|
|
48
48
|
run: xpk cluster delete --cluster $EMPTY_CLUSTER_NAME --zone=us-central2-b --force
|
|
49
49
|
if: always()
|
|
@@ -71,7 +71,7 @@ jobs:
|
|
|
71
71
|
- name: Check xpk installation
|
|
72
72
|
run: xpk version
|
|
73
73
|
- name: Create a Private XPK Cluster with zero node pools
|
|
74
|
-
run: xpk cluster create --cluster $PRIVATE_CLUSTER_NAME --private --
|
|
74
|
+
run: xpk cluster create --cluster $PRIVATE_CLUSTER_NAME --private --device-type=n2-standard-64-1 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --on-demand --custom-cluster-arguments='${CLUSTER_NETWORK_ARGUMENTS}'
|
|
75
75
|
- name: Verify the created cluster is private
|
|
76
76
|
run: gcloud container clusters describe $PRIVATE_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
|
|
77
77
|
- name: Delete the cluster created
|
|
@@ -40,7 +40,7 @@ jobs:
|
|
|
40
40
|
- name: Check xpk installation
|
|
41
41
|
run: xpk version
|
|
42
42
|
- name: Create cluster
|
|
43
|
-
run: xpk cluster create --cluster xpk-storage-tests --
|
|
43
|
+
run: xpk cluster create --cluster xpk-storage-tests --device-type=n2-standard-64-1 --num-slices=1 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=2 --on-demand --enable-gcpfilestore-csi-driver --enable-gcsfuse-csi-driver --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
|
|
44
44
|
|
|
45
45
|
delete-cluster:
|
|
46
46
|
runs-on: [ubuntu-22.04]
|
|
@@ -71,7 +71,7 @@ jobs:
|
|
|
71
71
|
cancel-in-progress: true
|
|
72
72
|
with:
|
|
73
73
|
cluster-name: xpk-storage-tests
|
|
74
|
-
|
|
74
|
+
device-type: n2-standard-64-1
|
|
75
75
|
zone: us-central2-b
|
|
76
76
|
storage-type: 'gcsfuse'
|
|
77
77
|
storage-command: 'attach'
|
|
@@ -103,7 +103,7 @@ jobs:
|
|
|
103
103
|
cancel-in-progress: true
|
|
104
104
|
with:
|
|
105
105
|
cluster-name: xpk-storage-tests
|
|
106
|
-
|
|
106
|
+
device-type: n2-standard-64-1
|
|
107
107
|
zone: us-central2-b
|
|
108
108
|
storage-type: 'gcpfilestore'
|
|
109
109
|
storage-command: 'create'
|
|
@@ -134,7 +134,7 @@ jobs:
|
|
|
134
134
|
cancel-in-progress: true
|
|
135
135
|
with:
|
|
136
136
|
cluster-name: xpk-storage-tests
|
|
137
|
-
|
|
137
|
+
device-type: n2-standard-64-1
|
|
138
138
|
zone: us-central2-b
|
|
139
139
|
storage-type: 'gcpfilestore'
|
|
140
140
|
storage-command: 'attach'
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# Copyright 2025 Google LLC
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# https://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
name: Periodic Release
|
|
16
|
+
|
|
17
|
+
on:
|
|
18
|
+
schedule:
|
|
19
|
+
- cron: "30 10 * * 1" # At 10:30 on Monday.
|
|
20
|
+
workflow_dispatch:
|
|
21
|
+
|
|
22
|
+
permissions:
|
|
23
|
+
contents: write
|
|
24
|
+
actions: write
|
|
25
|
+
|
|
26
|
+
jobs:
|
|
27
|
+
validate-ref:
|
|
28
|
+
runs-on: ubuntu-latest
|
|
29
|
+
steps:
|
|
30
|
+
- name: Check ref
|
|
31
|
+
run: |
|
|
32
|
+
if [[ "${{ github.ref_type }}" != "branch" ]]; then
|
|
33
|
+
echo "::error::This workflow must be run on a Branch, not a Tag."
|
|
34
|
+
exit 1
|
|
35
|
+
fi
|
|
36
|
+
|
|
37
|
+
if [[ "${{ github.ref_name }}" != "main" ]]; then
|
|
38
|
+
echo "::error::This workflow must be run on a main branch."
|
|
39
|
+
exit 1
|
|
40
|
+
fi
|
|
41
|
+
|
|
42
|
+
compute-release-branch:
|
|
43
|
+
needs: [validate-ref]
|
|
44
|
+
runs-on: ubuntu-latest
|
|
45
|
+
outputs:
|
|
46
|
+
skip_push: ${{ steps.compute_branch_name.outputs.skip_push }}
|
|
47
|
+
new_branch_name: ${{ steps.compute_branch_name.outputs.new_branch_name }}
|
|
48
|
+
steps:
|
|
49
|
+
- name: Checkout repository
|
|
50
|
+
uses: actions/checkout@v4
|
|
51
|
+
with:
|
|
52
|
+
fetch-depth: 0
|
|
53
|
+
fetch-tags: true
|
|
54
|
+
- name: Compute Next Version
|
|
55
|
+
id: compute_branch_name
|
|
56
|
+
env:
|
|
57
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
58
|
+
run: |
|
|
59
|
+
LATEST_BRANCH=$(git branch -r | grep -oE 'origin/release-[0-9]+\.[0-9]+' | sort -V | tail -n 1)
|
|
60
|
+
|
|
61
|
+
if [ -z "$LATEST_BRANCH" ]; then
|
|
62
|
+
echo "No existing release branches found. Cannot determine next version automatically."
|
|
63
|
+
exit 1
|
|
64
|
+
fi
|
|
65
|
+
|
|
66
|
+
echo "Latest release branch found: $LATEST_BRANCH"
|
|
67
|
+
|
|
68
|
+
COMMIT_COUNT=$(git rev-list --count "$LATEST_BRANCH"..HEAD)
|
|
69
|
+
|
|
70
|
+
echo "Commits since last release: $COMMIT_COUNT"
|
|
71
|
+
|
|
72
|
+
if [ "$COMMIT_COUNT" -eq 0 ]; then
|
|
73
|
+
echo "No new changes detected compared to $LATEST_BRANCH."
|
|
74
|
+
echo "Skipping release branch creation."
|
|
75
|
+
echo "skip_push=true" >> $GITHUB_OUTPUT
|
|
76
|
+
exit 0
|
|
77
|
+
fi
|
|
78
|
+
|
|
79
|
+
VERSION_STR=$(echo "$LATEST_BRANCH" | sed 's/origin\/release-//')
|
|
80
|
+
CURRENT_MAJOR=$(echo "$VERSION_STR" | cut -d. -f1)
|
|
81
|
+
CURRENT_MINOR=$(echo "$VERSION_STR" | cut -d. -f2)
|
|
82
|
+
|
|
83
|
+
echo "Retrieving pull request labels between $LATEST_BRANCH and HEAD..."
|
|
84
|
+
LABELS=$(git log $LATEST_BRANCH..HEAD --pretty=format:"%s" \
|
|
85
|
+
| grep -oE "#[0-9]+" \
|
|
86
|
+
| tr -d "#" \
|
|
87
|
+
| xargs -I {} gh pr view {} --json labels --jq '.labels[].name' \
|
|
88
|
+
| sort \
|
|
89
|
+
| uniq)
|
|
90
|
+
|
|
91
|
+
echo "Pull request labels:"
|
|
92
|
+
echo "$LABELS"
|
|
93
|
+
|
|
94
|
+
if echo "$LABELS" | grep -q "release-breaking"; then
|
|
95
|
+
NEW_MAJOR=$((CURRENT_MAJOR + 1))
|
|
96
|
+
NEW_MINOR=0
|
|
97
|
+
echo "BREAKING detected. Bumping Major version."
|
|
98
|
+
else
|
|
99
|
+
NEW_MAJOR=$CURRENT_MAJOR
|
|
100
|
+
NEW_MINOR=$((CURRENT_MINOR + 1))
|
|
101
|
+
echo "No breaking changes detected. Bumping Minor version."
|
|
102
|
+
fi
|
|
103
|
+
|
|
104
|
+
NEW_BRANCH_NAME="release-$NEW_MAJOR.$NEW_MINOR"
|
|
105
|
+
echo "New release branch calculated: $NEW_BRANCH_NAME"
|
|
106
|
+
echo "new_branch_name=$NEW_BRANCH_NAME" >> $GITHUB_OUTPUT
|
|
107
|
+
echo "skip_push=false" >> $GITHUB_OUTPUT
|
|
108
|
+
|
|
109
|
+
create-release-branch:
|
|
110
|
+
needs: [compute-release-branch]
|
|
111
|
+
environment:
|
|
112
|
+
name: release
|
|
113
|
+
runs-on: ubuntu-latest
|
|
114
|
+
steps:
|
|
115
|
+
- name: Checkout repository
|
|
116
|
+
if: needs.compute-release-branch.outputs.skip_push != 'true'
|
|
117
|
+
uses: actions/checkout@v4
|
|
118
|
+
with:
|
|
119
|
+
fetch-depth: 0
|
|
120
|
+
fetch-tags: true
|
|
121
|
+
- name: Push the branch
|
|
122
|
+
if: needs.compute-release-branch.outputs.skip_push != 'true'
|
|
123
|
+
run: |
|
|
124
|
+
NEW_BRANCH_NAME="${{needs.compute-release-branch.outputs.new_branch_name}}"
|
|
125
|
+
git checkout -b "$NEW_BRANCH_NAME"
|
|
126
|
+
git push origin "$NEW_BRANCH_NAME"
|
|
127
|
+
|
|
128
|
+
echo "Successfully pushed $NEW_BRANCH_NAME"
|
|
129
|
+
- name: Run release branch versioning
|
|
130
|
+
if: needs.compute-release-branch.outputs.skip_push != 'true'
|
|
131
|
+
env:
|
|
132
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
133
|
+
run: gh workflow run release_branch_versioning.yaml --ref ${{needs.compute-release-branch.outputs.new_branch_name}}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# Copyright 2025 Google LLC
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# https://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
name: Release branch versioning
|
|
16
|
+
|
|
17
|
+
on:
|
|
18
|
+
push:
|
|
19
|
+
branches:
|
|
20
|
+
- "release-*"
|
|
21
|
+
workflow_dispatch:
|
|
22
|
+
|
|
23
|
+
permissions:
|
|
24
|
+
contents: write
|
|
25
|
+
actions: write
|
|
26
|
+
|
|
27
|
+
jobs:
|
|
28
|
+
validate_branch:
|
|
29
|
+
name: Validate Branch Pattern
|
|
30
|
+
runs-on: ubuntu-latest
|
|
31
|
+
steps:
|
|
32
|
+
- name: Check ref
|
|
33
|
+
run: |
|
|
34
|
+
# When workflow is dispatched through workflow_dispatch, it could be launched on any branch or tag.
|
|
35
|
+
# We want to make sure it is launched with the branch that matches the syntax.
|
|
36
|
+
if [[ "${{ github.ref_type }}" != "branch" ]]; then
|
|
37
|
+
echo "::error::This workflow must be run on a Branch, not a Tag."
|
|
38
|
+
exit 1
|
|
39
|
+
fi
|
|
40
|
+
|
|
41
|
+
if [[ ! "${{ github.ref_name }}" =~ ^release- ]]; then
|
|
42
|
+
echo "::error::Invalid Branch: Manual runs are only allowed on branches matching 'release-*'. Current branch: ${{ github.ref_name }}"
|
|
43
|
+
exit 1
|
|
44
|
+
fi
|
|
45
|
+
|
|
46
|
+
bump_version:
|
|
47
|
+
needs: [validate_branch]
|
|
48
|
+
runs-on: ubuntu-latest
|
|
49
|
+
outputs:
|
|
50
|
+
tag: ${{ steps.bump.outputs.tag }}
|
|
51
|
+
steps:
|
|
52
|
+
- name: Checkout code
|
|
53
|
+
uses: actions/checkout@v4
|
|
54
|
+
with:
|
|
55
|
+
token: ${{ secrets.GITHUB_TOKEN }}
|
|
56
|
+
fetch-depth: 0
|
|
57
|
+
|
|
58
|
+
- name: Calculate Next Version
|
|
59
|
+
id: bump
|
|
60
|
+
run: |
|
|
61
|
+
EXISTING_TAG=$(git tag --points-at HEAD | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' | head -n 1)
|
|
62
|
+
|
|
63
|
+
if [ -n "$EXISTING_TAG" ]; then
|
|
64
|
+
echo "Commit is already tagged with: $EXISTING_TAG"
|
|
65
|
+
exit 1
|
|
66
|
+
fi
|
|
67
|
+
|
|
68
|
+
BRANCH_NAME="${{ github.ref_name }}"
|
|
69
|
+
BASE_VERSION=${BRANCH_NAME#release-}
|
|
70
|
+
|
|
71
|
+
echo "Base Version derived from branch: $BASE_VERSION"
|
|
72
|
+
|
|
73
|
+
LATEST_TAG=$(git tag -l "v$BASE_VERSION.*" | sort -V | tail -n1)
|
|
74
|
+
|
|
75
|
+
if [ -z "$LATEST_TAG" ]; then
|
|
76
|
+
echo "No tags found for this release branch. Starting at patch .0"
|
|
77
|
+
NEW_TAG="v$BASE_VERSION.0"
|
|
78
|
+
else
|
|
79
|
+
echo "Found existing latest tag: $LATEST_TAG"
|
|
80
|
+
|
|
81
|
+
VERSION=${LATEST_TAG#v}
|
|
82
|
+
|
|
83
|
+
IFS='.' read -r -a parts <<< "$VERSION"
|
|
84
|
+
patch=${parts[2]}
|
|
85
|
+
|
|
86
|
+
new_patch=$((patch + 1))
|
|
87
|
+
NEW_TAG="v$BASE_VERSION.$new_patch"
|
|
88
|
+
fi
|
|
89
|
+
|
|
90
|
+
echo "Next Tag: $NEW_TAG"
|
|
91
|
+
echo "tag=$NEW_TAG" >> $GITHUB_OUTPUT
|
|
92
|
+
|
|
93
|
+
- name: Push Tag
|
|
94
|
+
run: |
|
|
95
|
+
NEW_TAG=${{ steps.bump.outputs.tag }}
|
|
96
|
+
|
|
97
|
+
echo "Pushing tag $NEW_TAG..."
|
|
98
|
+
git tag "$NEW_TAG"
|
|
99
|
+
git push origin "$NEW_TAG"
|
|
100
|
+
- name: Run build
|
|
101
|
+
env:
|
|
102
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
103
|
+
run: gh workflow run build_wheels.yaml --ref ${{ steps.bump.outputs.tag }}
|
|
@@ -18,10 +18,10 @@ on:
|
|
|
18
18
|
cluster-name:
|
|
19
19
|
type: string
|
|
20
20
|
required: true
|
|
21
|
-
|
|
22
|
-
description: '
|
|
21
|
+
device-type:
|
|
22
|
+
description: 'Device Type'
|
|
23
23
|
required: false
|
|
24
|
-
default: '
|
|
24
|
+
default: 'n2-standard-64-1'
|
|
25
25
|
type: string
|
|
26
26
|
zone:
|
|
27
27
|
type: string
|
|
@@ -102,14 +102,14 @@ jobs:
|
|
|
102
102
|
run: |
|
|
103
103
|
kubectl get pv ${{inputs.storage-name}}-pv -oyaml | grep 'gcsfuseMetadataPrefetchOnMount: "true"' || (echo 'Metadata pre-population was not enabled' && exit 143)
|
|
104
104
|
- name: Run workload to write file on filestore
|
|
105
|
-
run: xpk workload create --workload $STORAGE_WRITE_WORKLOAD --num-slices=1 --docker-image='marketplace.gcr.io/google/ubuntu2004' --command "mkdir -p /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/ && echo 'Test text message' > /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Writing to filestore failed' && exit 143)" --cluster ${{inputs.cluster-name}} --
|
|
105
|
+
run: xpk workload create --workload $STORAGE_WRITE_WORKLOAD --num-slices=1 --docker-image='marketplace.gcr.io/google/ubuntu2004' --command "mkdir -p /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/ && echo 'Test text message' > /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Writing to filestore failed' && exit 143)" --cluster ${{inputs.cluster-name}} --device-type=${{inputs.device-type}} --zone ${{inputs.zone}}
|
|
106
106
|
- name: Wait for writer workload completion and confirm it succeeded
|
|
107
107
|
run: xpk workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $STORAGE_WRITE_WORKLOAD --timeout 300
|
|
108
108
|
- name: Delete the writer workload on the cluster
|
|
109
109
|
if: always()
|
|
110
110
|
run: xpk workload delete --workload $STORAGE_WRITE_WORKLOAD --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
|
|
111
111
|
- name: Run workload to read file on filestore
|
|
112
|
-
run : xpk workload create --workload $STORAGE_READ_WORKLOAD --command "grep 'Test text message' /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)" --cluster ${{inputs.cluster-name}} --
|
|
112
|
+
run : xpk workload create --workload $STORAGE_READ_WORKLOAD --command "grep 'Test text message' /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)" --cluster ${{inputs.cluster-name}} --device-type=${{inputs.device-type}} --zone ${{inputs.zone}}
|
|
113
113
|
- name: Wait for reader workload completion and confirm it succeeded
|
|
114
114
|
run: xpk workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $STORAGE_READ_WORKLOAD --timeout 300
|
|
115
115
|
- name: Delete the reader workload on the cluster
|
|
@@ -154,7 +154,7 @@ jobs:
|
|
|
154
154
|
- name: Delete create-shell.exp file
|
|
155
155
|
run: rm create-shell.exp
|
|
156
156
|
- name: Run workload to delete file on filestore
|
|
157
|
-
run : xpk workload create --workload $STORAGE_DELETE_WORKLOAD --command "rm -rf /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || exit 143" --num-slices=1 --cluster ${{inputs.cluster-name}} --
|
|
157
|
+
run : xpk workload create --workload $STORAGE_DELETE_WORKLOAD --command "rm -rf /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || exit 143" --num-slices=1 --cluster ${{inputs.cluster-name}} --device-type=${{inputs.device-type}} --zone ${{inputs.zone}}
|
|
158
158
|
- name: Wait for delete workload completion and confirm it succeeded
|
|
159
159
|
run: xpk workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $STORAGE_DELETE_WORKLOAD --timeout 300
|
|
160
160
|
- name: Delete the delete workload on the cluster
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.17.0
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
5
|
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -25,6 +25,7 @@ Requires-Dist: packaging==24.2
|
|
|
25
25
|
Requires-Dist: google-cloud-filestore==1.12.0
|
|
26
26
|
Requires-Dist: google-cloud-storage
|
|
27
27
|
Requires-Dist: Jinja2==3.1.6
|
|
28
|
+
Requires-Dist: urllib3<2.6.0
|
|
28
29
|
Provides-Extra: dev
|
|
29
30
|
Requires-Dist: pyink==24.3.0; extra == "dev"
|
|
30
31
|
Requires-Dist: pylint>=2.6.0; extra == "dev"
|
|
@@ -86,7 +86,7 @@ If you need to modify the source code or use the latest unreleased features:
|
|
|
86
86
|
|
|
87
87
|
```shell
|
|
88
88
|
# 1. Clone the XPK repository
|
|
89
|
-
git clone
|
|
89
|
+
git clone https://github.com/AI-Hypercomputer/xpk.git
|
|
90
90
|
cd xpk
|
|
91
91
|
|
|
92
92
|
# 2. Install dependencies and build
|
|
@@ -96,6 +96,8 @@ make install
|
|
|
96
96
|
export PATH=$PATH:$PWD/bin
|
|
97
97
|
```
|
|
98
98
|
|
|
99
|
+
*Note: Installing from source is recommended only for contributors and advanced users. Most users should install via PIP for the best stability.*
|
|
100
|
+
|
|
99
101
|
**Persisting the PATH configuration:**
|
|
100
102
|
To use `xpk` in future terminal sessions without re-running the export command, add the binary path to your shell configuration:
|
|
101
103
|
|
|
@@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-
|
|
|
37
37
|
[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
|
|
38
38
|
gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
|
|
39
39
|
[XPK] Creating 1 node pool or pools of tpu7x-8
|
|
40
|
-
We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
40
|
+
We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
41
41
|
[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
|
|
42
42
|
gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
|
|
43
43
|
[XPK] Creating 1 node pool or pools of tpu7x-8
|
|
44
|
-
Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
44
|
+
Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
45
45
|
[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
|
|
46
46
|
gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
|
|
47
47
|
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
|
|
@@ -174,7 +174,7 @@ kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.s
|
|
|
174
174
|
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml
|
|
175
175
|
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
|
|
176
176
|
kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m
|
|
177
|
-
[XPK] Temp file (
|
|
177
|
+
[XPK] Temp file (6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133) content:
|
|
178
178
|
|
|
179
179
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
180
180
|
kind: ResourceFlavor
|
|
@@ -182,19 +182,6 @@ metadata:
|
|
|
182
182
|
name: "1xtpu7x-8"
|
|
183
183
|
spec:
|
|
184
184
|
nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x", "cloud.google.com/gke-tpu-topology": "2x2x1"}
|
|
185
|
-
|
|
186
|
-
---
|
|
187
|
-
|
|
188
|
-
apiVersion: kueue.x-k8s.io/v1beta1
|
|
189
|
-
kind: AdmissionCheck
|
|
190
|
-
metadata:
|
|
191
|
-
name: dws-prov
|
|
192
|
-
spec:
|
|
193
|
-
controllerName: kueue.x-k8s.io/provisioning-request
|
|
194
|
-
parameters:
|
|
195
|
-
apiGroup: kueue.x-k8s.io
|
|
196
|
-
kind: ProvisioningRequestConfig
|
|
197
|
-
name: dws-config
|
|
198
185
|
---
|
|
199
186
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
200
187
|
kind: ProvisioningRequestConfig
|
|
@@ -219,7 +206,6 @@ spec:
|
|
|
219
206
|
withinClusterQueue: LowerPriority
|
|
220
207
|
namespaceSelector: {} # match all.
|
|
221
208
|
resourceGroups: [{'coveredResources': ['google.com/tpu'], 'flavors': [{'name': '1xtpu7x-8', 'resources': [{'name': 'google.com/tpu', 'nominalQuota': 4}]}]}]
|
|
222
|
-
|
|
223
209
|
---
|
|
224
210
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
225
211
|
kind: LocalQueue
|
|
@@ -269,7 +255,7 @@ value: 1000
|
|
|
269
255
|
globalDefault: false
|
|
270
256
|
description: "Very High"
|
|
271
257
|
[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
|
|
272
|
-
kubectl apply -f
|
|
258
|
+
kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
|
|
273
259
|
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
|
|
274
260
|
kubectl get node --no-headers | wc -l
|
|
275
261
|
[XPK] Try 1: Updating Kueue Controller Manager resources
|
|
@@ -37,11 +37,11 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-
|
|
|
37
37
|
[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
|
|
38
38
|
gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
|
|
39
39
|
[XPK] Creating 1 node pool or pools of tpu7x-16
|
|
40
|
-
We assume that the underlying system is: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
|
|
40
|
+
We assume that the underlying system is: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
|
|
41
41
|
[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
|
|
42
42
|
gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
|
|
43
43
|
[XPK] Creating 1 node pool or pools of tpu7x-16
|
|
44
|
-
Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
|
|
44
|
+
Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x2', vms_per_slice=2, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-16', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=True, gpu_config=None)
|
|
45
45
|
[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
|
|
46
46
|
gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
|
|
47
47
|
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
|
|
@@ -176,7 +176,7 @@ kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.s
|
|
|
176
176
|
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml
|
|
177
177
|
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
|
|
178
178
|
kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m
|
|
179
|
-
[XPK] Temp file (
|
|
179
|
+
[XPK] Temp file (b58f50dd88cb1211d51276b9b445f6bca02f0e97fa984656d47992aecd9322cc) content:
|
|
180
180
|
|
|
181
181
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
182
182
|
kind: ResourceFlavor
|
|
@@ -184,19 +184,6 @@ metadata:
|
|
|
184
184
|
name: "1xtpu7x-16"
|
|
185
185
|
spec:
|
|
186
186
|
nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu7x", "cloud.google.com/gke-tpu-topology": "2x2x2"}
|
|
187
|
-
|
|
188
|
-
---
|
|
189
|
-
|
|
190
|
-
apiVersion: kueue.x-k8s.io/v1beta1
|
|
191
|
-
kind: AdmissionCheck
|
|
192
|
-
metadata:
|
|
193
|
-
name: dws-prov
|
|
194
|
-
spec:
|
|
195
|
-
controllerName: kueue.x-k8s.io/provisioning-request
|
|
196
|
-
parameters:
|
|
197
|
-
apiGroup: kueue.x-k8s.io
|
|
198
|
-
kind: ProvisioningRequestConfig
|
|
199
|
-
name: dws-config
|
|
200
187
|
---
|
|
201
188
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
202
189
|
kind: ProvisioningRequestConfig
|
|
@@ -221,7 +208,6 @@ spec:
|
|
|
221
208
|
withinClusterQueue: LowerPriority
|
|
222
209
|
namespaceSelector: {} # match all.
|
|
223
210
|
resourceGroups: [{'coveredResources': ['google.com/tpu'], 'flavors': [{'name': '1xtpu7x-16', 'resources': [{'name': 'google.com/tpu', 'nominalQuota': 8}]}]}]
|
|
224
|
-
|
|
225
211
|
---
|
|
226
212
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
227
213
|
kind: LocalQueue
|
|
@@ -271,7 +257,7 @@ value: 1000
|
|
|
271
257
|
globalDefault: false
|
|
272
258
|
description: "Very High"
|
|
273
259
|
[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
|
|
274
|
-
kubectl apply -f
|
|
260
|
+
kubectl apply -f b58f50dd88cb1211d51276b9b445f6bca02f0e97fa984656d47992aecd9322cc
|
|
275
261
|
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
|
|
276
262
|
kubectl get node --no-headers | wc -l
|
|
277
263
|
[XPK] Try 1: Updating Kueue Controller Manager resources
|
|
@@ -41,13 +41,13 @@ kubectl wait deployment/coredns --for=condition=Available=true --namespace=kube-
|
|
|
41
41
|
[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
|
|
42
42
|
gcloud beta container clusters describe golden-cluster-private --location us-central1 --project golden-project --format="value(currentMasterVersion)"
|
|
43
43
|
[XPK] Creating 1 node pool or pools of v5p-8
|
|
44
|
-
We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
44
|
+
We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
45
45
|
[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
|
|
46
46
|
gcloud beta container node-pools list --cluster golden-cluster-private --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
|
|
47
47
|
[XPK] Task: `Describe reservation` is implemented by the following command not running since it is a dry run.
|
|
48
48
|
gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a
|
|
49
49
|
[XPK] Creating 1 node pool or pools of v5p-8
|
|
50
|
-
Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
50
|
+
Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu-v5p-slice', gce_machine_type='ct5p-hightpu-4t', chips_per_vm=4, accelerator_type=TPU, device_type='v5p-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
51
51
|
[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
|
|
52
52
|
gcloud beta container node-pools describe 0 --cluster golden-cluster-private --project=golden-project --location=us-central1 --format="value(locations)"
|
|
53
53
|
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
|
|
@@ -184,7 +184,7 @@ kubectl get deployment kueue-controller-manager -n kueue-system -o jsonpath='{.s
|
|
|
184
184
|
kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.3/manifests.yaml
|
|
185
185
|
[XPK] Task: `Wait for Kueue to be available` is implemented by the following command not running since it is a dry run.
|
|
186
186
|
kubectl wait deploy/kueue-controller-manager -n kueue-system --for=condition=available --timeout=10m
|
|
187
|
-
[XPK] Temp file (
|
|
187
|
+
[XPK] Temp file (2e0015f210b664c3b767ae4e11af51387b01d4d6b36e20fecbdee137d3d2700b) content:
|
|
188
188
|
|
|
189
189
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
190
190
|
kind: ResourceFlavor
|
|
@@ -192,28 +192,13 @@ metadata:
|
|
|
192
192
|
name: "1xv5p-8"
|
|
193
193
|
spec:
|
|
194
194
|
nodeLabels: {"cloud.google.com/gke-tpu-accelerator": "tpu-v5p-slice", "cloud.google.com/gke-tpu-topology": "2x2x1"}
|
|
195
|
-
|
|
196
195
|
---
|
|
197
|
-
|
|
198
196
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
199
197
|
kind: ResourceFlavor
|
|
200
198
|
metadata:
|
|
201
199
|
name: "cpu-user"
|
|
202
200
|
spec:
|
|
203
201
|
nodeLabels: {"cloud.google.com/gke-nodepool": "cpu-np"}
|
|
204
|
-
|
|
205
|
-
---
|
|
206
|
-
|
|
207
|
-
apiVersion: kueue.x-k8s.io/v1beta1
|
|
208
|
-
kind: AdmissionCheck
|
|
209
|
-
metadata:
|
|
210
|
-
name: dws-prov
|
|
211
|
-
spec:
|
|
212
|
-
controllerName: kueue.x-k8s.io/provisioning-request
|
|
213
|
-
parameters:
|
|
214
|
-
apiGroup: kueue.x-k8s.io
|
|
215
|
-
kind: ProvisioningRequestConfig
|
|
216
|
-
name: dws-config
|
|
217
202
|
---
|
|
218
203
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
219
204
|
kind: ProvisioningRequestConfig
|
|
@@ -238,7 +223,6 @@ spec:
|
|
|
238
223
|
withinClusterQueue: LowerPriority
|
|
239
224
|
namespaceSelector: {} # match all.
|
|
240
225
|
resourceGroups: [{'coveredResources': ['google.com/tpu'], 'flavors': [{'name': '1xv5p-8', 'resources': [{'name': 'google.com/tpu', 'nominalQuota': 4}]}]}, {'coveredResources': ['cpu', 'memory'], 'flavors': [{'name': 'cpu-user', 'resources': [{'name': 'cpu', 'nominalQuota': 480}, {'name': 'memory', 'nominalQuota': '2000G'}]}]}]
|
|
241
|
-
|
|
242
226
|
---
|
|
243
227
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
244
228
|
kind: LocalQueue
|
|
@@ -288,7 +272,7 @@ value: 1000
|
|
|
288
272
|
globalDefault: false
|
|
289
273
|
description: "Very High"
|
|
290
274
|
[XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
|
|
291
|
-
kubectl apply -f
|
|
275
|
+
kubectl apply -f 2e0015f210b664c3b767ae4e11af51387b01d4d6b36e20fecbdee137d3d2700b
|
|
292
276
|
[XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
|
|
293
277
|
kubectl get node --no-headers | wc -l
|
|
294
278
|
[XPK] Try 1: Updating Kueue Controller Manager resources
|