xpk 1.0.0__tar.gz → 1.1.1__tar.gz
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/integration_basic_cluster_create.yaml +3 -35
- xpk-1.1.1/.github/workflows/integration_gpu_cluster_create.yaml +78 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/nightly_tests.yaml +7 -6
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_goldens.yaml +1 -1
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/stale.yaml +4 -4
- {xpk-1.0.0 → xpk-1.1.1}/Makefile +2 -10
- {xpk-1.0.0/src/xpk.egg-info → xpk-1.1.1}/PKG-INFO +37 -21
- {xpk-1.0.0 → xpk-1.1.1}/README.md +36 -20
- {xpk-1.0.0 → xpk-1.1.1}/docs/testing.md +37 -16
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/clusters.md +2 -2
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +0 -4
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +0 -4
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/workloads.md +3 -0
- xpk-1.1.1/recipes/Basic_cluster_adapt.md +143 -0
- xpk-1.0.0/goldens/Basic_cluster_create.txt → xpk-1.1.1/recipes/Basic_cluster_create.md +14 -5
- xpk-1.1.1/recipes/Cluster_create_RayCluster.md +288 -0
- xpk-1.0.0/goldens/Cluster_create_for_multi-host_nodepool.txt → xpk-1.1.1/recipes/Cluster_create_for_multi-host_nodepool.md +15 -6
- xpk-1.1.1/recipes/Cluster_create_for_single-host_nodepool.md +275 -0
- xpk-1.0.0/goldens/Cluster_create_private.txt → xpk-1.1.1/recipes/Cluster_create_private.md +15 -6
- xpk-1.0.0/goldens/Cluster_create_sub-slicing.txt → xpk-1.1.1/recipes/Cluster_create_sub-slicing.md +14 -5
- xpk-1.0.0/goldens/Cluster_create_super-slicing.txt → xpk-1.1.1/recipes/Cluster_create_super-slicing.md +18 -9
- xpk-1.0.0/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt → xpk-1.1.1/recipes/Cluster_create_with_CPU_and_memory_limits_above_capacity.md +14 -5
- xpk-1.0.0/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt → xpk-1.1.1/recipes/Cluster_create_with_CPU_and_memory_limits_below_capacity.md +14 -5
- xpk-1.0.0/goldens/Cluster_create_with_Managed_Lustre_driver.txt → xpk-1.1.1/recipes/Cluster_create_with_Managed_Lustre_driver.md +14 -5
- xpk-1.0.0/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt → xpk-1.1.1/recipes/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.md +14 -5
- xpk-1.0.0/goldens/Cluster_create_with_gb200-4.txt → xpk-1.1.1/recipes/Cluster_create_with_gb200-4.md +15 -6
- xpk-1.0.0/goldens/Cluster_create_with_shared_reservation.txt → xpk-1.1.1/recipes/Cluster_create_with_shared_reservation.md +14 -5
- xpk-1.0.0/goldens/Cluster_delete.txt → xpk-1.1.1/recipes/Cluster_delete.md +10 -1
- xpk-1.0.0/goldens/Cluster_delete_force.txt → xpk-1.1.1/recipes/Cluster_delete_force.md +10 -1
- xpk-1.0.0/goldens/NAP_cluster-create.txt → xpk-1.1.1/recipes/NAP_cluster-create.md +14 -5
- xpk-1.0.0/goldens/NAP_cluster-create_with_pathways.txt → xpk-1.1.1/recipes/NAP_cluster-create_with_pathways.md +14 -5
- xpk-1.0.0/goldens/Storage_list.txt → xpk-1.1.1/recipes/Storage_list.md +10 -1
- xpk-1.0.0/goldens/Workload_create_with_output-manifest-file.txt → xpk-1.1.1/recipes/Workload_create.md +15 -9
- xpk-1.0.0/goldens/Workload_create_pathways.txt → xpk-1.1.1/recipes/Workload_create_pathways.md +13 -6
- xpk-1.0.0/goldens/Workload_create_sub-slicing.txt → xpk-1.1.1/recipes/Workload_create_sub-slicing.md +15 -8
- xpk-1.0.0/goldens/Workload_create_super-slicing.txt → xpk-1.1.1/recipes/Workload_create_super-slicing.md +16 -9
- xpk-1.0.0/goldens/Workload_create.txt → xpk-1.1.1/recipes/Workload_create_with_output-manifest-file.md +16 -8
- xpk-1.0.0/goldens/Workload_delete.txt → xpk-1.1.1/recipes/Workload_delete.md +10 -1
- xpk-1.0.0/goldens/Workload_list.txt → xpk-1.1.1/recipes/Workload_list.md +10 -1
- xpk-1.1.1/recipes/comprehensive-demo.md +83 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/cluster.py +29 -30
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/cluster_gcluster.py +19 -14
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/cluster_test.py +1 -21
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/common.py +39 -6
- xpk-1.1.1/src/xpk/commands/common_test.py +170 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/info.py +9 -5
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/inspector.py +33 -4
- xpk-1.1.1/src/xpk/commands/inspector_test.py +142 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/workload.py +35 -17
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/workload_test.py +70 -3
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/blueprint_generator.py +19 -8
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a4.yaml +3 -1
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/capacity.py +37 -17
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/capacity_test.py +66 -1
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/cluster.py +10 -10
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/cluster_private.py +3 -3
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/cluster_test.py +29 -2
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/docker_container.py +55 -30
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/docker_manager.py +4 -4
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/docker_resources.py +4 -1
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/kueue_manager.py +6 -8
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/kueue_manager_test.py +4 -5
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/nap.py +14 -3
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/nodepool.py +46 -13
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/nodepool_test.py +143 -8
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/pathways.py +4 -8
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/remote_state/fuse_remote_state.py +1 -1
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/scheduling.py +16 -13
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/scheduling_test.py +15 -7
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/system_characteristics.py +6 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/telemetry.py +11 -1
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/telemetry_test.py +39 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/testing/commands_tester.py +26 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/testing/commands_tester_test.py +20 -1
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/rdma_decorator.py +9 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/cluster.py +11 -1
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/cluster_test.py +59 -1
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/common.py +11 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/storage.py +3 -3
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/console.py +1 -1
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/feature_flags.py +7 -3
- {xpk-1.0.0 → xpk-1.1.1/src/xpk.egg-info}/PKG-INFO +37 -21
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk.egg-info/SOURCES.txt +32 -39
- xpk-1.1.1/src/xpk.egg-info/top_level.txt +1 -0
- {xpk-1.0.0 → xpk-1.1.1}/tools/install-xpk.sh +1 -1
- xpk-1.1.1/tools/recipes.py +235 -0
- xpk-1.0.0/.github/workflows/integration_legacy_tests.yaml +0 -66
- xpk-1.0.0/.github/workflows/reusable_integration_tests.yaml +0 -61
- xpk-1.0.0/docs/usage/tpu7x/clusters.md +0 -329
- xpk-1.0.0/docs/usage/tpu7x/workloads.md +0 -269
- xpk-1.0.0/golden_buddy.sh +0 -150
- xpk-1.0.0/goldens.yaml +0 -47
- xpk-1.0.0/src/integration/README.md +0 -19
- xpk-1.0.0/src/integration/docker_manager_test.py +0 -102
- xpk-1.0.0/src/integration/gcluster_a3mega_test.py +0 -215
- xpk-1.0.0/src/integration/gcluster_a3ultra_test.py +0 -187
- xpk-1.0.0/src/integration/gcluster_a4_test.py +0 -187
- xpk-1.0.0/src/integration/gcluster_test.py +0 -107
- xpk-1.0.0/src/xpk/utils/__init__.py +0 -15
- xpk-1.0.0/src/xpk/utils/user_input.py +0 -48
- xpk-1.0.0/src/xpk/utils/user_input_test.py +0 -92
- xpk-1.0.0/src/xpk.egg-info/top_level.txt +0 -2
- {xpk-1.0.0 → xpk-1.1.1}/.dockerignore +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/CODEOWNERS +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/actions/install-kueue/action.yml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/actions/setup-test-env/action.yml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/release.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/README.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/build_tests.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/build_wheels.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/cleanup.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/gemini-dispatch.yml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/gemini-invoke.yml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/gemini-review.yml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/gemini-scheduled-triage.yml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/gemini-triage.yml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/integration_pathways_cluster_create.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/integration_ray_cluster_create.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/integration_storage_tests.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/label-validation.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/periodic_release.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/release_branch_versioning.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_build_scripts.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_build_wheel.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_lint_and_format.yml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_storage_create.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_storage_delete.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.github/workflows/reusable_unit_tests.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.gitignore +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/.pre-commit-config.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/LICENSE +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/backoff_retry.sh +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/data/Dockerfile +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/code-of-conduct.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/contributing.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/installation.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/permissions.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/troubleshooting.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/advanced.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/autoprovisioning.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/cpu.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/docker.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/gpu.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/inspector.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/storage.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/fake_training.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/check_cuda.sh +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/requirements.txt +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/train.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/train.slurm +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/llama-3.1-finetuning/training_data.jsonl +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/nccl/nccl-a3mega.sh +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/nccl/nccl-a3ultra.sh +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/nccl/nccl.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/storage/filestore-manifest-attach.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/storage/gcsfuse-manifest.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/storage/lustre-manifest-attach.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/storage/parallelstore-manifest-attach.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/examples/storage/pd-manifest-attach.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/pylintrc +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/pyproject.toml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/setup.cfg +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/__init__.py +0 -0
- {xpk-1.0.0/src/integration → xpk-1.1.1/src/xpk/api}/__init__.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3mega/storage_crd.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/nccl-installer.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a3ultra/storage_crd.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a4/config-map.yaml.tftpl +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/blueprints/a4/storage_crd.yaml +0 -0
- {xpk-1.0.0/src/xpk/api → xpk-1.1.1/src/xpk/commands}/__init__.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/cluster_gcluster_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/config.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/managed_ml_diagnostics.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/managed_ml_diagnostics_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/storage.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/commands/version.py +0 -0
- {xpk-1.0.0/src/xpk/commands → xpk-1.1.1/src/xpk/core}/__init__.py +0 -0
- {xpk-1.0.0/src/xpk/core → xpk-1.1.1/src/xpk/core/blueprint}/__init__.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/blueprint_test.py +0 -0
- {xpk-1.0.0/src/xpk/core/blueprint → xpk-1.1.1/src/xpk/core/blueprint/testing}/__init__.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a3_mega.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/commands.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/config.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/config_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/docker_image.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/filestore.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/gcloud_context_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/gcsfuse.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/jobset.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/monitoring.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/mtc.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/network.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/pathways_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/ray.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/resources.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/storage.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/system_characteristics_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/testing/__init__.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/updates.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/updates_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/vertex.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload.py +0 -0
- {xpk-1.0.0/src/xpk/core/blueprint/testing → xpk-1.1.1/src/xpk/core/workload_decorators}/__init__.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/core/workload_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/main.py +0 -0
- {xpk-1.0.0/src/xpk/core/workload_decorators → xpk-1.1.1/src/xpk/parser}/__init__.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/common_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/config.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/core.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/info.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/inspector.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/storage_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/validators.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/version.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/workload.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/parser/workload_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/telemetry_uploader.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/__init__.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/filestore-pv.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/filestore-pvc.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/filestore-sc.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/fuse-pv.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/fuse-pvc.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/kueue_config.yaml.j2 +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/kueue_super_slicing_topology.yaml.j2 +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/mtc-cpc.yaml +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/templates/storage.yaml +0 -0
- {xpk-1.0.0/src/xpk/parser → xpk-1.1.1/src/xpk/utils}/__init__.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/console_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/execution_context.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/file.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/kubectl.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/kueue.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/network.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/objects.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/templates.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/topology.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/topology_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/user_agent.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/user_agent_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/validation.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/validation_test.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/versions.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk/utils/yaml.py +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/src/xpk.egg-info/requires.txt +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/tools/install-gke-auth-plugin.sh +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/xpk-large-scale-guide.sh +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/xpk-notebooks.md +0 -0
- {xpk-1.0.0 → xpk-1.1.1}/xpk.py +0 -0
|
@@ -31,7 +31,7 @@ jobs:
|
|
|
31
31
|
group: nightly-test-cluster-group-empty
|
|
32
32
|
cancel-in-progress: false
|
|
33
33
|
env:
|
|
34
|
-
EMPTY_CLUSTER_NAME: nightly-xpk-zero
|
|
34
|
+
EMPTY_CLUSTER_NAME: nightly-xpk-zero
|
|
35
35
|
steps:
|
|
36
36
|
- uses: actions/download-artifact@v4
|
|
37
37
|
with:
|
|
@@ -59,7 +59,7 @@ jobs:
|
|
|
59
59
|
group: nightly-test-cluster-group-private
|
|
60
60
|
cancel-in-progress: false
|
|
61
61
|
env:
|
|
62
|
-
PRIVATE_CLUSTER_NAME: nightly-xpk-private-2-v4-8
|
|
62
|
+
PRIVATE_CLUSTER_NAME: nightly-xpk-private-2-v4-8
|
|
63
63
|
steps:
|
|
64
64
|
- uses: actions/download-artifact@v4
|
|
65
65
|
with:
|
|
@@ -83,38 +83,6 @@ jobs:
|
|
|
83
83
|
with:
|
|
84
84
|
name: empty-private-cluster-nodepool-log-${{github.run_id}}
|
|
85
85
|
path: /tmp/NodepoolCreate-${{ env.PRIVATE_CLUSTER_NAME }}-np-*
|
|
86
|
-
dws_flex_cluster:
|
|
87
|
-
runs-on: [ubuntu-22.04]
|
|
88
|
-
concurrency: # We support one build test to run at a time currently.
|
|
89
|
-
group: nightly-test-cluster-group-flex
|
|
90
|
-
cancel-in-progress: false
|
|
91
|
-
env:
|
|
92
|
-
DWS_FLEX_CLUSTER_NAME: xpk-dws-nightly-test-2-v4-8
|
|
93
|
-
steps:
|
|
94
|
-
- uses: actions/download-artifact@v4
|
|
95
|
-
with:
|
|
96
|
-
name: custom-scripts
|
|
97
|
-
- name: Setup environment
|
|
98
|
-
uses: ./.github/actions/setup-test-env
|
|
99
|
-
with:
|
|
100
|
-
credentials_json: "${{ secrets.GCP_SA_KEY }}"
|
|
101
|
-
- name: Check xpk installation
|
|
102
|
-
run: xpk version
|
|
103
|
-
- name: Create a DWS flex queued xpk cluster
|
|
104
|
-
run: xpk cluster create --cluster ${DWS_FLEX_CLUSTER_NAME} --tpu-type=v5p-8 --num-slices=1 --zone=us-east5-a --default-pool-cpu-num-nodes=2 --flex --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS_DWS}"
|
|
105
|
-
- name: Run dws flex queued TPU workload
|
|
106
|
-
run: xpk workload create --workload xpktest-build-${{ github.run_attempt }}-dws --cluster ${DWS_FLEX_CLUSTER_NAME} --zone=us-east5-a --tpu-type=v5p-8 --flex --command "echo foo" --num-slices=1
|
|
107
|
-
- name: Wait for workload completion and confirm it succeeded
|
|
108
|
-
run: xpk workload list --cluster ${DWS_FLEX_CLUSTER_NAME} --zone=us-east5-a --wait-for-job-completion xpktest-build-${{ github.run_attempt }}-dws --timeout 1000
|
|
109
|
-
- name: Delete the DWS flex queued cluster
|
|
110
|
-
if: always()
|
|
111
|
-
run: xpk cluster delete --cluster ${DWS_FLEX_CLUSTER_NAME} --zone=us-east5-a --force
|
|
112
|
-
- name: Upload DWS cluster nodepool creation log
|
|
113
|
-
if: always()
|
|
114
|
-
uses: actions/upload-artifact@v4
|
|
115
|
-
with:
|
|
116
|
-
name: empty-dws-cluster-nodepool-log-${{github.run_id}}
|
|
117
|
-
path: /tmp/NodepoolCreate-${{ env.DWS_FLEX_CLUSTER_NAME }}-np-*
|
|
118
86
|
|
|
119
87
|
cluster-create-and-delete:
|
|
120
88
|
runs-on: [ubuntu-22.04]
|
|
@@ -122,7 +90,7 @@ jobs:
|
|
|
122
90
|
group: nightly-test-cluster-group-tpu
|
|
123
91
|
cancel-in-progress: false
|
|
124
92
|
env:
|
|
125
|
-
TPU_CLUSTER_NAME: nightly-xpk-2-v5p-8
|
|
93
|
+
TPU_CLUSTER_NAME: nightly-xpk-2-v5p-8
|
|
126
94
|
WORKLOAD_NAME: xpktest-nightly-${{ github.run_attempt }}
|
|
127
95
|
steps:
|
|
128
96
|
- uses: actions/download-artifact@v4
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# Copyright 2025 Google LLC
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# https://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
name: Basic GPU cluster create
|
|
16
|
+
|
|
17
|
+
on:
|
|
18
|
+
workflow_call:
|
|
19
|
+
|
|
20
|
+
permissions:
|
|
21
|
+
contents: read
|
|
22
|
+
|
|
23
|
+
jobs:
|
|
24
|
+
gpu-cluster-create-and-delete:
|
|
25
|
+
runs-on: [ubuntu-22.04]
|
|
26
|
+
concurrency:
|
|
27
|
+
group: nightly-test-cluster-group-gpu
|
|
28
|
+
cancel-in-progress: false
|
|
29
|
+
env:
|
|
30
|
+
GPU_CLUSTER_NAME: nightly-xpk-b200
|
|
31
|
+
WORKLOAD_NAME: xpktest-gpu-nightly-${{ github.run_attempt }}
|
|
32
|
+
steps:
|
|
33
|
+
- uses: actions/download-artifact@v4
|
|
34
|
+
with:
|
|
35
|
+
name: custom-scripts
|
|
36
|
+
- name: Setup environment
|
|
37
|
+
uses: ./.github/actions/setup-test-env
|
|
38
|
+
with:
|
|
39
|
+
credentials_json: "${{ secrets.GCP_SA_KEY }}"
|
|
40
|
+
- name: Check xpk installation
|
|
41
|
+
run: xpk version
|
|
42
|
+
- name: 'Setup Service Account for XPK'
|
|
43
|
+
run: |
|
|
44
|
+
# 1. Clear any existing WIF configurations to avoid conflicts
|
|
45
|
+
rm -rf $HOME/.config/gcloud
|
|
46
|
+
mkdir -p $HOME/.config/gcloud
|
|
47
|
+
|
|
48
|
+
# 2. Write the Key File
|
|
49
|
+
echo '${{ secrets.GCP_SA_KEY }}' > $HOME/.config/gcloud/application_default_credentials.json
|
|
50
|
+
|
|
51
|
+
# 3. Activate the Service Account
|
|
52
|
+
# This updates the internal config files to point to the key file.
|
|
53
|
+
# When Docker mounts the directory, it will now see "Active Account: Service Account"
|
|
54
|
+
gcloud auth activate-service-account --key-file=$HOME/.config/gcloud/application_default_credentials.json --project=cloud-tpu-multipod-dev
|
|
55
|
+
|
|
56
|
+
# 4. Set Env Var for the host (GitHub Runner)
|
|
57
|
+
echo "GOOGLE_APPLICATION_CREDENTIALS=$HOME/.config/gcloud/application_default_credentials.json" >> $GITHUB_ENV
|
|
58
|
+
- name: Create an XPK Cluster with 1 x b200 GPU
|
|
59
|
+
run: xpk cluster create --cluster $GPU_CLUSTER_NAME --device-type=b200-8 --zone=asia-northeast1-b --default-pool-cpu-machine-type=n1-standard-16 --spot
|
|
60
|
+
- name: Authenticate Docker
|
|
61
|
+
run: gcloud auth configure-docker --quiet
|
|
62
|
+
- name: Run a base-docker-image workload
|
|
63
|
+
run: xpk workload create --cluster $GPU_CLUSTER_NAME --workload $WORKLOAD_NAME --docker-image='nvidia/cuda:12.1.0-base-ubuntu22.04' --command "nvidia-smi" --zone=asia-northeast1-b --device-type=b200-8
|
|
64
|
+
- name: List out the workloads on the cluster
|
|
65
|
+
run: xpk workload list --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b
|
|
66
|
+
- name: Wait for workload completion and confirm it succeeded
|
|
67
|
+
run: xpk workload list --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b --wait-for-job-completion $WORKLOAD_NAME --timeout 600
|
|
68
|
+
- name: Delete the workload on the cluster
|
|
69
|
+
run: xpk workload delete --workload $WORKLOAD_NAME --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b
|
|
70
|
+
- name: Delete the cluster created
|
|
71
|
+
if: always()
|
|
72
|
+
run: xpk cluster delete --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b --force
|
|
73
|
+
- name: Upload cluster nodepool creation log
|
|
74
|
+
if: always()
|
|
75
|
+
uses: actions/upload-artifact@v4
|
|
76
|
+
with:
|
|
77
|
+
name: gpu-cluster-nodepool-log-${{github.run_id}}
|
|
78
|
+
path: /tmp/NodepoolCreate-${{ env.GPU_CLUSTER_NAME }}-np-*
|
|
@@ -16,8 +16,8 @@ name: Nightly Tests
|
|
|
16
16
|
|
|
17
17
|
on:
|
|
18
18
|
workflow_dispatch:
|
|
19
|
-
schedule: # Schedule the job run at
|
|
20
|
-
- cron: "0
|
|
19
|
+
schedule: # Schedule the job run at 6AM UTC daily.
|
|
20
|
+
- cron: "0 6 * * *"
|
|
21
21
|
|
|
22
22
|
permissions:
|
|
23
23
|
contents: read
|
|
@@ -32,6 +32,11 @@ jobs:
|
|
|
32
32
|
uses: ./.github/workflows/integration_basic_cluster_create.yaml
|
|
33
33
|
secrets: inherit
|
|
34
34
|
|
|
35
|
+
gpu_cluster_create:
|
|
36
|
+
needs: [build_actions, build_wheel]
|
|
37
|
+
uses: ./.github/workflows/integration_gpu_cluster_create.yaml
|
|
38
|
+
secrets: inherit
|
|
39
|
+
|
|
35
40
|
pathways_cluster_create:
|
|
36
41
|
needs: [build_actions, build_wheel]
|
|
37
42
|
uses: ./.github/workflows/integration_pathways_cluster_create.yaml
|
|
@@ -41,10 +46,6 @@ jobs:
|
|
|
41
46
|
needs: [build_actions, build_wheel]
|
|
42
47
|
uses: ./.github/workflows/integration_ray_cluster_create.yaml
|
|
43
48
|
secrets: inherit
|
|
44
|
-
legacy_integration:
|
|
45
|
-
needs: [build_actions, build_wheel]
|
|
46
|
-
uses: ./.github/workflows/integration_legacy_tests.yaml
|
|
47
|
-
secrets: inherit
|
|
48
49
|
storage-tests:
|
|
49
50
|
needs: [build_actions, build_wheel]
|
|
50
51
|
uses: ./.github/workflows/integration_storage_tests.yaml
|
|
@@ -38,7 +38,7 @@ jobs:
|
|
|
38
38
|
key: xpk-deps-3.10-${{github.run_id}}-${{github.run_attempt}}
|
|
39
39
|
restore-keys: xpk-deps-3.10-
|
|
40
40
|
- name: Verify goldens
|
|
41
|
-
run:
|
|
41
|
+
run: python3 tools/recipes.py golden recipes/*.md
|
|
42
42
|
env:
|
|
43
43
|
UPDATE_GOLDEN_COMMAND: make goldens
|
|
44
44
|
XPK_VERSION_OVERRIDE: v0.0.0
|
|
@@ -12,11 +12,10 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
name: 'Close stale issues and PRs'
|
|
15
|
+
name: "Close stale issues and PRs"
|
|
17
16
|
on:
|
|
18
17
|
schedule:
|
|
19
|
-
- cron:
|
|
18
|
+
- cron: "30 1 * * *"
|
|
20
19
|
|
|
21
20
|
jobs:
|
|
22
21
|
stale:
|
|
@@ -24,7 +23,8 @@ jobs:
|
|
|
24
23
|
steps:
|
|
25
24
|
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
|
|
26
25
|
with:
|
|
27
|
-
|
|
26
|
+
days-before-issue-stale: -1
|
|
27
|
+
stale-pr-message: "This pull request is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 7 days."
|
|
28
28
|
days-before-pr-stale: 30
|
|
29
29
|
days-before-pr-close: 7
|
|
30
30
|
operations-per-run: 100
|
{xpk-1.0.0 → xpk-1.1.1}/Makefile
RENAMED
|
@@ -1,12 +1,7 @@
|
|
|
1
|
-
KUEUE_REPO=https://github.com/kubernetes-sigs/kueue.git
|
|
2
|
-
|
|
3
|
-
KUBECTL_VERSION := $(shell curl -L -s https://dl.k8s.io/release/stable.txt)
|
|
4
|
-
KUEUE_VERSION=v0.14.3
|
|
5
|
-
|
|
6
1
|
OS := $(shell uname -s | tr A-Z a-z)
|
|
7
2
|
PLATFORM := $(shell uname -m | sed -e 's/aarch64/arm64/' | sed -e 's/x86_64/amd64/')
|
|
8
3
|
|
|
9
|
-
|
|
4
|
+
KUEUE_VERSION=v0.15.2
|
|
10
5
|
KUEUECTL_URL = "https://github.com/kubernetes-sigs/kueue/releases/download/$(KUEUE_VERSION)/kubectl-kueue-$(OS)-$(PLATFORM)"
|
|
11
6
|
|
|
12
7
|
PROJECT_DIR := $(realpath $(shell dirname $(firstword $(MAKEFILE_LIST))))
|
|
@@ -34,12 +29,9 @@ install-pytest:
|
|
|
34
29
|
run-unittests:
|
|
35
30
|
XPK_TESTER=false XPK_VERSION_OVERRIDE=v0.0.0 pytest -vv src/xpk/
|
|
36
31
|
|
|
37
|
-
run-integrationtests:
|
|
38
|
-
XPK_TESTER=false XPK_VERSION_OVERRIDE=v0.0.0 pytest src/integration/
|
|
39
|
-
|
|
40
32
|
.PHONY: goldens
|
|
41
33
|
goldens:
|
|
42
|
-
XPK_TESTER=false XPK_VERSION_OVERRIDE=v0.0.0
|
|
34
|
+
XPK_TESTER=false XPK_VERSION_OVERRIDE=v0.0.0 python3 tools/recipes.py update recipes/*.md
|
|
43
35
|
|
|
44
36
|
.PHONY: mkdir-bin
|
|
45
37
|
mkdir-bin:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
5
|
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -93,28 +93,41 @@ XPK supports a variety of hardware accelerators.
|
|
|
93
93
|
|
|
94
94
|
XPK also supports the following [Google Cloud Storage solutions](./docs/usage/storage.md):
|
|
95
95
|
|
|
96
|
-
| Storage Type | Documentation
|
|
97
|
-
|
|
98
|
-
| Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse)
|
|
99
|
-
| Filestore | [docs](./docs/usage/storage.md#filestore)
|
|
100
|
-
| Parallelstore | [docs](./docs/usage/storage.md#parallelstore)
|
|
101
|
-
| Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk)
|
|
96
|
+
| Storage Type | Documentation |
|
|
97
|
+
| ------------------------------------------ | ----------------------------------------------------------------------- |
|
|
98
|
+
| Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse) |
|
|
99
|
+
| Filestore | [docs](./docs/usage/storage.md#filestore) |
|
|
100
|
+
| Parallelstore | [docs](./docs/usage/storage.md#parallelstore) |
|
|
101
|
+
| Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk) |
|
|
102
102
|
|
|
103
103
|
# Documentation
|
|
104
104
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
105
|
+
- [Permissions](./docs/permissions.md)
|
|
106
|
+
- [Installation](./docs/installation.md)
|
|
107
|
+
- Usage:
|
|
108
|
+
- [Clusters](./docs/usage/clusters.md)
|
|
109
|
+
- [GPU](./docs/usage/gpu.md)
|
|
110
|
+
- [CPU](./docs/usage/cpu.md)
|
|
111
|
+
- [Autoprovisioning](./docs/usage/autoprovisioning.md)
|
|
112
|
+
- [Workloads](./docs/usage/workloads.md)
|
|
113
|
+
- [Docker](./docs/usage/docker.md)
|
|
114
|
+
- [Storage](./docs/usage/storage.md)
|
|
115
|
+
- [Advanced](./docs/usage/advanced.md)
|
|
116
|
+
- [Inspector](./docs/usage/inspector.md)
|
|
117
|
+
- [Troubleshooting](./docs/troubleshooting.md)
|
|
118
|
+
|
|
119
|
+
# Dependencies
|
|
120
|
+
|
|
121
|
+
| Dependency | When used |
|
|
122
|
+
| ------------------------------------------------------------------------------------------------------------ | --------------------------- |
|
|
123
|
+
| [Google Cloud SDK (gcloud)](https://cloud.google.com/sdk/docs/install) | _always_ |
|
|
124
|
+
| [kubectl](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) | _always_ |
|
|
125
|
+
| [ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit) | Provisioning GPU clusters |
|
|
126
|
+
| [Kueue](https://github.com/kubernetes-sigs/kueue) | Scheduling workloads |
|
|
127
|
+
| [JobSet](https://github.com/kubernetes-sigs/jobset) | Workload creation |
|
|
128
|
+
| [Docker](https://docs.docker.com/engine/install/) | Building workload container |
|
|
129
|
+
| [CoreDNS](https://github.com/coredns/deployment/tree/master/kubernetes) | Cluster set up |
|
|
130
|
+
| [PathwaysJob](https://github.com/google/pathways-job) | Running Pathways workloads |
|
|
118
131
|
|
|
119
132
|
# Privacy notice
|
|
120
133
|
|
|
@@ -129,11 +142,14 @@ XPK telemetry overall is handled in accordance with the [Google Privacy Policy](
|
|
|
129
142
|
you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
|
|
130
143
|
[Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
|
|
131
144
|
|
|
132
|
-
|
|
133
145
|
# Contributing
|
|
134
146
|
|
|
135
147
|
Please read [`contributing.md`](./docs/contributing.md) for details on our code of conduct, and the process for submitting pull requests to us.
|
|
136
148
|
|
|
149
|
+
# Get involved
|
|
150
|
+
|
|
151
|
+
We'd love to hear from you! If you have questions or want to discuss ideas, join us on [GitHub Discussions](https://github.com/AI-Hypercomputer/xpk/discussions). Found a bug or have a feature request? Please let us know on [GitHub Issues](https://github.com/AI-Hypercomputer/xpk/issues).
|
|
152
|
+
|
|
137
153
|
# License
|
|
138
154
|
|
|
139
155
|
This project is licensed under the Apache License 2.0 - see the [`LICENSE`](./LICENSE) file for details.
|
|
@@ -52,28 +52,41 @@ XPK supports a variety of hardware accelerators.
|
|
|
52
52
|
|
|
53
53
|
XPK also supports the following [Google Cloud Storage solutions](./docs/usage/storage.md):
|
|
54
54
|
|
|
55
|
-
| Storage Type | Documentation
|
|
56
|
-
|
|
57
|
-
| Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse)
|
|
58
|
-
| Filestore | [docs](./docs/usage/storage.md#filestore)
|
|
59
|
-
| Parallelstore | [docs](./docs/usage/storage.md#parallelstore)
|
|
60
|
-
| Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk)
|
|
55
|
+
| Storage Type | Documentation |
|
|
56
|
+
| ------------------------------------------ | ----------------------------------------------------------------------- |
|
|
57
|
+
| Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse) |
|
|
58
|
+
| Filestore | [docs](./docs/usage/storage.md#filestore) |
|
|
59
|
+
| Parallelstore | [docs](./docs/usage/storage.md#parallelstore) |
|
|
60
|
+
| Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk) |
|
|
61
61
|
|
|
62
62
|
# Documentation
|
|
63
63
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
64
|
+
- [Permissions](./docs/permissions.md)
|
|
65
|
+
- [Installation](./docs/installation.md)
|
|
66
|
+
- Usage:
|
|
67
|
+
- [Clusters](./docs/usage/clusters.md)
|
|
68
|
+
- [GPU](./docs/usage/gpu.md)
|
|
69
|
+
- [CPU](./docs/usage/cpu.md)
|
|
70
|
+
- [Autoprovisioning](./docs/usage/autoprovisioning.md)
|
|
71
|
+
- [Workloads](./docs/usage/workloads.md)
|
|
72
|
+
- [Docker](./docs/usage/docker.md)
|
|
73
|
+
- [Storage](./docs/usage/storage.md)
|
|
74
|
+
- [Advanced](./docs/usage/advanced.md)
|
|
75
|
+
- [Inspector](./docs/usage/inspector.md)
|
|
76
|
+
- [Troubleshooting](./docs/troubleshooting.md)
|
|
77
|
+
|
|
78
|
+
# Dependencies
|
|
79
|
+
|
|
80
|
+
| Dependency | When used |
|
|
81
|
+
| ------------------------------------------------------------------------------------------------------------ | --------------------------- |
|
|
82
|
+
| [Google Cloud SDK (gcloud)](https://cloud.google.com/sdk/docs/install) | _always_ |
|
|
83
|
+
| [kubectl](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) | _always_ |
|
|
84
|
+
| [ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit) | Provisioning GPU clusters |
|
|
85
|
+
| [Kueue](https://github.com/kubernetes-sigs/kueue) | Scheduling workloads |
|
|
86
|
+
| [JobSet](https://github.com/kubernetes-sigs/jobset) | Workload creation |
|
|
87
|
+
| [Docker](https://docs.docker.com/engine/install/) | Building workload container |
|
|
88
|
+
| [CoreDNS](https://github.com/coredns/deployment/tree/master/kubernetes) | Cluster set up |
|
|
89
|
+
| [PathwaysJob](https://github.com/google/pathways-job) | Running Pathways workloads |
|
|
77
90
|
|
|
78
91
|
# Privacy notice
|
|
79
92
|
|
|
@@ -88,11 +101,14 @@ XPK telemetry overall is handled in accordance with the [Google Privacy Policy](
|
|
|
88
101
|
you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
|
|
89
102
|
[Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
|
|
90
103
|
|
|
91
|
-
|
|
92
104
|
# Contributing
|
|
93
105
|
|
|
94
106
|
Please read [`contributing.md`](./docs/contributing.md) for details on our code of conduct, and the process for submitting pull requests to us.
|
|
95
107
|
|
|
108
|
+
# Get involved
|
|
109
|
+
|
|
110
|
+
We'd love to hear from you! If you have questions or want to discuss ideas, join us on [GitHub Discussions](https://github.com/AI-Hypercomputer/xpk/discussions). Found a bug or have a feature request? Please let us know on [GitHub Issues](https://github.com/AI-Hypercomputer/xpk/issues).
|
|
111
|
+
|
|
96
112
|
# License
|
|
97
113
|
|
|
98
114
|
This project is licensed under the Apache License 2.0 - see the [`LICENSE`](./LICENSE) file for details.
|
|
@@ -47,36 +47,57 @@ A crucial aspect of effective unit testing is isolation. A unit test should only
|
|
|
47
47
|
|
|
48
48
|
A good, state-of-the-art sample of [code](https://github.com/AI-Hypercomputer/xpk/blob/0434cf6a023069522f90d5846c6d980b68382b66/src/xpk/core/nodepool.py#L614) that has been correctly covered with unit tests can be found [here](https://github.com/AI-Hypercomputer/xpk/blob/8464ce26cd0fd24c681e346b2c915ad918724e53/src/xpk/core/nodepool_test.py#L26). This example serves as a practical guide and "source of truth" for developers, demonstrating best practices in unit test structure, such as naming. Another sample, leveraging mocks, can be found [here](https://github.com/AI-Hypercomputer/xpk/blob/8464ce26cd0fd24c681e346b2c915ad918724e53/src/xpk/core/nodepool_test.py#L86).
|
|
49
49
|
|
|
50
|
-
## Golden
|
|
50
|
+
## Golden Recipes
|
|
51
51
|
|
|
52
|
-
Golden
|
|
52
|
+
Golden recipes encompass a broad scope within XPK, effectively covering entire user journeys. Their primary objective is to orchestrate multiple commands to achieve a high-level goal, simulating a real user interacting with the system. They also serve as regression tests by asserting on the output of each step, ensuring that the user experience remains consistent. These tests are executed on feature branches and serve as the main tool for raising awareness, enabling developers to thoroughly double-check changes across various complex scenarios and understand their potential impact.
|
|
53
53
|
|
|
54
54
|
### Naming Conventions
|
|
55
55
|
|
|
56
|
-
Each Golden
|
|
56
|
+
Each Golden recipe file in the `recipes` directory corresponds to a specific use case or persona utilizing the system. The filename should clearly indicate the scenario, for example `NAP_cluster-create_with_pathways.md` or `Cluster_create_with_Managed_Lustre_driver.md`.
|
|
57
57
|
|
|
58
|
-
|
|
59
|
-
|
|
58
|
+
### Developer guide to Golden Recipes
|
|
59
|
+
|
|
60
|
+
All golden recipes are located in the `recipes` directory. Each recipe is a Markdown file that describes the user journey and contains the sequence of commands to be executed.
|
|
61
|
+
|
|
62
|
+
A sample structure of a recipe file is:
|
|
60
63
|
|
|
61
|
-
|
|
64
|
+
```markdown
|
|
65
|
+
# Recipe Title
|
|
62
66
|
|
|
63
|
-
|
|
64
|
-
All golden tests are registered in the `goldens.yaml` file in the root directory. Their reference output is stored in text files located in goldens directory in the root directory.
|
|
67
|
+
Description of the recipe.
|
|
65
68
|
|
|
66
|
-
|
|
69
|
+
## Step 1: Create Cluster
|
|
70
|
+
\`\`\`shell #golden
|
|
71
|
+
xpk cluster create ...
|
|
72
|
+
\`\`\`
|
|
73
|
+
<!--
|
|
74
|
+
Expected output block
|
|
75
|
+
-->
|
|
67
76
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
77
|
+
## Step 2: Submit Workload
|
|
78
|
+
\`\`\`shell #golden
|
|
79
|
+
xpk workload create ...
|
|
80
|
+
\`\`\`
|
|
81
|
+
<!--
|
|
82
|
+
Expected output block
|
|
83
|
+
-->
|
|
73
84
|
```
|
|
74
85
|
|
|
75
|
-
|
|
86
|
+
Recipe files are self-contained, storing both the commands and their expected golden outputs in comment blocks. The recipe executor runs these commands in order, maintaining state between them (e.g. environment variables).
|
|
76
87
|
|
|
77
88
|
### Underlying execution mechanisms
|
|
78
89
|
|
|
79
|
-
These tests are executed through the
|
|
90
|
+
These tests are executed through the `tools/recipes.py` script. The framework executes the sequence of commands in `dry_run` mode (by injecting a mock `xpk` function) and compares the output of each step with the expected output stored in the recipe file.
|
|
91
|
+
|
|
92
|
+
**Usage:**
|
|
93
|
+
|
|
94
|
+
* **Regenerate Goldens:** `make goldens`
|
|
95
|
+
* This is the primary command for developers. It executes all recipes in `update` mode, regenerating the golden outputs. Run this after making changes to the code or adding new recipes.
|
|
96
|
+
|
|
97
|
+
* **Advanced:** *(These commands are primarily used by CI/CD pipelines or for debugging specific scenarios.)*
|
|
98
|
+
* **Verification:** `python3 tools/recipes.py golden <file or files>` (Verifies outputs match without updating)
|
|
99
|
+
* **Integration Run:** `python3 tools/recipes.py run <file or files>` (Executes commands for real)
|
|
100
|
+
* **Selective Update:** `python3 tools/recipes.py update <file or files>` (Updates a specific recipe)
|
|
80
101
|
|
|
81
102
|
## Integration Test
|
|
82
103
|
Integration tests sit at the apex of the testing pyramid, being the most expensive and slowest to execute. This is primarily because they rely on actual Google Cloud Platform (GCP) infrastructure, which introduces potential flakiness due to external factors and makes them challenging to write given capacity constraints. Consequently, these tests should be reserved for ultimate verification before release, ensuring all of XPK's components function seamlessly together within a real GCP environment. They are not run on feature branches; instead, they are executed on the mainline (`main`) branch nightly after code merges, and right before a release to validate a new XPK release candidate. This strategic placement ensures a final, comprehensive check of the entire system's functionality in its production-like setting.
|
|
@@ -51,7 +51,7 @@ all zones.
|
|
|
51
51
|
--num-slices=4 --on-demand
|
|
52
52
|
```
|
|
53
53
|
|
|
54
|
-
* Cluster Create (provision spot /
|
|
54
|
+
* Cluster Create (provision spot / preemptible capacity):
|
|
55
55
|
|
|
56
56
|
```shell
|
|
57
57
|
xpk cluster create \
|
|
@@ -274,7 +274,7 @@ xpk cluster create-pathways \
|
|
|
274
274
|
--managed-mldiagnostics
|
|
275
275
|
```
|
|
276
276
|
|
|
277
|
-
* Cluster Create (provision spot /
|
|
277
|
+
* Cluster Create (provision spot / preemptible capacity) with flag **--managed-mldiagnostics**:
|
|
278
278
|
|
|
279
279
|
```shell
|
|
280
280
|
xpk cluster create \
|
|
@@ -29,14 +29,10 @@ Before you start, complete the following steps:
|
|
|
29
29
|
|
|
30
30
|
### Create a single-NIC, single slice cluster
|
|
31
31
|
|
|
32
|
-
Currently flex start provisioning for Ironwood works only in single slice and multi-host or multi-slice and single host setups. More options will be added soon
|
|
33
|
-
|
|
34
32
|
1. Set the following environment variables:
|
|
35
33
|
|
|
36
34
|
> **NOTE:** For multi-host provisioning use an ACCELERATOR_TYPE with any topology that results in more than 8 chips, e.g. `tpu7x-2x2x2` or `tpu7x-16`. For single-host provisioning use an ACCELERATOR_TYPE with any topology that results in 8 or fewer chips, e.g. `tpu7x-2x2x1` or `tpu7x-8`.
|
|
37
35
|
|
|
38
|
-
> **NOTE:** Single-host provisioning is not supported for single-slice. If you want to create a single-host cluster, you need to set `--num-slices` to 2 or higher on the `xpk cluster create` command.
|
|
39
|
-
|
|
40
36
|
```shell
|
|
41
37
|
export PROJECT_ID=<project_id> # Your GCP project name
|
|
42
38
|
export ZONE=<zone> # Example: us-central1-c
|
|
@@ -29,14 +29,10 @@ Before you start, complete the following steps:
|
|
|
29
29
|
|
|
30
30
|
### Create a single-NIC, single slice cluster
|
|
31
31
|
|
|
32
|
-
Currently flex start provisioning for Ironwood works only in single slice and multi-host or multi-slice and single host setups. More options will be added soon
|
|
33
|
-
|
|
34
32
|
1. Set the following environment variables:
|
|
35
33
|
|
|
36
34
|
> **NOTE:** For multi-host provisioning use an ACCELERATOR_TYPE with any topology that results in more than 8 chips, e.g. `tpu7x-2x2x2` or `tpu7x-16`. For single-host provisioning use an ACCELERATOR_TYPE with any topology that results in 8 or fewer chips, e.g. `tpu7x-2x2x1` or `tpu7x-8`.
|
|
37
35
|
|
|
38
|
-
> **NOTE:** Single-host provisioning is not supported for single-slice. If you want to create a single-host cluster, you need to set `--num-slices` to 2 or higher on the `xpk cluster create` command.
|
|
39
|
-
|
|
40
36
|
```shell
|
|
41
37
|
export PROJECT_ID=<project_id> # Your GCP project name
|
|
42
38
|
export ZONE=<zone> # Example: us-central1-c
|