xpk 0.17.2__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.17.2 → xpk-1.0.0}/.github/actions/setup-test-env/action.yml +0 -1
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/build_tests.yaml +1 -2
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_basic_cluster_create.yaml +0 -56
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_legacy_tests.yaml +1 -2
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/label-validation.yaml +2 -2
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/nightly_tests.yaml +5 -7
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_goldens.yaml +0 -1
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_integration_tests.yaml +0 -1
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_lint_and_format.yml +0 -1
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_storage_create.yaml +0 -41
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_storage_delete.yaml +0 -3
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_unit_tests.yaml +0 -1
- {xpk-0.17.2 → xpk-1.0.0}/Makefile +2 -16
- {xpk-0.17.2/src/xpk.egg-info → xpk-1.0.0}/PKG-INFO +15 -4
- {xpk-0.17.2 → xpk-1.0.0}/README.md +14 -3
- {xpk-0.17.2 → xpk-1.0.0}/docs/installation.md +0 -1
- {xpk-0.17.2 → xpk-1.0.0}/docs/troubleshooting.md +1 -1
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/clusters.md +29 -0
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Basic_cluster_create.txt +3 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_for_multi-host_nodepool.txt +3 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_private.txt +5 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_sub-slicing.txt +6 -89
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_super-slicing.txt +5 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt +3 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt +3 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_Managed_Lustre_driver.txt +3 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt +3 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_gb200-4.txt +6 -89
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_create_with_shared_reservation.txt +5 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/NAP_cluster-create.txt +3 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/NAP_cluster-create_with_pathways.txt +3 -88
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_create_super-slicing.txt +3 -3
- {xpk-0.17.2 → xpk-1.0.0}/goldens.yaml +0 -8
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/cluster.py +4 -35
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/cluster_gcluster.py +1 -13
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/cluster_gcluster_test.py +2 -10
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/cluster_test.py +0 -4
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/workload.py +10 -3
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/workload_test.py +1 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/cluster.py +10 -9
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/config.py +5 -17
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/kueue_manager_test.py +2 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/nodepool.py +6 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/nodepool_test.py +4 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/scheduling.py +28 -3
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/scheduling_test.py +38 -1
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/system_characteristics.py +39 -16
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/system_characteristics_test.py +11 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -15
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -8
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -78
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -16
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/common.py +0 -17
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/core.py +0 -39
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/storage.py +0 -11
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/feature_flags.py +1 -1
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/validation.py +0 -8
- {xpk-0.17.2 → xpk-1.0.0/src/xpk.egg-info}/PKG-INFO +15 -4
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk.egg-info/SOURCES.txt +0 -26
- {xpk-0.17.2 → xpk-1.0.0}/tools/install-xpk.sh +0 -4
- xpk-0.17.2/.github/actions/install-kjob/action.yml +0 -35
- xpk-0.17.2/.github/workflows/reusable_build_kjob.yaml +0 -23
- xpk-0.17.2/docs/local_testing.md +0 -61
- xpk-0.17.2/docs/usage/job.md +0 -41
- xpk-0.17.2/docs/usage/run.md +0 -44
- xpk-0.17.2/examples/batch.md +0 -24
- xpk-0.17.2/examples/job.sh +0 -12
- xpk-0.17.2/goldens/Batch.txt +0 -19
- xpk-0.17.2/goldens/Cluster_create_for_single-host_single-slice_TPU.txt +0 -199
- xpk-0.17.2/goldens/Job_cancel.txt +0 -14
- xpk-0.17.2/goldens/Job_info.txt +0 -21
- xpk-0.17.2/goldens/Job_list.txt +0 -14
- xpk-0.17.2/src/xpk/commands/batch.py +0 -144
- xpk-0.17.2/src/xpk/commands/job.py +0 -244
- xpk-0.17.2/src/xpk/commands/kind.py +0 -286
- xpk-0.17.2/src/xpk/commands/kjob_common.py +0 -60
- xpk-0.17.2/src/xpk/commands/run.py +0 -140
- xpk-0.17.2/src/xpk/commands/shell.py +0 -142
- xpk-0.17.2/src/xpk/parser/batch.py +0 -43
- xpk-0.17.2/src/xpk/parser/job.py +0 -147
- xpk-0.17.2/src/xpk/parser/kind.py +0 -95
- xpk-0.17.2/src/xpk/parser/run.py +0 -47
- xpk-0.17.2/src/xpk/parser/shell.py +0 -59
- xpk-0.17.2/tools/Dockerfile-kjob +0 -33
- xpk-0.17.2/tools/build-kjob.sh +0 -9
- xpk-0.17.2/xpk-slurm-commands.md +0 -382
- {xpk-0.17.2 → xpk-1.0.0}/.dockerignore +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/CODEOWNERS +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/actions/install-kueue/action.yml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/release.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/README.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/build_wheels.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/cleanup.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/gemini-dispatch.yml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/gemini-invoke.yml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/gemini-review.yml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/gemini-scheduled-triage.yml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/gemini-triage.yml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_pathways_cluster_create.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_ray_cluster_create.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_storage_tests.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/periodic_release.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/release_branch_versioning.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_build_scripts.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/reusable_build_wheel.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.github/workflows/stale.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.gitignore +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/.pre-commit-config.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/LICENSE +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/backoff_retry.sh +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/data/Dockerfile +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/code-of-conduct.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/contributing.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/permissions.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/testing.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/advanced.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/autoprovisioning.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/cpu.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/docker.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/gpu.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/inspector.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/storage.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/tpu7x/clusters.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/tpu7x/workloads.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/docs/usage/workloads.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/fake_training.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/llama-3.1-finetuning/check_cuda.sh +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/llama-3.1-finetuning/requirements.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/llama-3.1-finetuning/train.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/llama-3.1-finetuning/train.slurm +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/llama-3.1-finetuning/training_data.jsonl +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/nccl/nccl-a3mega.sh +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/nccl/nccl-a3ultra.sh +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/nccl/nccl.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/storage/filestore-manifest-attach.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/storage/gcsfuse-manifest.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/storage/lustre-manifest-attach.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/storage/parallelstore-manifest-attach.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/examples/storage/pd-manifest-attach.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/golden_buddy.sh +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_delete.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Cluster_delete_force.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Storage_list.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_create.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_create_pathways.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_create_sub-slicing.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_create_with_output-manifest-file.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_delete.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/goldens/Workload_list.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/pylintrc +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/pyproject.toml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/setup.cfg +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/integration/README.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/integration/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/integration/docker_manager_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/integration/gcluster_a3mega_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/integration/gcluster_a3ultra_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/integration/gcluster_a4_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/integration/gcluster_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/api/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3mega/storage_crd.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/nccl-installer.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a3ultra/storage_crd.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a4/config-map.yaml.tftpl +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/blueprints/a4/storage_crd.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/common.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/config.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/info.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/inspector.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/managed_ml_diagnostics.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/managed_ml_diagnostics_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/storage.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/commands/version.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/blueprint_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/testing/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a3_mega.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/blueprint/testing/data/a4.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/capacity.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/capacity_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/cluster_private.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/cluster_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/commands.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/config_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/docker_image.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/docker_manager.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/docker_resources.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/filestore.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/gcloud_context_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/gcsfuse.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/jobset.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/kueue_manager.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/monitoring.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/mtc.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/nap.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/network.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/pathways.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/pathways_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/ray.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/resources.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/storage.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/telemetry.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/telemetry_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/testing/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/testing/commands_tester.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/testing/commands_tester_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/updates.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/updates_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/vertex.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/core/workload_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/main.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/cluster.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/cluster_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/common_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/config.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/info.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/storage_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/validators.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/version.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/workload.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/parser/workload_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/telemetry_uploader.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/filestore-pv.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/filestore-pvc.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/filestore-sc.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/fuse-pv.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/fuse-pvc.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/kueue_config.yaml.j2 +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/kueue_super_slicing_topology.yaml.j2 +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/mtc-cpc.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/console.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/console_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/execution_context.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/file.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/kubectl.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/kueue.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/network.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/objects.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/templates.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/topology.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/topology_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/user_agent.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/user_agent_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/user_input.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/user_input_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/validation_test.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/versions.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk.egg-info/requires.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/src/xpk.egg-info/top_level.txt +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/tools/install-gke-auth-plugin.sh +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/xpk-large-scale-guide.sh +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/xpk-notebooks.md +0 -0
- {xpk-0.17.2 → xpk-1.0.0}/xpk.py +0 -0

{xpk-0.17.2 → xpk-1.0.0}/.github/workflows/build_tests.yaml

@@ -49,14 +49,13 @@ jobs:
         lookup-only: true
     - name: install dependencies
       if : steps.check-cache.outputs.cache-hit != 'true'
-      run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue
+      run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue
     - name: Cache dependencies
       if : steps.check-cache.outputs.cache-hit != 'true'
       uses: actions/cache/save@v3
       with:
         path: |
           /usr/local/bin/kubectl-kueue
-          /usr/local/bin/kubectl-kjob
           ~/.cache/pip
           ${{env.pythonLocation}}
         key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}

{xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_basic_cluster_create.yaml

@@ -152,62 +152,6 @@ jobs:
       run: xpk info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
     - name: Delete the workload on the cluster
       run: xpk workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
-    - name: Create test script to execute in batch
-      run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
-    - name: Run a batch job on the cluster
-      run: xpk batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3
-    - name: List out the jobs on the cluster
-      run: xpk job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-'
-    - name: Get created job name
-      run: |
-        JOB_NAME=$(xpk job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | grep 'multislice-queue' | head -1 | awk '{print $1}')
-        echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
-    - name: Check job spec
-      run: |
-        job_spec=$(kubectl get job ${JOB_NAME} -o jsonpath='{.spec}')
-        echo "$job_spec" | grep '"completions":2'
-        echo "$job_spec" | grep '"parallelism":2'
-        echo "$job_spec" | jq '.template.spec.containers | length' | grep 3
-    - name: Get job info for the last job created on the cluster
-      run: xpk job info ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
-    - name: Cancel the batch job on the cluster
-      run: xpk job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted"
-    - name: Create shell and exit it immediately
-      run: |
-        cat <<EOF > create-shell.exp
-        #!/usr/bin/expect
-        set timeout 180
-        spawn sh -c "xpk shell --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee shell.log"
-        send "\n"
-        expect {
-          "/ # " {
-            send "exit\n"
-            # Wait for EOF after exit
-            expect eof
-            exit 0
-          }
-          timeout {
-            puts "Timed out waiting for pod to be running"
-            exit 1
-          }
-          eof {
-            puts "Unexpected EOF before getting prompt"
-            exit 1
-          }
-        }
-        EOF
-        chmod +x ./create-shell.exp
-        expect ./create-shell.exp
-    - name: Check if shell exists and is running
-      run: |
-        pod_name=$(grep 'waiting for pod' shell.log | awk -F'"' '{print $2}')
-        kubectl wait --for='jsonpath={.status.conditions[?(@.type=="Ready")].status}=True' --timeout=1m pod/${pod_name}
-    - name: Stop the shell
-      run: xpk shell stop --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
-    - name: Delete create-shell.exp file
-      run: rm create-shell.exp
-    - name: Delete shell.log file
-      run: rm shell.log
     - name: Delete the cluster created
       if: always()
       run: xpk cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force

{xpk-0.17.2 → xpk-1.0.0}/.github/workflows/integration_legacy_tests.yaml

@@ -47,14 +47,13 @@ jobs:
         lookup-only: true
     - name: install dependencies
       if: steps.check-cache.outputs.cache-hit != 'true'
-      run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue
+      run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue
    - name: Cache dependencies
       if: steps.check-cache.outputs.cache-hit != 'true'
       uses: actions/cache/save@v3
       with:
         path: |
           /usr/local/bin/kubectl-kueue
-          /usr/local/bin/kubectl-kjob
           ~/.cache/pip
           ${{env.pythonLocation}}
         key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}

{xpk-0.17.2 → xpk-1.0.0}/.github/workflows/label-validation.yaml

@@ -36,8 +36,8 @@ jobs:
       with:
         mode: minimum
         count: 1
-        labels: "release-improvements, release-bugfix, release-features"
-        message: "This PR is being prevented from merging because it is not labeled. Please add a label to this PR. Accepted labels: release-improvements, release-bugfix, release-features"
+        labels: "release-improvements, release-bugfix, release-features, release-breaking"
+        message: "This PR is being prevented from merging because it is not labeled. Please add a label to this PR. Accepted labels: release-improvements, release-bugfix, release-features, release-breaking"
     - id: do-not-merge
       uses: mheap/github-action-required-labels@v5
       with:
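
The check now accepts a fourth label, `release-breaking`, and still requires at least one of the listed labels before a PR can merge. If you use the GitHub CLI, one way to satisfy the check is sketched below; the PR number is a placeholder:

```shell
# Apply one of the accepted release labels to a PR; 123 is a placeholder PR number.
gh pr edit 123 --add-label release-breaking
```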

{xpk-0.17.2 → xpk-1.0.0}/.github/workflows/nightly_tests.yaml

@@ -23,31 +23,29 @@ permissions:
   contents: read
 
 jobs:
-  build_kjob:
-    uses: ./.github/workflows/reusable_build_kjob.yaml
   build_wheel:
     uses: ./.github/workflows/reusable_build_wheel.yaml
   build_actions:
     uses: ./.github/workflows/reusable_build_scripts.yaml
   basic_cluster_create:
-    needs: [
+    needs: [build_actions, build_wheel]
     uses: ./.github/workflows/integration_basic_cluster_create.yaml
     secrets: inherit
 
   pathways_cluster_create:
-    needs: [
+    needs: [build_actions, build_wheel]
     uses: ./.github/workflows/integration_pathways_cluster_create.yaml
     secrets: inherit
 
   ray_cluster_create:
-    needs: [
+    needs: [build_actions, build_wheel]
     uses: ./.github/workflows/integration_ray_cluster_create.yaml
     secrets: inherit
   legacy_integration:
-    needs: [
+    needs: [build_actions, build_wheel]
     uses: ./.github/workflows/integration_legacy_tests.yaml
     secrets: inherit
   storage-tests:
-    needs: [
+    needs: [build_actions, build_wheel]
     uses: ./.github/workflows/integration_storage_tests.yaml
     secrets: inherit
|
|
|
92
92
|
--auto-mount=true --vol=vol1 --mount-point='/${{inputs.storage-type}}-test-mount-point' --readonly=false
|
|
93
93
|
- name: List and verify existing Storages
|
|
94
94
|
run: xpk storage list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | tee output.txt | grep ${{inputs.storage-name}} || (echo 'No storage found' && exit 143)
|
|
95
|
-
- name: Verify VolumeBundle created
|
|
96
|
-
run: kubectl get volumebundle ${{inputs.storage-name}} -o jsonpath='{.spec.containerVolumeMounts[0].mountPath}' | grep '/${{inputs.storage-type}}-test-mount-point'
|
|
97
95
|
- name: Verify Persistent Volume mount options
|
|
98
96
|
if: inputs.storage-command == 'attach' && inputs.storage-type == 'gcsfuse'
|
|
99
97
|
run: kubectl get pv ${{inputs.storage-name}}-pv -oyaml | grep rename-dir-limit=10000 || (echo 'Invalid storage mount options' && exit 143)
|
|
@@ -114,45 +112,6 @@ jobs:
|
|
|
114
112
|
run: xpk workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $STORAGE_READ_WORKLOAD --timeout 300
|
|
115
113
|
- name: Delete the reader workload on the cluster
|
|
116
114
|
run: xpk workload delete --workload $STORAGE_READ_WORKLOAD --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
|
|
117
|
-
- name: Create batch-read.sh script
|
|
118
|
-
run: |
|
|
119
|
-
cat <<EOF > batch-read.sh
|
|
120
|
-
#!/bin/bash
|
|
121
|
-
grep 'Test text message' /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)
|
|
122
|
-
EOF
|
|
123
|
-
- name: Run a batch-read job on the cluster
|
|
124
|
-
run: xpk batch --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} batch-read.sh | tee batch-read.log
|
|
125
|
-
- name: Get job name
|
|
126
|
-
run: |
|
|
127
|
-
cat batch-read.log | grep 'xpk-def-app-profile-slurm-'
|
|
128
|
-
READ_JOB_NAME=$(grep 'Job name: xpk-def-app-profile-slurm-' batch-read.log | awk -F': ' '{print $2}')
|
|
129
|
-
echo "READ_JOB_NAME=${READ_JOB_NAME}" >> $GITHUB_ENV
|
|
130
|
-
- name: Wait for the batch-read job to finish
|
|
131
|
-
run: kubectl wait job.batch/$READ_JOB_NAME --for=condition=Complete --timeout=1m
|
|
132
|
-
- name: Cancel the batch-read job
|
|
133
|
-
run: xpk job cancel $READ_JOB_NAME --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} | grep "job.batch/$READ_JOB_NAME deleted"
|
|
134
|
-
- name: Delete batch-read.log file
|
|
135
|
-
run: rm batch-read.log
|
|
136
|
-
- name: Run a run-read job on the cluster
|
|
137
|
-
run: xpk run --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} batch-read.sh --timeout 60
|
|
138
|
-
- name: Delete batch-read.sh file
|
|
139
|
-
run: rm batch-read.sh
|
|
140
|
-
- name: Create shell and exit it immediately
|
|
141
|
-
run: |
|
|
142
|
-
cat <<EOF >> create-shell.exp
|
|
143
|
-
##!/usr/bin/expect
|
|
144
|
-
spawn xpk shell --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
|
|
145
|
-
expect "/ # "
|
|
146
|
-
send "cat /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt\n"
|
|
147
|
-
expect "Test text message"
|
|
148
|
-
send "exit\n"
|
|
149
|
-
EOF
|
|
150
|
-
chmod +x ./create-shell.exp
|
|
151
|
-
expect ./create-shell.exp
|
|
152
|
-
- name: Stop the shell
|
|
153
|
-
run: xpk shell stop --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}}
|
|
154
|
-
- name: Delete create-shell.exp file
|
|
155
|
-
run: rm create-shell.exp
|
|
156
115
|
- name: Run workload to delete file on filestore
|
|
157
116
|
run : xpk workload create --workload $STORAGE_DELETE_WORKLOAD --command "rm -rf /${{inputs.storage-type}}-test-mount-point/$RANDOM_SEED/test.txt || exit 143" --num-slices=1 --cluster ${{inputs.cluster-name}} --device-type=${{inputs.device-type}} --zone ${{inputs.zone}}
|
|
158
117
|
- name: Wait for delete workload completion and confirm it succeeded
|
|
@@ -61,9 +61,6 @@ jobs:
|
|
|
61
61
|
- name: Detach storage volumes
|
|
62
62
|
if: always()
|
|
63
63
|
run: xpk storage detach ${{inputs.storage-name}} --cluster=${{inputs.cluster-name}} --zone=${{inputs.zone}}
|
|
64
|
-
- name: Verify VolumeBundle deleted
|
|
65
|
-
run: |
|
|
66
|
-
! kubectl get volumebundle | grep ${{inputs.storage-name}}
|
|
67
64
|
- name: Delete GCP Filestore Storage instance
|
|
68
65
|
if: always() && inputs.storage-command == 'delete'
|
|
69
66
|
run: xpk storage delete ${{inputs.storage-name}} --cluster=${{inputs.cluster-name}} --zone=${{inputs.zone}}
|
|

{xpk-0.17.2 → xpk-1.0.0}/Makefile

@@ -2,25 +2,21 @@ KUEUE_REPO=https://github.com/kubernetes-sigs/kueue.git
 
 KUBECTL_VERSION := $(shell curl -L -s https://dl.k8s.io/release/stable.txt)
 KUEUE_VERSION=v0.14.3
-KJOB_VERSION=v0.1.0
 
 OS := $(shell uname -s | tr A-Z a-z)
 PLATFORM := $(shell uname -m | sed -e 's/aarch64/arm64/' | sed -e 's/x86_64/amd64/')
 
 KUBECTL_URL = "https://dl.k8s.io/release/$(KUBECTL_VERSION)/bin/$(OS)/$(PLATFORM)/kubectl"
 KUEUECTL_URL = "https://github.com/kubernetes-sigs/kueue/releases/download/$(KUEUE_VERSION)/kubectl-kueue-$(OS)-$(PLATFORM)"
-KJOBCTL_URL = "https://github.com/kubernetes-sigs/kjob/releases/download/$(KJOB_VERSION)/kubectl-kjob-$(OS)-$(PLATFORM)"
 
 PROJECT_DIR := $(realpath $(shell dirname $(firstword $(MAKEFILE_LIST))))
-KJOB_DOCKER_IMG := xpk_kjob
-KJOB_DOCKER_CONTAINER := xpk_kjob_container
 BIN_PATH=$(PROJECT_DIR)/bin
 
 .PHONY: install
-install: check-python check-gcloud install-gcloud-auth-plugin install-kueuectl
+install: check-python check-gcloud install-gcloud-auth-plugin install-kueuectl pip-install
 
 .PHONY: install-dev
-install-dev: check-python check-gcloud mkdir-bin install-kueuectl
+install-dev: check-python check-gcloud mkdir-bin install-kueuectl pip-install pip-install-dev install-pytest install-lint
 
 .PHONY: pip-install-dev
 pip-install-dev:

@@ -54,16 +50,6 @@ install-kueuectl: mkdir-bin
 	curl -Lo $(BIN_PATH)/kubectl-kueue $(KUEUECTL_URL);
 	chmod +x $(BIN_PATH)/kubectl-kueue;
 
-.PHONY: install-kjobctl
-install-kjobctl: mkdir-bin
-	#curl -Lo $(BIN_PATH)/kubectl-kjob $(KJOBCTL_URL)
-	#chmod +x $(BIN_PATH)/kubectl-kjob
-	# TODO: Switch to kjob release-based installation once version >=0.2.0 is available.
-	chmod +x tools/build-kjob.sh
-	./tools/build-kjob.sh
-	mv kubectl-kjob $(BIN_PATH)/kubectl-kjob
-	chmod +x $(BIN_PATH)/kubectl-kjob
-
 .PHONY: install-gcloud-auth-plugin
 install-gcloud-auth-plugin:
 	chmod +x tools/install-gke-auth-plugin.sh
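
Because `pip-install` is now chained into `install`, and `install-dev` additionally pulls in `pip-install-dev`, `install-pytest`, and `install-lint`, a single make invocation covers the Python package as well. A minimal sketch, assuming it is run from the repository root:

```shell
# End-user setup: gcloud auth plugin, kueuectl, and the xpk Python package.
make install

# Contributor setup: adds dev dependencies plus pytest and lint tooling.
make install-dev
```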

{xpk-0.17.2/src/xpk.egg-info → xpk-1.0.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 0.17.2
+Version: 1.0.0
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0

@@ -114,10 +114,21 @@ XPK also supports the following [Google Cloud Storage solutions](./docs/usage/st
 * [Storage](./docs/usage/storage.md)
 * [Advanced](./docs/usage/advanced.md)
 * [Inspector](./docs/usage/inspector.md)
-* [Run](./docs/usage/run.md)
-* [Job](./docs/usage/job.md)
 * [Troubleshooting](./docs/troubleshooting.md)
-
+
+# Privacy notice
+
+To help improve XPK, feature usage statistics are collected and sent to Google. You can opt-out at any time by executing
+the following shell command:
+
+```shell
+xpk config set send-telemetry <true/false>
+```
+
+XPK telemetry overall is handled in accordance with the [Google Privacy Policy](https://policies.google.com/privacy). When
+you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
+[Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
+
 
 # Contributing
 

{xpk-0.17.2 → xpk-1.0.0}/README.md

@@ -73,10 +73,21 @@ XPK also supports the following [Google Cloud Storage solutions](./docs/usage/st
 * [Storage](./docs/usage/storage.md)
 * [Advanced](./docs/usage/advanced.md)
 * [Inspector](./docs/usage/inspector.md)
-* [Run](./docs/usage/run.md)
-* [Job](./docs/usage/job.md)
 * [Troubleshooting](./docs/troubleshooting.md)
-
+
+# Privacy notice
+
+To help improve XPK, feature usage statistics are collected and sent to Google. You can opt-out at any time by executing
+the following shell command:
+
+```shell
+xpk config set send-telemetry <true/false>
+```
+
+XPK telemetry overall is handled in accordance with the [Google Privacy Policy](https://policies.google.com/privacy). When
+you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
+[Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
+
 
 # Contributing
 
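
Concretely, the `<true/false>` placeholder takes a boolean, so opting out of telemetry is a single command, for example:

```shell
# Disable usage-statistics collection for this xpk installation.
xpk config set send-telemetry false
```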

{xpk-0.17.2 → xpk-1.0.0}/docs/installation.md

@@ -44,7 +44,6 @@ Depending on your chosen installation method, you may need these additional tool
 | Install Method | Tool | Notes |
 | :--- | :--- | :--- |
 | **Pip** | **kueuectl** | [Installation instructions](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/) |
-| **Pip** | **kjob** | [Installation instructions](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md) |
 | **Source** | **git** | Install via your package manager (e.g., `sudo apt-get install git` on Debian/Ubuntu) |
 | **Source** | **make** | Install via your package manager (e.g., `sudo apt-get install make` on Debian/Ubuntu) |
 

{xpk-0.17.2 → xpk-1.0.0}/docs/troubleshooting.md

@@ -38,7 +38,7 @@ Some XPK cluster configuration might be missing, if workload creation fails with
 
 `[XPK] b'error: the server doesn\'t have a resource type "workloads"\n'`
 
-Mitigate this error by re-running your `xpk
+Mitigate this error by re-running your `xpk cluster create ...` command, to refresh the cluster configurations.
 
 ## Permission Issues: `requires one of ["permission_name"] permission(s)`.
 
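
As the corrected sentence says, the mitigation is to repeat the original create command so the cluster configuration (including the Kueue `workloads` resource type the error complains about) is refreshed. A hedged sketch with placeholder flag values; substitute the arguments used when the cluster was first created:

```shell
# Re-run the original create command to refresh the cluster's XPK configuration;
# cluster name, TPU type, and zone below are illustrative placeholders.
xpk cluster create \
  --cluster my-cluster \
  --tpu-type=v5litepod-16 \
  --zone=us-central2-b
```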

{xpk-0.17.2 → xpk-1.0.0}/docs/usage/clusters.md

@@ -254,6 +254,35 @@ xpk cluster create \
 
 will fail the cluster creation process because Vertex AI Tensorboard is not supported in `us-central2`.
 
+### Create Cluster With Google Cloud ML Diagnostics Enabled
+
+Google Cloud ML Diagnostics is an end-to-end managed platform for ML Engineers to optimize and diagnose their AI/ML workloads on Google Cloud. The product allows ML Engineers to collect and visualize all their workload metrics, configs and profiles with one single platform, all within the same UI. The current product offering focuses on workloads running on XLA-based frameworks (JAX, Pytorch XLA, Tensorflow/Keras) on Google Cloud TPUs and GPUs. Current support is for JAX on Google Cloud TPUs only.
+
+Enabling ML Diagnostics is streamlined and simplified through XPK cluster creation commands.
+
+By adding the **--managed-mldiagnostics** flag during the execution of either **xpk cluster create** or **xpk cluster create-pathways**, the ML Diagnostics functionality is enabled. This flag ensures the necessary supporting components (such as the injection-webhook and connection-operator) are automatically configured, allowing the feature to function seamlessly in both Pathways and non-Pathways execution environments.
+
+**Example Usage:**
+
+* Cluster Create for Pathways with flag **--managed-mldiagnostics**:
+
+```shell
+xpk cluster create-pathways \
+--cluster xpk-pw-test \
+--num-slices=4 --spot \
+--tpu-type=v5litepod-16 \
+--managed-mldiagnostics
+```
+
+* Cluster Create (provision spot / preemptable capacity) with flag **--managed-mldiagnostics**:
+
+```shell
+xpk cluster create \
+--cluster xpk-test --tpu-type=v5litepod-16 \
+--num-slices=4 --spot \
+--managed-mldiagnostics
+```
+
 ## Cluster Delete
 * Cluster Delete (deprovision capacity):
 

{xpk-0.17.2 → xpk-1.0.0}/goldens/Basic_cluster_create.txt

@@ -9,7 +9,7 @@ gcloud container get-server-config --project=golden-project --region=us-central1
 [XPK] Task: `Find if Cluster Exists` is implemented by the following command not running since it is a dry run.
 gcloud container clusters list --project=golden-project --filter=location~"us-central1.*" --format="csv[no-heading](name)"
 [XPK] Task: `GKE Cluster Create` is implemented by the following command not running since it is a dry run.
-gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --location-policy=BALANCED --scopes=storage-full,gke-default
+gcloud beta container clusters create golden-cluster --project=golden-project --region=us-central1 --node-locations=us-central1-a --cluster-version=0 --machine-type=e2-standard-16 --enable-autoscaling --total-min-nodes 1 --total-max-nodes 1000 --num-nodes 6 --enable-dns-access --autoscaling-profile=optimize-utilization --labels=gke_product_type=xpk --release-channel=rapid --enable-ip-alias --enable-dataplane-v2 --enable-multi-networking --location-policy=BALANCED --scopes=storage-full,gke-default
 [XPK] Task: `Find cluster region or zone` is implemented by the following command not running since it is a dry run.
 gcloud container clusters list --project=golden-project --filter=name=golden-cluster --format="value(location)"
 [XPK] Task: `Check if Private Nodes is enabled in cluster.` is implemented by the following command not running since it is a dry run.
|
|
|
37
37
|
[XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
|
|
38
38
|
gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
|
|
39
39
|
[XPK] Creating 1 node pool or pools of tpu7x-8
|
|
40
|
-
We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
40
|
+
We assume that the underlying system is: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
41
41
|
[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
|
|
42
42
|
gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
|
|
43
43
|
[XPK] Creating 1 node pool or pools of tpu7x-8
|
|
44
|
-
Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
44
|
+
Underlyingly, we assume that means: SystemCharacteristics(topology='2x2x1', vms_per_slice=1, gke_accelerator='tpu7x', gce_machine_type='tpu7x-standard-4t', chips_per_vm=4, accelerator_type=TPU, device_type='tpu7x-8', supports_sub_slicing=False, supports_super_slicing=False, supports_accelerator_network_profile=False, docker_platform=<DockerPlatform.AMD: 'linux/amd64'>, requires_workload_policy=False, gpu_config=None)
|
|
45
45
|
[XPK] Task: `Get Node Pool Zone` is implemented by the following command not running since it is a dry run.
|
|
46
46
|
gcloud beta container node-pools describe 0 --cluster golden-cluster --project=golden-project --location=us-central1 --format="value(locations)"
|
|
47
47
|
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
|
|
@@ -261,91 +261,6 @@ kubectl get node --no-headers | wc -l
|
|
|
261
261
|
[XPK] Try 1: Updating Kueue Controller Manager resources
|
|
262
262
|
[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
|
|
263
263
|
kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
|
|
264
|
-
[XPK] Verifying kjob installation
|
|
265
|
-
[XPK] Task: `Verify kjob installation ` is implemented by the following command not running since it is a dry run.
|
|
266
|
-
kubectl-kjob help
|
|
267
|
-
[XPK] kjob found
|
|
268
|
-
[XPK] Applying kjob CDRs
|
|
269
|
-
[XPK] Task: `Create kjob CRDs on cluster` is implemented by the following command not running since it is a dry run.
|
|
270
|
-
kubectl kjob printcrds | kubectl apply --server-side -f -
|
|
271
|
-
[XPK] Creating kjob CRDs succeeded
|
|
272
|
-
[XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run.
|
|
273
|
-
kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true
|
|
274
|
-
[XPK] Temp file (4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61) content:
|
|
275
|
-
|
|
276
|
-
apiVersion: kjobctl.x-k8s.io/v1alpha1
|
|
277
|
-
kind: JobTemplate
|
|
278
|
-
metadata:
|
|
279
|
-
name: xpk-def-batch
|
|
280
|
-
namespace: default
|
|
281
|
-
template:
|
|
282
|
-
spec:
|
|
283
|
-
parallelism: 1
|
|
284
|
-
completions: 1
|
|
285
|
-
completionMode: Indexed
|
|
286
|
-
template:
|
|
287
|
-
spec:
|
|
288
|
-
dnsPolicy: ClusterFirstWithHostNet
|
|
289
|
-
tolerations:
|
|
290
|
-
- operator: "Exists"
|
|
291
|
-
key: nvidia.com/gpu
|
|
292
|
-
containers:
|
|
293
|
-
- name: xpk-batch-container
|
|
294
|
-
image: ubuntu:22.04
|
|
295
|
-
workingDir: /
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
priorityClassName: medium
|
|
299
|
-
restartPolicy: OnFailure
|
|
300
|
-
serviceAccountName:
|
|
301
|
-
|
|
302
|
-
[XPK] Task: `Creating JobTemplate` is implemented by the following command not running since it is a dry run.
|
|
303
|
-
kubectl apply -f 4abb796ed6e7c9d7256a51f13124efd989fc12ee83839bed432fcf7d64f68e61
|
|
304
|
-
[XPK] Temp file (a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8) content:
|
|
305
|
-
|
|
306
|
-
apiVersion: v1
|
|
307
|
-
kind: PodTemplate
|
|
308
|
-
metadata:
|
|
309
|
-
name: xpk-def-pod
|
|
310
|
-
namespace: default
|
|
311
|
-
template:
|
|
312
|
-
spec:
|
|
313
|
-
tolerations:
|
|
314
|
-
- effect: NoSchedule
|
|
315
|
-
key: components.gke.io/gke-managed-components
|
|
316
|
-
operator: Equal
|
|
317
|
-
value: "true"
|
|
318
|
-
containers:
|
|
319
|
-
- name: xpk-interactive-container
|
|
320
|
-
image: busybox:1.28
|
|
321
|
-
command: [/bin/sh]
|
|
322
|
-
workingDir: /
|
|
323
|
-
initContainers:
|
|
324
|
-
- name: init
|
|
325
|
-
image: busybox:1.28
|
|
326
|
-
command: ['/bin/mkdir', '-p', '/']
|
|
327
|
-
serviceAccountName:
|
|
328
|
-
|
|
329
|
-
[XPK] Task: `Creating PodTemplate` is implemented by the following command not running since it is a dry run.
|
|
330
|
-
kubectl apply -f a63aa3c4593c38ad90671fd8b067d1886f6313ad558379b364b51791aa50f4e8
|
|
331
|
-
[XPK] Temp file (1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486) content:
|
|
332
|
-
|
|
333
|
-
apiVersion: kjobctl.x-k8s.io/v1alpha1
|
|
334
|
-
kind: ApplicationProfile
|
|
335
|
-
metadata:
|
|
336
|
-
name: xpk-def-app-profile
|
|
337
|
-
namespace: default
|
|
338
|
-
spec:
|
|
339
|
-
supportedModes:
|
|
340
|
-
- name: Slurm
|
|
341
|
-
template: xpk-def-batch
|
|
342
|
-
requiredFlags: []
|
|
343
|
-
- name: Interactive
|
|
344
|
-
template: xpk-def-pod
|
|
345
|
-
volumeBundles: []
|
|
346
|
-
|
|
347
|
-
[XPK] Task: `Creating AppProfile` is implemented by the following command not running since it is a dry run.
|
|
348
|
-
kubectl apply -f 1d13ddebae3c90a05ba26b312df088982dd0df0edc4f4013b88384e476c20486
|
|
349
264
|
[XPK] GKE commands done! Resources are created.
|
|
350
265
|
[XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
|
|
351
266
|
[XPK] Exiting XPK cleanly
|