xpk 1.1.1__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-1.1.1 → xpk-1.2.0}/Makefile +3 -2
- {xpk-1.1.1/src/xpk.egg-info → xpk-1.2.0}/PKG-INFO +1 -1
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Basic_cluster_adapt.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Basic_cluster_create.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_RayCluster.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_for_multi-host_nodepool.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_for_single-host_nodepool.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_private.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_sub-slicing.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_super-slicing.md +6 -5
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_CPU_and_memory_limits_above_capacity.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_CPU_and_memory_limits_below_capacity.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_Managed_Lustre_driver.md +8 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.md +8 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_gb200-4.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_shared_reservation.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/NAP_cluster-create.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/NAP_cluster-create_with_pathways.md +2 -2
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create_super-slicing.md +3 -3
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/cluster.py +15 -6
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/cluster_test.py +16 -1
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/cluster.py +31 -1
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/cluster_test.py +61 -4
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/docker_container.py +3 -1
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/docker_resources.py +5 -5
- xpk-1.2.0/src/xpk/core/kubectl_common.py +77 -0
- xpk-1.2.0/src/xpk/core/kubectl_common_test.py +174 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/kueue_manager.py +26 -26
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/kueue_manager_test.py +52 -12
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/nodepool.py +34 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/nodepool_test.py +104 -0
- {xpk-1.1.1 → xpk-1.2.0/src/xpk.egg-info}/PKG-INFO +1 -1
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk.egg-info/SOURCES.txt +2 -0
- {xpk-1.1.1 → xpk-1.2.0}/.dockerignore +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/CODEOWNERS +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/actions/install-kueue/action.yml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/actions/setup-test-env/action.yml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/release.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/README.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/build_tests.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/build_wheels.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/cleanup.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/gemini-dispatch.yml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/gemini-invoke.yml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/gemini-review.yml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/gemini-scheduled-triage.yml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/gemini-triage.yml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/integration_basic_cluster_create.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/integration_gpu_cluster_create.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/integration_pathways_cluster_create.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/integration_ray_cluster_create.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/integration_storage_tests.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/label-validation.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/nightly_tests.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/periodic_release.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/release_branch_versioning.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_build_scripts.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_build_wheel.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_goldens.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_lint_and_format.yml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_storage_create.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_storage_delete.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/reusable_unit_tests.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.github/workflows/stale.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.gitignore +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/.pre-commit-config.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/LICENSE +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/README.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/backoff_retry.sh +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/data/Dockerfile +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/code-of-conduct.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/contributing.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/installation.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/permissions.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/testing.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/troubleshooting.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/advanced.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/autoprovisioning.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/clusters.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/cpu.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/docker.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/gpu.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/inspector.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/storage.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/docs/usage/workloads.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/fake_training.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/llama-3.1-finetuning/check_cuda.sh +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/llama-3.1-finetuning/requirements.txt +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/llama-3.1-finetuning/train.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/llama-3.1-finetuning/train.slurm +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/llama-3.1-finetuning/training_data.jsonl +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/nccl/nccl-a3mega.sh +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/nccl/nccl-a3ultra.sh +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/nccl/nccl.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/storage/filestore-manifest-attach.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/storage/gcsfuse-manifest.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/storage/lustre-manifest-attach.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/storage/parallelstore-manifest-attach.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/examples/storage/pd-manifest-attach.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/pylintrc +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/pyproject.toml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_delete.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_delete_force.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Storage_list.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create_pathways.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create_sub-slicing.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_create_with_output-manifest-file.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_delete.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/recipes/Workload_list.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/recipes/comprehensive-demo.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/setup.cfg +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/api/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3mega/storage_crd.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3ultra/nccl-installer.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a3ultra/storage_crd.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a4/config-map.yaml.tftpl +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/blueprints/a4/storage_crd.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/cluster_gcluster.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/cluster_gcluster_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/common.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/common_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/config.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/info.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/inspector.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/inspector_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/managed_ml_diagnostics.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/managed_ml_diagnostics_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/storage.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/version.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/workload.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/commands/workload_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/blueprint_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/testing/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/testing/data/a3_mega.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/blueprint/testing/data/a4.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/capacity.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/capacity_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/cluster_private.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/commands.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/config.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/config_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/docker_image.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/docker_manager.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/filestore.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/gcloud_context_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/gcsfuse.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/jobset.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/monitoring.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/mtc.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/nap.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/network.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/pathways.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/pathways_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/ray.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/resources.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/scheduling.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/scheduling_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/storage.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/system_characteristics.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/system_characteristics_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/telemetry.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/telemetry_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/testing/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/testing/commands_tester.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/testing/commands_tester_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/updates.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/updates_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/vertex.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/core/workload_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/main.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/cluster.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/cluster_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/common.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/common_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/config.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/core.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/info.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/storage.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/storage_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/validators.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/version.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/workload.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/parser/workload_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/telemetry_uploader.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/filestore-pv.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/filestore-pvc.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/filestore-sc.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/fuse-pv.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/fuse-pvc.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/kueue_config.yaml.j2 +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/kueue_super_slicing_topology.yaml.j2 +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/mtc-cpc.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/console.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/console_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/execution_context.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/feature_flags.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/file.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/kubectl.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/kueue.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/network.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/objects.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/templates.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/topology.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/topology_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/user_agent.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/user_agent_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/validation.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/validation_test.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/versions.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk.egg-info/requires.txt +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/src/xpk.egg-info/top_level.txt +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/tools/install-gke-auth-plugin.sh +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/tools/install-xpk.sh +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/tools/recipes.py +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/xpk-large-scale-guide.sh +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/xpk-notebooks.md +0 -0
- {xpk-1.1.1 → xpk-1.2.0}/xpk.py +0 -0
{xpk-1.1.1 → xpk-1.2.0}/Makefile
RENAMED
@@ -6,6 +6,7 @@ KUEUECTL_URL = "https://github.com/kubernetes-sigs/kueue/releases/download/$(KUE
 
 PROJECT_DIR := $(realpath $(shell dirname $(firstword $(MAKEFILE_LIST))))
 BIN_PATH=$(PROJECT_DIR)/bin
+PIP_OPTS ?=
 
 .PHONY: install
 install: check-python check-gcloud install-gcloud-auth-plugin install-kueuectl pip-install
@@ -15,11 +16,11 @@ install-dev: check-python check-gcloud mkdir-bin install-kueuectl pip-install pi
 
 .PHONY: pip-install-dev
 pip-install-dev:
-	pip install -e ".[dev]"
+	pip install $(PIP_OPTS) -e ".[dev]"
 
 .PHONY: pip-install
 pip-install:
-	pip install -e .
+	pip install $(PIP_OPTS) -e .
 
 .PHONY: install-pytest
 install-pytest:
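With `PIP_OPTS` defaulting to empty, extra pip flags can now be passed at install time without editing the Makefile, for example `make pip-install PIP_OPTS="--no-cache-dir"`: a command-line variable assignment overrides the `?=` default in GNU Make.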
@@ -134,8 +134,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -266,8 +266,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -273,8 +273,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] Try 1: Deleting old RayCluster
 [XPK] Task: `Deleting old RayCluster` is implemented by the following command not running since it is a dry run.
@@ -268,8 +268,8 @@ description: "Very High"
 kubectl apply -f b58f50dd88cb1211d51276b9b445f6bca02f0e97fa984656d47992aecd9322cc
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -266,8 +266,8 @@ description: "Very High"
 kubectl apply -f f228edecda8022002fe1876e83ebf4c0c280eb4aeb0f72da3a5d746b5dfb1c91
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -285,8 +285,8 @@ description: "Very High"
 kubectl apply -f 2e0015f210b664c3b767ae4e11af51387b01d4d6b36e20fecbdee137d3d2700b
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster-private/details?project=golden-project
@@ -290,8 +290,8 @@ spec:
 kubectl apply -f 2f2b4591858b4bc50348c575cd2cc048c79d1e4ffb67e0a6d6e1eafad21c5002
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -98,6 +98,9 @@ data:
 [XPK] Try 1: Install Jobset on golden-cluster
 [XPK] Task: `Install Jobset on golden-cluster` is implemented by the following command not running since it is a dry run.
 kubectl apply --server-side --force-conflicts -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.8.0/manifests.yaml
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
+kubectl patch deployment jobset-controller-manager -n jobset-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"requests": {"cpu": "4", "memory": "16Gi"}, "limits": {"cpu": "4", "memory": "16Gi"}}}]}}}}'
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
 [XPK] Temp file (1b31e624e490f9c8c4ef4e369f08d3fa467990af5a261e4405bd045265d70e95) content:
@@ -299,11 +302,9 @@ spec:
 - nodeLabel: kubernetes.io/hostname
 [XPK] Task: `Applying Kueue Custom Resources` is implemented by the following command not running since it is a dry run.
 kubectl apply -f 6df31e8df3d8970d7ed3bf3aa948ae7cea9487c15ed6cfb1577ca6c948cf5525
-[XPK]
-
-[
-[XPK] Task: `Updating Kueue Controller Manager resources` is implemented by the following command not running since it is a dry run.
-kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
+kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"replicas": 3, "template": {"spec": {"containers": [{"name": "manager", "resources": {"requests": {"cpu": "16", "memory": "64Gi"}, "limits": {"cpu": "16", "memory": "64Gi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
 [XPK] Exiting XPK cleanly
@@ -270,8 +270,8 @@ description: "Very High"
 kubectl apply -f 1ce6c42efe0834ff0519978ad09539c725a5d6f22267c5f1b41b6e458668e45f
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -270,8 +270,8 @@ description: "Very High"
 kubectl apply -f 1ce6c42efe0834ff0519978ad09539c725a5d6f22267c5f1b41b6e458668e45f
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -47,6 +47,12 @@ gcloud container clusters describe golden-cluster --project=golden-project --loc
 [XPK] Updating GKE cluster to enable Lustre CSI driver, may take a while!
 [XPK] Task: `GKE Cluster Update to enable Lustre CSI driver` is implemented by the following command not running since it is a dry run.
 gcloud container clusters update golden-cluster --project=golden-project --location=us-central1 --quiet --update-addons=LustreCsiDriver=ENABLED
+[XPK] Recreating existing nodes (if any) to complete the Lustre CSI driver installation.
+[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
+gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
+[XPK] To complete NodesRecreate-0 we are executing gcloud container clusters upgrade golden-cluster --project=golden-project --node-pool=0 --location=us-central1 --quiet
+[XPK] Breaking up a total of 1 commands into 1 batches
+[XPK] Pretending all the jobs succeeded
 [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
 gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
 [XPK] Creating 1 node pool or pools of tpu7x-8
@@ -271,8 +277,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
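The `recreate_nodes_in_existing_node_pools` step shown in this dry-run log is implemented in the changed `src/xpk/core/nodepool.py` (+34 lines, not displayed in this diff). Judging only from the log above, it lists the cluster's node pools and then issues one `gcloud container clusters upgrade --node-pool=<name>` per pool; a hypothetical Python sketch of that command construction (names and structure assumed, not the actual implementation):

```python
def build_node_recreate_commands(
    cluster: str, project: str, location: str, node_pools: list[str]
) -> list[str]:
  # Hypothetical sketch: one upgrade command per node pool, mirroring the
  # "NodesRecreate-<i>" commands printed in the dry-run recipe goldens.
  return [
      f'gcloud container clusters upgrade {cluster} --project={project}'
      f' --node-pool={pool} --location={location} --quiet'
      for pool in node_pools
  ]


# The pool names themselves would come from a command like the one logged above:
#   gcloud beta container node-pools list --cluster <cluster> --project=<project>
#     --location=<location> --format="csv[no-heading](name)"
```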
{xpk-1.1.1 → xpk-1.2.0}/recipes/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.md
RENAMED
@@ -47,6 +47,12 @@ gcloud container clusters describe golden-cluster --project=golden-project --loc
 [XPK] Updating GKE cluster to enable Lustre CSI driver, may take a while!
 [XPK] Task: `GKE Cluster Update to enable Lustre CSI driver` is implemented by the following command not running since it is a dry run.
 gcloud container clusters update golden-cluster --project=golden-project --location=us-central1 --quiet --enable-legacy-lustre-port
+[XPK] Recreating existing nodes (if any) to complete the Lustre CSI driver installation.
+[XPK] Task: `Get All Node Pools` is implemented by the following command not running since it is a dry run.
+gcloud beta container node-pools list --cluster golden-cluster --project=golden-project --location=us-central1 --format="csv[no-heading](name)"
+[XPK] To complete NodesRecreate-0 we are executing gcloud container clusters upgrade golden-cluster --project=golden-project --node-pool=0 --location=us-central1 --quiet
+[XPK] Breaking up a total of 1 commands into 1 batches
+[XPK] Pretending all the jobs succeeded
 [XPK] Task: `Determine current gke master version` is implemented by the following command not running since it is a dry run.
 gcloud beta container clusters describe golden-cluster --location us-central1 --project golden-project --format="value(currentMasterVersion)"
 [XPK] Creating 1 node pool or pools of tpu7x-8
@@ -271,8 +277,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -287,8 +287,8 @@ spec:
 kubectl apply -f c177e643775bb8e3462648245162a984934b0e09a13b0e3bfb62adf8585442b0
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] Installing NCCL Plugin for cluster
 [XPK] Task: `Install NCCL Plugin On Cluster` is implemented by the following command not running since it is a dry run.
@@ -273,8 +273,8 @@ description: "Very High"
 kubectl apply -f 6083d72fc3ba2ac7d243c1269dd67717abd4086bf64e397e3a1737de415dd133
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -307,8 +307,8 @@ description: "Very High"
 kubectl apply -f ff0e8bb58b2038c4b29f1bce1aabe9f02ac0757ae2e80ad3657f704542371839
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -317,8 +317,8 @@ description: "Very High"
 kubectl apply -f fc46093b5c0d291fe7c53c15aebd624b485d767cabf99a73500e95952c70b6f6
 [XPK] Task: `Count total nodes` is implemented by the following command not running since it is a dry run.
 kubectl get node --no-headers | wc -l
-[XPK] Try 1: Updating
-[XPK] Task: `Updating
+[XPK] Try 1: Updating Controller Manager resources
+[XPK] Task: `Updating Controller Manager resources` is implemented by the following command not running since it is a dry run.
 kubectl patch deployment kueue-controller-manager -n kueue-system --type='strategic' --patch='{"spec": {"template": {"spec": {"containers": [{"name": "manager", "resources": {"limits": {"memory": "4096Mi"}}}]}}}}'
 [XPK] GKE commands done! Resources are created.
 [XPK] See your GKE Cluster here: https://console.cloud.google.com/kubernetes/clusters/details/us-central1/golden-cluster/details?project=golden-project
@@ -47,7 +47,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94
 docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
 [XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
 docker push gcr.io/golden-project/dry-run-runner:prefix-current
-[XPK] Temp file (
+[XPK] Temp file (2c5ab381c0d643f8512a07d296d411413080ec652c15e8c676fd58435de5a327) content:
 apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
 metadata:
@@ -136,7 +136,7 @@ spec:
 exit $EXIT_CODE
 resources:
 limits:
-google.com/tpu:
+google.com/tpu: 4
 
 volumeMounts:
 - mountPath: /dev/shm
@@ -156,7 +156,7 @@ spec:
 
 
 [XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
-kubectl apply -f
+kubectl apply -f 2c5ab381c0d643f8512a07d296d411413080ec652c15e8c676fd58435de5a327
 [XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
 gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
 [XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.
@@ -369,7 +369,7 @@ def cluster_create(args) -> None:
 
   get_cluster_credentials(args)
 
-  update_coredns_command_code = update_coredns_if_necessary()
+  update_coredns_command_code = update_coredns_if_necessary(args)
   if update_coredns_command_code != 0:
     xpk_exit(update_coredns_command_code)
 
@@ -927,7 +927,7 @@ def scale_down_deployment(deployment_name: str, namespace: str = 'kube-system'):
   xpk_print(f'{deployment_name} has been scaled down.')
 
 
-def scale_up_coredns(replicas: int
+def scale_up_coredns(replicas: int, namespace: str = 'kube-system'):
   """Scales up the CoreDNS deployment to a specified number of replicas."""
   command_coredns_scale = (
       f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
@@ -1008,7 +1008,14 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
     xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
 
 
-def
+def _get_coredns_replica_count(args) -> int:
+  # XPK large scale guide recommends 15 coreDNS replicas for clusters with 5000 VMs.
+  # Otherwise, limit the replica count to the desired number of default pool nodes.
+  default_pool_node_count: int = args.default_pool_cpu_num_nodes
+  return min(15, default_pool_node_count)
+
+
+def update_coredns(args) -> int:
   """Updates and deploys CoreDNS within a cluster.
 
   Returns:
@@ -1018,6 +1025,8 @@ def update_coredns() -> int:
   coredns_repo_dir_name = 'deployment'
   coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
   coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
+  coredns_replica_count = _get_coredns_replica_count(args)
+
   # 1. Install jq
   install_jq()
 
@@ -1034,7 +1043,7 @@ def update_coredns() -> int:
   scale_down_deployment('kube-dns')
 
   # 6. Scale up coredns and verify readiness
-  scale_up_coredns(
+  scale_up_coredns(coredns_replica_count)
   verify_coredns_readiness()
 
   xpk_print('The CoreDNS setup process has been completed.')
@@ -1074,7 +1083,7 @@ def coredns_deployment_exists(namespace: str = 'kube-system') -> bool:
   return False
 
 
-def update_coredns_if_necessary() -> int:
+def update_coredns_if_necessary(args) -> int:
   """Updates and deploys CoreDNS within the cluster if it's not already present.
 
   This function checks for the existence of the CoreDNS deployment.
@@ -1089,7 +1098,7 @@ def update_coredns_if_necessary() -> int:
     return 0
   else:
     xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
-    return update_coredns()
+    return update_coredns(args)
 
 
 def create_cluster_if_necessary(
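Taken together, the cluster.py changes thread `args` through the CoreDNS update path so the replica count can be derived from the default pool size instead of being hard-coded. A minimal standalone sketch of the new sizing rule, mirroring `_get_coredns_replica_count` and the tests added below:

```python
def coredns_replica_count(default_pool_cpu_num_nodes: int) -> int:
  # The XPK large-scale guide recommends 15 CoreDNS replicas for ~5000-VM
  # clusters; smaller clusters are capped at the default pool's node count.
  return min(15, default_pool_cpu_num_nodes)


assert coredns_replica_count(7) == 7    # below the cap: one replica per node
assert coredns_replica_count(20) == 15  # larger pools are capped at 15
```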
@@ -22,7 +22,7 @@ from unittest.mock import MagicMock, patch
 import pytest
 
 from xpk.core.telemetry import MetricsCollector
-from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command, cluster_create, _log_cluster_create_telemetry
+from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, _get_coredns_replica_count, run_gke_cluster_create_command, cluster_create, _log_cluster_create_telemetry
 from xpk.core.capacity import CapacityType
 from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
 from xpk.core.testing.commands_tester import CommandsTester
@@ -787,3 +787,18 @@ def test_validate_cluster_create_args_sets_correct_num_slices(
   _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
 
   assert args.num_slices == expected
+
+
+def test_get_coredns_replica_count_lower_limit_is_number_of_nodes():
+  args = construct_args(
+      default_pool_cpu_num_nodes=7,
+  )
+
+  assert _get_coredns_replica_count(args) == 7
+
+
+def test_get_coredns_replica_count_upper_limit_is_15():
+  args = construct_args(
+      default_pool_cpu_num_nodes=20,
+  )
+  assert _get_coredns_replica_count(args) == 15
@@ -21,6 +21,8 @@ from kubernetes import client as k8s_client
 from kubernetes import config
 from kubernetes.client.exceptions import ApiException
 
+from .kubectl_common import PatchResources, patch_controller_manager_resources
+from ..utils.feature_flags import FeatureFlags
 from ..utils.console import xpk_exit, xpk_print
 from .capacity import H200_DEVICE_TYPE
 from .commands import (
@@ -33,6 +35,7 @@ from .gcloud_context import (
     get_cluster_location,
     zone_to_region,
 )
+from .nodepool import recreate_nodes_in_existing_node_pools
 from .resources import get_cluster_system_characteristics
 from .system_characteristics import INSTALLER_NCCL_TCPXO, SystemCharacteristics
 
@@ -72,7 +75,21 @@ def set_jobset_on_cluster(args) -> int:
         ' https://github.com/google/xpk/blob/main/README.md#troubleshooting for'
         ' instructions on how to fix these permissions.'
     )
-
+    return return_code
+
+  if FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing:
+    return patch_controller_manager_resources(
+        name='jobset-controller-manager',
+        namespace='jobset-system',
+        patch_resources=PatchResources(
+            cpu_request=4,
+            cpu_limit=4,
+            memory_request='16Gi',
+            memory_limit='16Gi',
+        ),
+    )
+
+  return 0
 
 
 def set_pathways_job_on_cluster(args) -> int:
@@ -605,6 +622,19 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
   if return_code != 0:
     xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
     return 1
+
+  xpk_print(
+      'Recreating existing nodes (if any) to complete the Lustre CSI driver'
+      ' installation.'
+  )
+  return_code = recreate_nodes_in_existing_node_pools(args)
+  if return_code != 0:
+    xpk_print(
+        f'Node recreation failed with ERROR {return_code}. You must recreate'
+        ' the nodes manually in order to access Lustre storage from your'
+        ' workloads.'
+    )
+    return 1
   return 0
 
 
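The `PatchResources` / `patch_controller_manager_resources` pair used above comes from the new `src/xpk/core/kubectl_common.py` (+77 lines), whose contents are not shown in this diff. As a rough, hypothetical sketch of the shape implied by the call site and by the `kubectl patch` commands in the recipe goldens earlier in this diff (names, defaults, and structure are assumptions, not the actual implementation):

```python
import json
from dataclasses import dataclass
from typing import Optional


@dataclass
class PatchResources:
  # Hypothetical shape inferred from the keyword arguments passed in
  # set_jobset_on_cluster above; the real definition may differ.
  cpu_request: Optional[int] = None
  cpu_limit: Optional[int] = None
  memory_request: Optional[str] = None
  memory_limit: Optional[str] = None


def build_controller_manager_patch_command(
    name: str, namespace: str, patch_resources: PatchResources
) -> str:
  """Builds a `kubectl patch` command like the ones in the dry-run goldens."""
  requests, limits = {}, {}
  if patch_resources.cpu_request is not None:
    requests['cpu'] = str(patch_resources.cpu_request)
  if patch_resources.memory_request is not None:
    requests['memory'] = patch_resources.memory_request
  if patch_resources.cpu_limit is not None:
    limits['cpu'] = str(patch_resources.cpu_limit)
  if patch_resources.memory_limit is not None:
    limits['memory'] = patch_resources.memory_limit
  resources = {}
  if requests:
    resources['requests'] = requests
  if limits:
    resources['limits'] = limits
  patch = {
      'spec': {
          'template': {
              'spec': {
                  'containers': [{'name': 'manager', 'resources': resources}]
              }
          }
      }
  }
  return (
      f"kubectl patch deployment {name} -n {namespace}"
      f" --type='strategic' --patch='{json.dumps(patch)}'"
  )
```

In xpk itself the resulting command is presumably dispatched through the usual command runner with retries rather than executed directly; the tests below mock `run_command_with_updates_retry` alongside `patch_controller_manager_resources`.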
@@ -14,10 +14,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+from unittest.mock import MagicMock
 import pytest
 from .testing.commands_tester import CommandsTester
-from .cluster import get_cluster_credentials, update_gke_cluster_with_lustre_driver_enabled, update_cluster_with_lustre_driver_if_necessary
+from .cluster import get_cluster_credentials, update_gke_cluster_with_lustre_driver_enabled, update_cluster_with_lustre_driver_if_necessary, set_jobset_on_cluster
 from pytest_mock import MockerFixture
+from ..utils.feature_flags import FeatureFlags
 
 
 @pytest.fixture(autouse=True)
@@ -26,6 +28,9 @@ def commands_tester(mocker: MockerFixture) -> CommandsTester:
       mocker=mocker,
       run_command_for_value_path="xpk.core.cluster.run_command_for_value",
       run_command_with_updates_path="xpk.core.cluster.run_command_with_updates",
+      run_command_with_updates_retry_path=(
+          "xpk.core.cluster.run_command_with_updates_retry"
+      ),
   )
 
 
@@ -38,7 +43,17 @@ def mock_location(mocker: MockerFixture):
 
 @pytest.fixture(autouse=True)
 def command_args(mocker: MockerFixture):
-  return mocker.Mock(
+  return mocker.Mock(
+      cluster="cluster", project="project", zone="zone", super_slicing=False
+  )
+
+
+@pytest.fixture(autouse=True)
+def mock_patch_controller_manager_resources(mocker: MockerFixture) -> MagicMock:
+  return mocker.patch(
+      "xpk.core.cluster.patch_controller_manager_resources",
+      return_value=0,
+  )
 
 
 def test_get_cluster_credentials_returns_1_when_retrieval_commands_fail(
@@ -166,11 +181,14 @@ def test_update_cluster_with_lustre_driver_if_necessary_with_legacy_port_runs_co
 
 
 def test_update_gke_cluster_with_lustre_driver_enabled_default_port(
-    commands_tester: CommandsTester, command_args
+    commands_tester: CommandsTester, command_args, mocker: MockerFixture
 ):
   commands_tester.set_result_for_command(
       (0, ""), "gcloud container clusters update"
   )
+  mocker.patch(
+      "xpk.core.cluster.recreate_nodes_in_existing_node_pools", return_value=0
+  )
   command_args.enable_legacy_lustre_port = None
   update_gke_cluster_with_lustre_driver_enabled(command_args)
 
@@ -181,12 +199,30 @@ def test_update_gke_cluster_with_lustre_driver_enabled_default_port(
   ]
 
 
+def test_update_gke_cluster_with_lustre_driver_enabled_fails_if_node_recreation_failed(
+    commands_tester: CommandsTester, command_args, mocker: MockerFixture
+):
+  commands_tester.set_result_for_command(
+      (0, ""), "gcloud container clusters update"
+  )
+  mocker.patch(
+      "xpk.core.cluster.recreate_nodes_in_existing_node_pools", return_value=123
+  )
+  command_args.enable_legacy_lustre_port = None
+  return_code = update_gke_cluster_with_lustre_driver_enabled(command_args)
+
+  assert return_code != 0
+
+
 def test_update_gke_cluster_with_lustre_driver_enabled_legacy_port(
-    commands_tester: CommandsTester, command_args
+    commands_tester: CommandsTester, command_args, mocker: MockerFixture
 ):
   commands_tester.set_result_for_command(
       (0, ""), "gcloud container clusters update"
   )
+  mocker.patch(
+      "xpk.core.cluster.recreate_nodes_in_existing_node_pools", return_value=0
+  )
   command_args.enable_legacy_lustre_port = True
   update_gke_cluster_with_lustre_driver_enabled(command_args)
 
@@ -195,3 +231,24 @@ def test_update_gke_cluster_with_lustre_driver_enabled_legacy_port(
       "gcloud container clusters update cluster --project=project"
       " --location=us-central1 --quiet --enable-legacy-lustre-port"
   ]
+
+
+def test_set_jobset_on_cluster_not_setting_resources_by_default(
+    mock_patch_controller_manager_resources: MagicMock, command_args
+):
+  result = set_jobset_on_cluster(command_args)
+
+  assert result == 0
+  mock_patch_controller_manager_resources.assert_not_called()
+
+
+def test_set_jobset_on_cluster_super_slicing_resources(
+    mock_patch_controller_manager_resources: MagicMock, command_args
+):
+  FeatureFlags.SUPER_SLICING_ENABLED = True
+  command_args.super_slicing = True
+
+  result = set_jobset_on_cluster(command_args)
+
+  assert result == 0
+  mock_patch_controller_manager_resources.assert_called()
@@ -181,7 +181,9 @@ def get_main_container(
           tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command,
           gpu_workload_terminate_command=gpu_workload_terminate_command,
           xpk_internal_commands=xpk_internal_commands,
-          resources=get_main_container_resources(
+          resources=get_main_container_resources(
+              args, system, resource_type, parallel_containers
+          ),
           volume_mounts=volume_mounts,
       )
   )
@@ -23,7 +23,10 @@ from ..utils.execution_context import is_dry_run
 
 
 def get_main_container_resources(
-    args,
+    args,
+    system: SystemCharacteristics,
+    resource_type: str,
+    parallel_containers: int,
 ) -> str:
   """Resources for the main container.
   Args:
@@ -53,10 +56,7 @@
     offset_vCPUs = int(system.chips_per_vm) * 0.95
     return f'{resource_type}: {offset_vCPUs}'
 
-  return (
-      f'{resource_type}:'
-      f' {int(system.chips_per_vm / system.parallel_containers)}'
-  )
+  return f'{resource_type}: {int(system.chips_per_vm / parallel_containers)}'
 
 
 def get_env_container(args, system: SystemCharacteristics) -> str:
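The docker_resources.py change stops reading `system.parallel_containers` and instead takes the container count as an explicit parameter, which is consistent with the concrete `google.com/tpu: 4` limit now appearing in the super-slicing workload golden above. A small sketch of the same arithmetic, with names and example inputs chosen here purely for illustration:

```python
def tpu_limit_line(chips_per_vm: int, parallel_containers: int,
                   resource_type: str = 'google.com/tpu') -> str:
  # Each of the parallel containers gets an equal share of the VM's chips.
  return f'{resource_type}: {int(chips_per_vm / parallel_containers)}'


# e.g. 8 chips split across 2 parallel containers -> "google.com/tpu: 4"
assert tpu_limit_line(8, 2) == 'google.com/tpu: 4'
```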
|