xpk 0.17.0__tar.gz → 0.17.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.17.0/src/xpk.egg-info → xpk-0.17.2}/PKG-INFO +1 -1
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Workload_create.txt +3 -2
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Workload_create_sub-slicing.txt +3 -2
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Workload_create_super-slicing.txt +3 -2
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Workload_create_with_output-manifest-file.txt +3 -2
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/storage.py +0 -25
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/workload.py +1 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/cluster.py +1 -3
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/common.py +0 -151
- {xpk-0.17.0 → xpk-0.17.2/src/xpk.egg-info}/PKG-INFO +1 -1
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk.egg-info/SOURCES.txt +0 -2
- xpk-0.17.0/src/xpk/core/kjob.py +0 -473
- xpk-0.17.0/src/xpk/templates/volume_bundle.yaml +0 -7
- {xpk-0.17.0 → xpk-0.17.2}/.dockerignore +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/CODEOWNERS +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/actions/install-kjob/action.yml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/actions/install-kueue/action.yml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/actions/setup-test-env/action.yml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/release.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/README.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/build_tests.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/build_wheels.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/cleanup.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/gemini-dispatch.yml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/gemini-invoke.yml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/gemini-review.yml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/gemini-scheduled-triage.yml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/gemini-triage.yml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/integration_basic_cluster_create.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/integration_legacy_tests.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/integration_pathways_cluster_create.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/integration_ray_cluster_create.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/integration_storage_tests.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/label-validation.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/nightly_tests.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/periodic_release.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/release_branch_versioning.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/reusable_build_kjob.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/reusable_build_scripts.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/reusable_build_wheel.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/reusable_goldens.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/reusable_integration_tests.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/reusable_lint_and_format.yml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/reusable_storage_create.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/reusable_storage_delete.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/reusable_unit_tests.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.github/workflows/stale.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.gitignore +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/.pre-commit-config.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/LICENSE +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/Makefile +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/README.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/backoff_retry.sh +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/data/Dockerfile +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/code-of-conduct.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/contributing.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/installation.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/local_testing.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/permissions.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/testing.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/troubleshooting.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/advanced.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/autoprovisioning.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/clusters.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/cpu.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/docker.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/gpu.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/inspector.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/job.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/run.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/storage.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/tpu7x/clusters.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/tpu7x/recipes/flex_filestore_recipe.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/tpu7x/recipes/flex_lustre_recipe.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/tpu7x/recipes/reservation_gcs_bucket_recipe.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/tpu7x/workloads.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/docs/usage/workloads.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/batch.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/fake_training.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/job.sh +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/llama-3.1-finetuning/check_cuda.sh +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/llama-3.1-finetuning/requirements.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/llama-3.1-finetuning/train.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/llama-3.1-finetuning/train.slurm +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/llama-3.1-finetuning/training_data.jsonl +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/nccl/nccl-a3mega.sh +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/nccl/nccl-a3ultra.sh +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/nccl/nccl.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/storage/filestore-manifest-attach.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/storage/gcsfuse-manifest.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/storage/lustre-manifest-attach.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/storage/parallelstore-manifest-attach.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/examples/storage/pd-manifest-attach.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/golden_buddy.sh +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Basic_cluster_create.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Batch.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_for_multi-host_nodepool.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_for_single-host_single-slice_TPU.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_private.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_sub-slicing.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_super-slicing.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_with_CPU_and_memory_limits_above_capacity.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_with_CPU_and_memory_limits_below_capacity.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_with_Managed_Lustre_driver.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_with_Managed_Lustre_driver_and_legacy_port.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_with_gb200-4.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_create_with_shared_reservation.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_delete.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Cluster_delete_force.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Job_cancel.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Job_info.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Job_list.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/NAP_cluster-create.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/NAP_cluster-create_with_pathways.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Storage_list.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Workload_create_pathways.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Workload_delete.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens/Workload_list.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/goldens.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/pylintrc +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/pyproject.toml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/setup.cfg +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/integration/README.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/integration/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/integration/docker_manager_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/integration/gcluster_a3mega_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/integration/gcluster_a3ultra_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/integration/gcluster_a4_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/integration/gcluster_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/api/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/blueprints/a3mega/config-map.yaml.tftpl +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/blueprints/a3mega/storage_crd.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/blueprints/a3ultra/config-map.yaml.tftpl +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/blueprints/a3ultra/mlgru-disable.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/blueprints/a3ultra/nccl-installer.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/blueprints/a3ultra/storage_crd.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/blueprints/a4/config-map.yaml.tftpl +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/blueprints/a4/storage_crd.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/batch.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/cluster.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/cluster_gcluster.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/cluster_gcluster_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/cluster_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/common.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/config.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/info.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/inspector.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/job.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/kind.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/kjob_common.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/managed_ml_diagnostics.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/managed_ml_diagnostics_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/run.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/shell.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/version.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/commands/workload_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/blueprint/blueprint_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/blueprint/testing/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/blueprint/testing/data/a3_mega.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/blueprint/testing/data/a3_ultra.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/blueprint/testing/data/a4.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/capacity.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/capacity_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/cluster_private.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/cluster_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/commands.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/config.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/config_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/docker_image.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/docker_manager.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/docker_resources.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/filestore.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/gcloud_context_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/gcsfuse.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/jobset.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/kueue_manager.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/kueue_manager_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/monitoring.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/mtc.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/nap.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/network.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/nodepool.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/nodepool_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/pathways.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/pathways_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/ray.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/resources.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/scheduling.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/scheduling_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/storage.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/system_characteristics.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/system_characteristics_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/telemetry.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/telemetry_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/testing/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/testing/commands_tester.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/testing/commands_tester_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/updates.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/updates_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/vertex.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/workload.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/core/workload_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/main.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/batch.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/cluster.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/cluster_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/common_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/config.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/core.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/info.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/job.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/kind.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/run.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/shell.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/storage.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/storage_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/validators.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/version.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/workload.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/parser/workload_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/telemetry_uploader.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/arm_gpu_workload_crate.yaml.j2 +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/cluster_preheat.yaml.j2 +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/filestore-pv.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/filestore-pvc.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/filestore-sc.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/fuse-pv.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/fuse-pvc.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/kueue_config.yaml.j2 +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/kueue_gke_default_topology.yaml.j2 +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/kueue_sub_slicing_topology.yaml.j2 +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/kueue_super_slicing_topology.yaml.j2 +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/mtc-cpc.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/console.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/console_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/execution_context.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/feature_flags.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/file.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/kubectl.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/kueue.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/network.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/objects.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/templates.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/topology.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/topology_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/user_agent.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/user_agent_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/user_input.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/user_input_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/validation.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/validation_test.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/versions.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk.egg-info/requires.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/src/xpk.egg-info/top_level.txt +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/tools/Dockerfile-kjob +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/tools/build-kjob.sh +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/tools/install-gke-auth-plugin.sh +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/tools/install-xpk.sh +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/xpk-large-scale-guide.sh +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/xpk-notebooks.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/xpk-slurm-commands.md +0 -0
- {xpk-0.17.0 → xpk-0.17.2}/xpk.py +0 -0
|
@@ -35,7 +35,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94
|
|
|
35
35
|
docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
|
|
36
36
|
[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
|
|
37
37
|
docker push gcr.io/golden-project/dry-run-runner:prefix-current
|
|
38
|
-
[XPK] Temp file (
|
|
38
|
+
[XPK] Temp file (9c33f52b12eab63cfb9ce1aba6ff74f4642cb9e40e2a0ad9517d189ee41f09a5) content:
|
|
39
39
|
apiVersion: jobset.x-k8s.io/v1alpha2
|
|
40
40
|
kind: JobSet
|
|
41
41
|
metadata:
|
|
@@ -65,6 +65,7 @@ spec:
|
|
|
65
65
|
podFailurePolicy:
|
|
66
66
|
rules:
|
|
67
67
|
- action: FailJob
|
|
68
|
+
onPodConditions: []
|
|
68
69
|
onExitCodes:
|
|
69
70
|
containerName: jax-tpu
|
|
70
71
|
operator: NotIn
|
|
@@ -145,7 +146,7 @@ spec:
|
|
|
145
146
|
|
|
146
147
|
|
|
147
148
|
[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
|
|
148
|
-
kubectl apply -f
|
|
149
|
+
kubectl apply -f 9c33f52b12eab63cfb9ce1aba6ff74f4642cb9e40e2a0ad9517d189ee41f09a5
|
|
149
150
|
[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
|
|
150
151
|
gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
|
|
151
152
|
[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.
|
|
@@ -39,7 +39,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94
|
|
|
39
39
|
docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
|
|
40
40
|
[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
|
|
41
41
|
docker push gcr.io/golden-project/dry-run-runner:prefix-current
|
|
42
|
-
[XPK] Temp file (
|
|
42
|
+
[XPK] Temp file (8d5155a477cf99bc463104e0b22de0d21ee90548f51297fe429cdaa721d70a63) content:
|
|
43
43
|
apiVersion: jobset.x-k8s.io/v1alpha2
|
|
44
44
|
kind: JobSet
|
|
45
45
|
metadata:
|
|
@@ -69,6 +69,7 @@ spec:
|
|
|
69
69
|
podFailurePolicy:
|
|
70
70
|
rules:
|
|
71
71
|
- action: FailJob
|
|
72
|
+
onPodConditions: []
|
|
72
73
|
onExitCodes:
|
|
73
74
|
containerName: jax-tpu
|
|
74
75
|
operator: NotIn
|
|
@@ -150,7 +151,7 @@ spec:
|
|
|
150
151
|
|
|
151
152
|
|
|
152
153
|
[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
|
|
153
|
-
kubectl apply -f
|
|
154
|
+
kubectl apply -f 8d5155a477cf99bc463104e0b22de0d21ee90548f51297fe429cdaa721d70a63
|
|
154
155
|
[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
|
|
155
156
|
gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
|
|
156
157
|
[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.
|
|
@@ -39,7 +39,7 @@ docker buildx build --platform=linux/amd64 -f 4b6736a12db8ea0f78ce793fd0d4ee0c94
|
|
|
39
39
|
docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
|
|
40
40
|
[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
|
|
41
41
|
docker push gcr.io/golden-project/dry-run-runner:prefix-current
|
|
42
|
-
[XPK] Temp file (
|
|
42
|
+
[XPK] Temp file (5c6c507500cfbde66c80baa4f3a642c49ec3501b383057e8b68595c4121e95aa) content:
|
|
43
43
|
apiVersion: jobset.x-k8s.io/v1alpha2
|
|
44
44
|
kind: JobSet
|
|
45
45
|
metadata:
|
|
@@ -69,6 +69,7 @@ spec:
|
|
|
69
69
|
podFailurePolicy:
|
|
70
70
|
rules:
|
|
71
71
|
- action: FailJob
|
|
72
|
+
onPodConditions: []
|
|
72
73
|
onExitCodes:
|
|
73
74
|
containerName: jax-tpu
|
|
74
75
|
operator: NotIn
|
|
@@ -149,7 +150,7 @@ spec:
|
|
|
149
150
|
|
|
150
151
|
|
|
151
152
|
[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
|
|
152
|
-
kubectl apply -f
|
|
153
|
+
kubectl apply -f 5c6c507500cfbde66c80baa4f3a642c49ec3501b383057e8b68595c4121e95aa
|
|
153
154
|
[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
|
|
154
155
|
gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
|
|
155
156
|
[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.
|
|
@@ -36,7 +36,7 @@ docker tag dry-run-runner gcr.io/golden-project/dry-run-runner:prefix-current
|
|
|
36
36
|
[XPK] Task: `Upload Docker Image` is implemented by the following command not running since it is a dry run.
|
|
37
37
|
docker push gcr.io/golden-project/dry-run-runner:prefix-current
|
|
38
38
|
[XPK] Workload golden-workload manifest written to /var/tmp/manifest.yaml
|
|
39
|
-
[XPK] Temp file (
|
|
39
|
+
[XPK] Temp file (9c33f52b12eab63cfb9ce1aba6ff74f4642cb9e40e2a0ad9517d189ee41f09a5) content:
|
|
40
40
|
apiVersion: jobset.x-k8s.io/v1alpha2
|
|
41
41
|
kind: JobSet
|
|
42
42
|
metadata:
|
|
@@ -66,6 +66,7 @@ spec:
|
|
|
66
66
|
podFailurePolicy:
|
|
67
67
|
rules:
|
|
68
68
|
- action: FailJob
|
|
69
|
+
onPodConditions: []
|
|
69
70
|
onExitCodes:
|
|
70
71
|
containerName: jax-tpu
|
|
71
72
|
operator: NotIn
|
|
@@ -146,7 +147,7 @@ spec:
|
|
|
146
147
|
|
|
147
148
|
|
|
148
149
|
[XPK] Task: `Creating Workload` is implemented by the following command not running since it is a dry run.
|
|
149
|
-
kubectl apply -f
|
|
150
|
+
kubectl apply -f 9c33f52b12eab63cfb9ce1aba6ff74f4642cb9e40e2a0ad9517d189ee41f09a5
|
|
150
151
|
[XPK] Task: `GKE Dashboard List` is implemented by the following command not running since it is a dry run.
|
|
151
152
|
gcloud monitoring dashboards list --project=golden-project --filter="displayName:'GKE - TPU Monitoring Dashboard'" --format="value(name)" --verbosity=error
|
|
152
153
|
[XPK] Check statistics and outlier mode of GKE metrics here: https://console.cloud.google.com/monitoring/dashboards/builder/0?project=golden-project&f.rlabel.cluster_name.ClusterName=golden-cluster. To view the metric data for your workload, select golden-workload from the JobName filter on the dashboard.
|
|
@@ -23,7 +23,6 @@ from kubernetes.client.rest import ApiException
|
|
|
23
23
|
|
|
24
24
|
from ..core import gcsfuse
|
|
25
25
|
from ..core.cluster import (
|
|
26
|
-
DEFAULT_NAMESPACE,
|
|
27
26
|
add_zone_and_project,
|
|
28
27
|
get_cluster_network,
|
|
29
28
|
setup_k8s_env,
|
|
@@ -35,12 +34,6 @@ from ..core.cluster import (
|
|
|
35
34
|
update_cluster_with_workload_identity_if_necessary,
|
|
36
35
|
)
|
|
37
36
|
from ..core.filestore import FilestoreClient, get_storage_class_name
|
|
38
|
-
from ..core.kjob import (
|
|
39
|
-
KJOB_API_GROUP_NAME,
|
|
40
|
-
KJOB_API_GROUP_VERSION,
|
|
41
|
-
KJOB_API_VOLUME_BUNDLE_PLURAL,
|
|
42
|
-
create_volume_bundle_instance,
|
|
43
|
-
)
|
|
44
37
|
from ..core.storage import (
|
|
45
38
|
GCP_FILESTORE_TYPE,
|
|
46
39
|
GCS_FUSE_TYPE,
|
|
@@ -98,9 +91,6 @@ def storage_create(args: Namespace) -> None:
|
|
|
98
91
|
|
|
99
92
|
k8s_api_client = setup_k8s_env(args)
|
|
100
93
|
create_storage_crds(k8s_api_client, args, manifest)
|
|
101
|
-
create_volume_bundle_instance(
|
|
102
|
-
k8s_api_client, args.name, manifest, args.readonly, args.mount_point
|
|
103
|
-
)
|
|
104
94
|
# Not required for Filestore. Will be uncommented when adding GCSFuse create
|
|
105
95
|
# return_code = update_cluster_with_workload_identity_if_necessary(args)
|
|
106
96
|
# if return_code > 0:
|
|
@@ -214,9 +204,6 @@ def storage_attach(args: Namespace) -> None:
|
|
|
214
204
|
|
|
215
205
|
k8s_api_client = setup_k8s_env(args)
|
|
216
206
|
create_storage_crds(k8s_api_client, args, manifest)
|
|
217
|
-
create_volume_bundle_instance(
|
|
218
|
-
k8s_api_client, args.name, manifest, args.readonly, args.mount_point
|
|
219
|
-
)
|
|
220
207
|
|
|
221
208
|
enable_csi_drivers_if_necessary(args)
|
|
222
209
|
|
|
@@ -332,18 +319,6 @@ def delete_storage_resources(k8s_api_client: ApiClient, storage: Storage):
|
|
|
332
319
|
"Storage Class",
|
|
333
320
|
)
|
|
334
321
|
|
|
335
|
-
delete_resource(
|
|
336
|
-
lambda name: api_instance.delete_namespaced_custom_object(
|
|
337
|
-
namespace=DEFAULT_NAMESPACE,
|
|
338
|
-
name=name,
|
|
339
|
-
group=KJOB_API_GROUP_NAME,
|
|
340
|
-
version=KJOB_API_GROUP_VERSION,
|
|
341
|
-
plural=KJOB_API_VOLUME_BUNDLE_PLURAL,
|
|
342
|
-
),
|
|
343
|
-
storage.name,
|
|
344
|
-
"VolumeBundle",
|
|
345
|
-
)
|
|
346
|
-
|
|
347
322
|
delete_resource(
|
|
348
323
|
lambda name: api_instance.delete_cluster_custom_object(
|
|
349
324
|
name=name,
|
|
@@ -717,10 +717,8 @@ def get_cluster_credentials(args) -> int:
|
|
|
717
717
|
location=location,
|
|
718
718
|
dns_endpoint=True,
|
|
719
719
|
)
|
|
720
|
-
if return_code != 0:
|
|
721
|
-
return return_code
|
|
722
720
|
|
|
723
|
-
if not _are_credentials_valid():
|
|
721
|
+
if return_code != 0 or not _are_credentials_valid():
|
|
724
722
|
xpk_print('Detected error. Retrying without --dns-endpoint flag...')
|
|
725
723
|
return_code = _get_credentials(
|
|
726
724
|
project=args.project,
|
|
@@ -180,157 +180,6 @@ def add_global_arguments(custom_parser_or_group: ParserOrArgumentGroup):
|
|
|
180
180
|
)
|
|
181
181
|
|
|
182
182
|
|
|
183
|
-
def add_slurm_arguments(custom_parser_or_group: ParserOrArgumentGroup):
|
|
184
|
-
"""Add Slurm job arguments to the parser.
|
|
185
|
-
|
|
186
|
-
Args:
|
|
187
|
-
custom_parser_or_group: parser or argument group to add global arguments to.
|
|
188
|
-
"""
|
|
189
|
-
custom_parser_or_group.add_argument(
|
|
190
|
-
'--ignore-unknown-flags',
|
|
191
|
-
type=bool,
|
|
192
|
-
action=argparse.BooleanOptionalAction,
|
|
193
|
-
default=False,
|
|
194
|
-
help='Ignore all the unsupported flags in the bash script.',
|
|
195
|
-
)
|
|
196
|
-
custom_parser_or_group.add_argument(
|
|
197
|
-
'-a',
|
|
198
|
-
'--array',
|
|
199
|
-
type=str,
|
|
200
|
-
default=None,
|
|
201
|
-
help=(
|
|
202
|
-
'Submit a job array, multiple jobs to be executed with identical'
|
|
203
|
-
' parameters. The indexes specification identifies what array index'
|
|
204
|
-
' values should be used. For example, "--array=0-15" or'
|
|
205
|
-
' "--array=0,6,16-32". Multiple values may be specified using a comma'
|
|
206
|
-
' separated list and/or a range of values with a "-" separator. For'
|
|
207
|
-
' example "--array=0-15%%4" will limit the number of simultaneously'
|
|
208
|
-
' running tasks from this job array to 4. The minimum index value is'
|
|
209
|
-
' 0. The maximum index value is 2147483647.'
|
|
210
|
-
),
|
|
211
|
-
)
|
|
212
|
-
custom_parser_or_group.add_argument(
|
|
213
|
-
'-c',
|
|
214
|
-
'--cpus-per-task',
|
|
215
|
-
type=str,
|
|
216
|
-
default=None,
|
|
217
|
-
help='How much cpus a container inside a pod requires.',
|
|
218
|
-
)
|
|
219
|
-
custom_parser_or_group.add_argument(
|
|
220
|
-
'--gpus-per-task',
|
|
221
|
-
type=str,
|
|
222
|
-
default=None,
|
|
223
|
-
help='How much gpus a container inside a pod requires.',
|
|
224
|
-
)
|
|
225
|
-
custom_parser_or_group.add_argument(
|
|
226
|
-
'--mem',
|
|
227
|
-
type=str,
|
|
228
|
-
default=None,
|
|
229
|
-
help='How much memory a pod requires.',
|
|
230
|
-
)
|
|
231
|
-
custom_parser_or_group.add_argument(
|
|
232
|
-
'--mem-per-task',
|
|
233
|
-
type=str,
|
|
234
|
-
default=None,
|
|
235
|
-
help='How much memory a container requires.',
|
|
236
|
-
)
|
|
237
|
-
custom_parser_or_group.add_argument(
|
|
238
|
-
'--mem-per-cpu',
|
|
239
|
-
type=str,
|
|
240
|
-
default=None,
|
|
241
|
-
help=(
|
|
242
|
-
'How much memory a container requires, it multiplies the number '
|
|
243
|
-
'of requested cpus per task by mem-per-cpu.'
|
|
244
|
-
),
|
|
245
|
-
)
|
|
246
|
-
custom_parser_or_group.add_argument(
|
|
247
|
-
'--mem-per-gpu',
|
|
248
|
-
type=str,
|
|
249
|
-
default=None,
|
|
250
|
-
help=(
|
|
251
|
-
'How much memory a container requires, it multiplies the number '
|
|
252
|
-
'of requested gpus per task by mem-per-gpu.'
|
|
253
|
-
),
|
|
254
|
-
)
|
|
255
|
-
custom_parser_or_group.add_argument(
|
|
256
|
-
'-N',
|
|
257
|
-
'--nodes',
|
|
258
|
-
type=int,
|
|
259
|
-
default=None,
|
|
260
|
-
help='Number of pods to be used at a time.',
|
|
261
|
-
)
|
|
262
|
-
custom_parser_or_group.add_argument(
|
|
263
|
-
'-n',
|
|
264
|
-
'--ntasks',
|
|
265
|
-
type=int,
|
|
266
|
-
default=None,
|
|
267
|
-
help='Number of identical containers inside of a pod, usually 1.',
|
|
268
|
-
)
|
|
269
|
-
custom_parser_or_group.add_argument(
|
|
270
|
-
'-o',
|
|
271
|
-
'--output',
|
|
272
|
-
type=str,
|
|
273
|
-
default=None,
|
|
274
|
-
help=(
|
|
275
|
-
'Where to redirect the standard output stream of a task. If not'
|
|
276
|
-
' passed it proceeds to stdout, and is available via kubectl logs.'
|
|
277
|
-
),
|
|
278
|
-
)
|
|
279
|
-
custom_parser_or_group.add_argument(
|
|
280
|
-
'-e',
|
|
281
|
-
'--error',
|
|
282
|
-
type=str,
|
|
283
|
-
default=None,
|
|
284
|
-
help=(
|
|
285
|
-
'Where to redirect std error stream of a task. If not passed it'
|
|
286
|
-
' proceeds to stdout, and is available via kubectl logs.'
|
|
287
|
-
),
|
|
288
|
-
)
|
|
289
|
-
custom_parser_or_group.add_argument(
|
|
290
|
-
'--input',
|
|
291
|
-
type=str,
|
|
292
|
-
default=None,
|
|
293
|
-
help='What to pipe into the script.',
|
|
294
|
-
)
|
|
295
|
-
custom_parser_or_group.add_argument(
|
|
296
|
-
'-J',
|
|
297
|
-
'--job-name',
|
|
298
|
-
type=str,
|
|
299
|
-
default=None,
|
|
300
|
-
help='What is the job name.',
|
|
301
|
-
)
|
|
302
|
-
custom_parser_or_group.add_argument(
|
|
303
|
-
'-D',
|
|
304
|
-
'--chdir',
|
|
305
|
-
type=str,
|
|
306
|
-
default=None,
|
|
307
|
-
help='Change directory before executing the script.',
|
|
308
|
-
)
|
|
309
|
-
custom_parser_or_group.add_argument(
|
|
310
|
-
'-t',
|
|
311
|
-
'--time',
|
|
312
|
-
type=str,
|
|
313
|
-
default=None,
|
|
314
|
-
help=(
|
|
315
|
-
'Set a limit on the total run time of the job. '
|
|
316
|
-
'A time limit of zero requests that no time limit be imposed. '
|
|
317
|
-
'Acceptable time formats include "minutes", "minutes:seconds", '
|
|
318
|
-
'"hours:minutes:seconds", "days-hours", "days-hours:minutes" '
|
|
319
|
-
'and "days-hours:minutes:seconds".'
|
|
320
|
-
),
|
|
321
|
-
)
|
|
322
|
-
custom_parser_or_group.add_argument(
|
|
323
|
-
'--priority',
|
|
324
|
-
type=str,
|
|
325
|
-
default='medium',
|
|
326
|
-
choices=['very-low', 'low', 'medium', 'high', 'very-high'],
|
|
327
|
-
help=(
|
|
328
|
-
'A priority, one of `very-low`, `low`, `medium`, `high` or'
|
|
329
|
-
' `very-high`. Defaults to `medium`.'
|
|
330
|
-
),
|
|
331
|
-
)
|
|
332
|
-
|
|
333
|
-
|
|
334
183
|
def add_tpu_type_argument(
|
|
335
184
|
custom_parser_or_group: ParserOrArgumentGroup,
|
|
336
185
|
required: bool = False,
|
|
@@ -182,7 +182,6 @@ src/xpk/core/gcloud_context_test.py
|
|
|
182
182
|
src/xpk/core/gcluster_manager.py
|
|
183
183
|
src/xpk/core/gcsfuse.py
|
|
184
184
|
src/xpk/core/jobset.py
|
|
185
|
-
src/xpk/core/kjob.py
|
|
186
185
|
src/xpk/core/kueue_manager.py
|
|
187
186
|
src/xpk/core/kueue_manager_test.py
|
|
188
187
|
src/xpk/core/monitoring.py
|
|
@@ -262,7 +261,6 @@ src/xpk/templates/kueue_sub_slicing_topology.yaml.j2
|
|
|
262
261
|
src/xpk/templates/kueue_super_slicing_topology.yaml.j2
|
|
263
262
|
src/xpk/templates/mtc-cpc.yaml
|
|
264
263
|
src/xpk/templates/storage.yaml
|
|
265
|
-
src/xpk/templates/volume_bundle.yaml
|
|
266
264
|
src/xpk/utils/__init__.py
|
|
267
265
|
src/xpk/utils/console.py
|
|
268
266
|
src/xpk/utils/console_test.py
|