skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/core.py
CHANGED
@@ -1,6 +1,4 @@
 """SDK functions for cluster/job management."""
-import os
-import shlex
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -9,7 +7,6 @@ import colorama
 from sky import admin_policy
 from sky import backends
 from sky import catalog
-from sky import check as sky_check
 from sky import clouds
 from sky import dag as dag_lib
 from sky import data
@@ -20,16 +17,18 @@ from sky import optimizer
 from sky import sky_logging
 from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.clouds import cloud as sky_cloud
 from sky.jobs.server import core as managed_jobs_core
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.schemas.api import responses
+from sky.server.requests import request_names
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
-from sky.skylet import log_lib
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
 from sky.utils import common
@@ -44,6 +43,9 @@ from sky.utils.kubernetes import kubernetes_deploy_utils
 
 if typing.TYPE_CHECKING:
     from sky import resources as resources_lib
+    from sky.schemas.generated import jobsv1_pb2
+else:
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -83,7 +85,9 @@ def optimize(
     # but we do not apply the admin policy there. We should apply the admin
     # policy in the optimizer, but that will require some refactoring.
     with admin_policy_utils.apply_and_use_config_in_current_request(
-            dag,
+            dag,
+            request_name=request_names.AdminPolicyRequestName.OPTIMIZE,
+            request_options=request_options) as dag:
         dag.resolve_and_validate_volumes()
         return optimizer.Optimizer.optimize(dag=dag,
                                             minimize=minimize,
@@ -97,6 +101,8 @@ def status(
     refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
     all_users: bool = False,
    include_credentials: bool = False,
+    summary_response: bool = False,
+    include_handle: bool = True,
 ) -> List[responses.StatusResponse]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets cluster statuses.
@@ -176,16 +182,25 @@ def status(
         refresh=refresh,
         cluster_names=cluster_names,
         all_users=all_users,
-        include_credentials=include_credentials
-
-
-
+        include_credentials=include_credentials,
+        summary_response=summary_response,
+        include_handle=include_handle)
+
+    status_responses = []
+    for cluster in clusters:
+        try:
+            status_responses.append(
+                responses.StatusResponse.model_validate(cluster))
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('Failed to validate status responses for cluster '
+                           f'{cluster.get("name")}: {e}')
+    return status_responses
 
 
 def status_kubernetes(
 ) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
            List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
-           List[
+           List[responses.ManagedJobRecord], Optional[str]]:
     """Gets all SkyPilot clusters and jobs in the Kubernetes cluster.
 
     Managed jobs and services are also included in the clusters returned.
@@ -260,6 +275,7 @@ all_clusters, unmanaged_clusters, all_jobs, context
         kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
         for c in unmanaged_clusters
     ]
+    all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
     return all_clusters, unmanaged_clusters, all_jobs, context
 
 
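A minimal sketch of how the extended status() API above might be called. The keyword names and the List[responses.StatusResponse] return type come from the hunk; the import path and the response fields accessed below are assumptions for illustration.

from sky import core

# New keyword arguments (defaults: summary_response=False, include_handle=True);
# the call now returns validated responses.StatusResponse models.
clusters = core.status(summary_response=True, include_handle=False)
for record in clusters:
    # 'name' and 'status' are assumed StatusResponse fields.
    print(record.name, record.status)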
@@ -288,7 +304,10 @@ def endpoints(cluster: str,
 
 
 @usage_lib.entrypoint
-def cost_report(
+def cost_report(
+        days: Optional[int] = None,
+        dashboard_summary_response: bool = False,
+        cluster_hashes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Get all cluster cost reports, including those that have been downed.
 
@@ -334,7 +353,12 @@ def cost_report(days: Optional[int] = None) -> List[Dict[str, Any]]:
     if days is None:
         days = constants.COST_REPORT_DEFAULT_DAYS
 
-
+    abbreviate_response = dashboard_summary_response and cluster_hashes is None
+
+    cluster_reports = global_user_state.get_clusters_from_history(
+        days=days,
+        abbreviate_response=abbreviate_response,
+        cluster_hashes=cluster_hashes)
     logger.debug(
         f'{len(cluster_reports)} clusters found from history with {days} days.')
 
@@ -352,43 +376,6 @@ def cost_report(days: Optional[int] = None) -> List[Dict[str, Any]]:
         cost = (launched_resources.get_cost(duration) * launched_nodes)
         return cost
 
-    def _update_record_with_resources(record: Dict[str, Any]) -> None:
-        """Add resource fields for dashboard compatibility."""
-        if record is None:
-            return
-        resources = record.get('resources')
-        if resources is None:
-            return
-        fields = ['cloud', 'region', 'cpus', 'memory', 'accelerators']
-        for field in fields:
-            try:
-                record[field] = str(getattr(resources, field))
-            except Exception as e:  # pylint: disable=broad-except
-                # Ok to skip the fields as this is just for display
-                # purposes.
-                logger.debug(f'Failed to get resources.{field} for cluster '
-                             f'{record["name"]}: {str(e)}')
-                record[field] = None
-
-        # Add resources_str and resources_str_full for dashboard
-        # compatibility
-        num_nodes = record.get('num_nodes', 1)
-        try:
-            resource_str_simple = resources_utils.format_resource(
-                resources, simplify=True)
-            resource_str_full = resources_utils.format_resource(
-                resources, simplify=False)
-            record['resources_str'] = f'{num_nodes}x{resource_str_simple}'
-            record[
-                'resources_str_full'] = f'{num_nodes}x{resource_str_full}'
-        except Exception as e:  # pylint: disable=broad-except
-            logger.debug(f'Failed to get resources_str for cluster '
-                         f'{record["name"]}: {str(e)}')
-            for field in fields:
-                record[field] = None
-            record['resources_str'] = '-'
-            record['resources_str_full'] = '-'
-
     try:
         report['total_cost'] = get_total_cost(report)
     except Exception as e:  # pylint: disable=broad-except
@@ -397,17 +384,62 @@ def cost_report(days: Optional[int] = None) -> List[Dict[str, Any]]:
                      f'{report["name"]}: {str(e)}')
         report['total_cost'] = 0.0
 
-        _update_record_with_resources(report)
         return report
 
     # Process clusters in parallel
     if not cluster_reports:
         return []
 
-
-
+    if not abbreviate_response:
+        cluster_reports = subprocess_utils.run_in_parallel(
+            _process_cluster_report, cluster_reports)
+
+    def _update_record_with_resources(record: Dict[str, Any]) -> None:
+        """Add resource fields for dashboard compatibility."""
+        if record is None:
+            return
+        resources = record.get('resources')
+        if resources is None:
+            return
+        if not dashboard_summary_response:
+            fields = ['cloud', 'region', 'cpus', 'memory', 'accelerators']
+        else:
+            fields = ['cloud']
+        for field in fields:
+            try:
+                record[field] = str(getattr(resources, field))
+            except Exception as e:  # pylint: disable=broad-except
+                # Ok to skip the fields as this is just for display
+                # purposes.
+                logger.debug(f'Failed to get resources.{field} for cluster '
+                             f'{record["name"]}: {str(e)}')
+                record[field] = None
 
-
+        # Add resources_str and resources_str_full for dashboard
+        # compatibility
+        num_nodes = record.get('num_nodes', 1)
+        try:
+            resource_str_simple, resource_str_full = (
+                resources_utils.format_resource(resources,
+                                                simplified_only=False))
+            record['resources_str'] = f'{num_nodes}x{resource_str_simple}'
+            record['resources_str_full'] = f'{num_nodes}x{resource_str_full}'
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Failed to get resources_str for cluster '
+                         f'{record["name"]}: {str(e)}')
+            for field in fields:
+                record[field] = None
+            record['resources_str'] = '-'
+            record['resources_str_full'] = '-'
+
+    for report in cluster_reports:
+        _update_record_with_resources(report)
+        if dashboard_summary_response:
+            report.pop('usage_intervals')
+            report.pop('user_hash')
+            report.pop('resources')
+
+    return cluster_reports
 
 
 def _start(
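Illustrative calls against the reworked cost_report() signature above; the parameter names are taken from the hunk, the values are arbitrary, and the comments restate what the added code does.

from sky import core

# Full per-cluster cost report for the last 30 days.
reports = core.cost_report(days=30)

# Dashboard summary mode: resource fields are reduced to 'cloud' and the
# 'usage_intervals', 'user_hash' and 'resources' entries are dropped.
summary = core.cost_report(days=30, dashboard_summary_response=True)

# Restrict the report to specific clusters by hash.
subset = core.cost_report(days=7, cluster_hashes=['<cluster-hash>'])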
@@ -466,6 +498,32 @@ def _start(
             controller_autostop_config.enabled):
         idle_minutes_to_autostop = controller_autostop_config.idle_minutes
         down = controller_autostop_config.down
+    else:
+        # For non-controller clusters, restore autostop configuration from
+        # database if not explicitly provided.
+        if idle_minutes_to_autostop is None:
+            cluster_record = global_user_state.get_cluster_from_name(
+                cluster_name, include_user_info=False, summary_response=True)
+            if cluster_record is not None:
+                stored_autostop = cluster_record.get('autostop', -1)
+                stored_to_down = cluster_record.get('to_down', False)
+                # Restore autostop if it was previously set (autostop > 0)
+                if stored_autostop > 0:
+                    logger.warning(f'Restoring cluster {cluster_name!r} with '
+                                   f'autostop set to {stored_autostop} minutes'
+                                   f'. To turn off autostop, run: '
+                                   f'`sky autostop {cluster_name} --cancel`')
+                    idle_minutes_to_autostop = stored_autostop
+                    # Only restore 'down' if it was explicitly set and we're
+                    # restoring autostop
+                    if stored_to_down:
+                        down = stored_to_down
+                elif stored_autostop == 0:
+                    logger.warning(
+                        f'Autostop was previously set to 0 minutes '
+                        f'for cluster {cluster_name!r} so it will '
+                        'not be restored. To turn on autostop, run: '
+                        f'`sky autostop {cluster_name} -i <minutes>`')
 
     usage_lib.record_cluster_name_for_current_operation(cluster_name)
 
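The hunk above makes restarting a stopped cluster re-apply its stored autostop setting when none is passed explicitly. A hedged usage sketch, assuming the public start() wrapper in this module keeps its existing idle_minutes_to_autostop keyword:

from sky import core

# If 'mycluster' previously had `sky autostop mycluster -i 30` applied, a plain
# restart now restores the 30-minute autostop (a warning explains how to cancel).
core.start('mycluster')

# Passing an explicit value still takes precedence over the stored one.
core.start('mycluster', idle_minutes_to_autostop=60)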
@@ -773,7 +831,7 @@ def autostop(
 @usage_lib.entrypoint
 def queue(cluster_name: str,
           skip_finished: bool = False,
-          all_users: bool = False) -> List[
+          all_users: bool = False) -> List[responses.ClusterJobRecord]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets the job queue of a cluster.
 
@@ -811,7 +869,6 @@ def queue(cluster_name: str,
         user_hash = None
     else:
         user_hash = common_utils.get_current_user().id
-    code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
 
     handle = backend_utils.check_cluster_available(
         cluster_name,
@@ -819,18 +876,49 @@ def queue(cluster_name: str,
     )
     backend = backend_utils.get_backend_from_handle(handle)
 
-
-
-
-
-
-
-
-
-
-
-
-
+    use_legacy = not handle.is_grpc_enabled_with_flag
+
+    if not use_legacy:
+        try:
+            request = jobsv1_pb2.GetJobQueueRequest(user_hash=user_hash,
+                                                    all_jobs=all_jobs)
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).get_job_queue(request))
+            jobs = []
+            for job_info in response.jobs:
+                job_dict = {
+                    'job_id': job_info.job_id,
+                    'job_name': job_info.job_name,
+                    'submitted_at': job_info.submitted_at,
+                    'status': job_lib.JobStatus.from_protobuf(job_info.status),
+                    'run_timestamp': job_info.run_timestamp,
+                    'start_at': job_info.start_at
+                                if job_info.HasField('start_at') else None,
+                    'end_at': job_info.end_at
+                              if job_info.HasField('end_at') else None,
+                    'resources': job_info.resources,
+                    'log_path': job_info.log_path,
+                    'user_hash': job_info.username,
+                }
+                # Copied from job_lib.load_job_queue.
+                user = global_user_state.get_user(job_dict['user_hash'])
+                job_dict['username'] = user.name if user is not None else None
+                jobs.append(job_dict)
+        except exceptions.SkyletMethodNotImplementedError:
+            use_legacy = True
+    if use_legacy:
+        code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
+        returncode, jobs_payload, stderr = backend.run_on_head(
+            handle, code, require_outputs=True, separate_stderr=True)
+        subprocess_utils.handle_returncode(
+            returncode,
+            command=code,
+            error_msg=f'Failed to get job queue on cluster {cluster_name}.',
+            stderr=f'{jobs_payload + stderr}',
+            stream_logs=True)
+        jobs = job_lib.load_job_queue(jobs_payload)
+    return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
 
 
 @usage_lib.entrypoint
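queue() above now tries the Skylet gRPC path (jobsv1_pb2.GetJobQueueRequest) and falls back to the legacy code-gen path on SkyletMethodNotImplementedError, returning typed records either way. A hedged caller-side sketch; the record fields printed below are assumptions based on the dict keys in the hunk.

from sky import core

jobs = core.queue('mycluster', skip_finished=True, all_users=False)
for job in jobs:  # List[responses.ClusterJobRecord]
    print(job.job_id, job.status)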
@@ -1070,25 +1158,25 @@ def job_status(cluster_name: str,
 # = Storage Management =
 # ======================
 @usage_lib.entrypoint
-def storage_ls() -> List[
+def storage_ls() -> List[responses.StorageRecord]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets the storages.
 
     Returns:
-        [
-            {
-                'name': str,
-                'launched_at': int timestamp of creation,
-                'store': List[sky.StoreType],
-                'last_use': int timestamp of last use,
-                'status': sky.StorageStatus,
-            }
-        ]
+        List[responses.StorageRecord]: A list of storage records.
     """
     storages = global_user_state.get_storage()
+    storage_records = []
     for storage in storages:
-
-
+        storage_records.append(
+            responses.StorageRecord(
+                name=storage['name'],
+                launched_at=storage['launched_at'],
+                store=list(storage.pop('handle').sky_stores.keys()),
+                last_use=storage['last_use'],
+                status=storage['status'],
+            ))
+    return storage_records
 
 
 @usage_lib.entrypoint
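storage_ls() above now returns responses.StorageRecord models instead of plain dicts. A small hedged sketch; the attribute access assumes the record exposes the same fields it is constructed from (name, launched_at, store, last_use, status).

from sky import core

for record in core.storage_ls():
    print(record.name, record.store, record.status)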
@@ -1104,9 +1192,7 @@ def storage_delete(name: str) -> None:
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
     else:
-        storage_object = data.Storage(
-            source=handle.source,
-            sync_on_reconstruction=False)
+        storage_object = data.Storage.from_handle(handle)
         storage_object.delete()
 
 
@@ -1125,6 +1211,7 @@ def enabled_clouds(workspace: Optional[str] = None,
         return [cloud.canonical_name() for cloud in cached_clouds]
     enabled_ssh_infras = []
     enabled_k8s_infras = []
+    enabled_slurm_infras = []
     enabled_cloud_infras = []
     for cloud in cached_clouds:
         cloud_infra = cloud.expand_infras()
@@ -1132,10 +1219,16 @@ def enabled_clouds(workspace: Optional[str] = None,
             enabled_ssh_infras.extend(cloud_infra)
         elif isinstance(cloud, clouds.Kubernetes):
             enabled_k8s_infras.extend(cloud_infra)
+        elif isinstance(cloud, clouds.Slurm):
+            enabled_slurm_infras.extend(cloud_infra)
         else:
             enabled_cloud_infras.extend(cloud_infra)
+    # We do not sort slurm infras alphabetically because the
+    # default partition should appear first.
+    # Ordering of slurm infras is enforced in Slurm implementation.
     all_infras = sorted(enabled_ssh_infras) + sorted(
-        enabled_k8s_infras) + sorted(
+        enabled_k8s_infras) + enabled_slurm_infras + sorted(
+            enabled_cloud_infras)
     return all_infras
 
 
@@ -1146,7 +1239,14 @@ def realtime_kubernetes_gpu_availability(
     quantity_filter: Optional[int] = None,
     is_ssh: Optional[bool] = None
 ) -> List[Tuple[str, List[models.RealtimeGpuAvailability]]]:
+    """Gets the real-time Kubernetes GPU availability.
 
+    Returns:
+        A list of tuples, where each tuple contains:
+        - context (str): The Kubernetes context.
+        - availability_list (List[models.RealtimeGpuAvailability]): A list
+          of RealtimeGpuAvailability objects for that context.
+    """
     if context is None:
         # Include contexts from both Kubernetes and SSH clouds
         kubernetes_contexts = clouds.Kubernetes.existing_allowed_contexts()
@@ -1228,132 +1328,133 @@ def realtime_kubernetes_gpu_availability(
     return availability_lists
 
 
-
-
-
-
-
-
-
-             ssh_key: Optional[str],
-             cleanup: bool,
-             context_name: Optional[str] = None,
-             password: Optional[str] = None) -> None:
-    """Creates a local or remote cluster."""
-
-    def _validate_args(ips, ssh_user, ssh_key, cleanup):
-        # If any of --ips, --ssh-user, or --ssh-key-path is specified,
-        # all must be specified
-        if bool(ips) or bool(ssh_user) or bool(ssh_key):
-            if not (ips and ssh_user and ssh_key):
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'All ips, ssh_user, and ssh_key must be specified '
-                        'together.')
-
-        # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
-        # are all provided
-        if cleanup and not (ips and ssh_user and ssh_key):
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    'cleanup can only be used with ips, ssh_user and ssh_key.')
-
-    _validate_args(ips, ssh_user, ssh_key, cleanup)
-
-    # If remote deployment arguments are specified, run remote up script
-    if ips:
-        assert ssh_user is not None and ssh_key is not None
-        kubernetes_deploy_utils.deploy_remote_cluster(ips, ssh_user, ssh_key,
-                                                      cleanup, context_name,
-                                                      password)
-    else:
-        # Run local deployment (kind) if no remote args are specified
-        kubernetes_deploy_utils.deploy_local_cluster(gpus)
+def realtime_slurm_gpu_availability(
+        slurm_cluster_name: Optional[str] = None,
+        name_filter: Optional[str] = None,
+        quantity_filter: Optional[int] = None,
+        env_vars: Optional[Dict[str, str]] = None,
+        **kwargs) -> List[Tuple[str, List[models.RealtimeGpuAvailability]]]:
+    """Gets Slurm real-time GPU availability grouped by partition.
 
+    This function calls the Slurm backend to fetch GPU info.
 
-
-
-
+    Args:
+        name_filter: Optional name filter for GPUs.
+        quantity_filter: Optional quantity filter for GPUs.
+        env_vars: Environment variables (may be needed for backend).
+        kwargs: Additional keyword arguments.
 
-
-
-
+    Returns:
+        A list of tuples, where each tuple contains:
+        - partition_name (str): The name of the Slurm partition.
+        - availability_list (List[models.RealtimeGpuAvailability]): A list
+          of RealtimeGpuAvailability objects for that partition.
+        Example structure:
+        [
+            ('gpu_partition_1', [
+                RealtimeGpuAvailability(gpu='V100', counts=[4, 8],
+                                        capacity=16, available=10),
+                RealtimeGpuAvailability(gpu='A100', counts=[8],
+                                        capacity=8, available=0),
+            ]),
+            ('gpu_partition_2', [
+                RealtimeGpuAvailability(gpu='V100', counts=[4],
+                                        capacity=4, available=4),
+            ])
+        ]
 
-
-
+    Raises:
+        ValueError: If Slurm is not configured or no matching GPUs are found.
+        exceptions.NotSupportedError: If Slurm is not enabled or configured.
+    """
+    del env_vars, kwargs  # Currently unused
 
-
-
-
-
+    if slurm_cluster_name is None:
+        # Include contexts from both Kubernetes and SSH clouds
+        slurm_cluster_names = clouds.Slurm.existing_allowed_clusters()
+    else:
+        slurm_cluster_names = [slurm_cluster_name]
 
-
-
-
-
-
-
-            log_path=log_path,
-            require_outputs=True,
-            stream_logs=False,
-            cwd=cwd)
-    stderr = stderr.replace('No kind clusters found.\n', '')
-
-    if returncode == 0:
-        cluster_removed = True
-    elif returncode == 100:
-        logger.info(ux_utils.error_message('Local cluster does not exist.'))
-    else:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('Failed to create local cluster. '
-                               f'Stdout: {stdout}'
-                               f'\nError: {stderr}')
-    if cluster_removed:
-        # Run sky check
-        with rich_utils.safe_status(
-                ux_utils.spinner_message('Running sky check...')):
-            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
-                                       clouds=['kubernetes'],
-                                       quiet=True)
-        logger.info(
-            ux_utils.finishing_message('Local cluster removed.',
-                                       log_path=log_path,
-                                       is_local=True))
+    # Optional: Check if Slurm is enabled first
+    # enabled = global_user_state.get_enabled_clouds(
+    #     capability=sky_cloud.CloudCapability.COMPUTE)
+    # if not clouds.Slurm() in enabled:
+    #     raise exceptions.NotSupportedError(
+    #         "Slurm is not enabled. Run 'sky check' to enable it.")
 
+    def realtime_slurm_gpu_availability_single(
+            slurm_cluster_name: str) -> List[models.RealtimeGpuAvailability]:
+        try:
+            # This function now returns aggregated data per GPU type:
+            # Tuple[Dict[str, List[InstanceTypeInfo]], Dict[str, int],
+            #       Dict[str, int]]
+            # (qtys_map, total_capacity, total_available)
+            accelerator_counts, total_capacity, total_available = (
+                catalog.list_accelerator_realtime(
+                    gpus_only=True,  # Ensure we only query for GPUs
+                    name_filter=name_filter,
+                    # Pass None for region_filter here; filtering happens
+                    # inside if needed, but we want all partitions returned
+                    # for grouping.
+                    region_filter=slurm_cluster_name,
+                    quantity_filter=quantity_filter,
+                    clouds='slurm',
+                    case_sensitive=False,
+                ))
+        except exceptions.NotSupportedError as e:
+            logger.error(f'Failed to query Slurm GPU availability: {e}')
+            raise
+        except ValueError as e:
+            # Re-raise ValueError if no GPUs are found matching the filters
+            logger.error(f'Error querying Slurm GPU availability: {e}')
+            raise
+        except Exception as e:
+            logger.error(
+                'Error querying Slurm GPU availability: '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+            raise ValueError(
+                f'Error querying Slurm GPU availability: {e}') from e
 
-
-
-
+        # --- Format the output ---
+        realtime_gpu_availability_list: List[
+            models.RealtimeGpuAvailability] = []
+        for gpu_type, _ in sorted(accelerator_counts.items()):
+            realtime_gpu_availability_list.append(
+                models.RealtimeGpuAvailability(
+                    gpu_type,
+                    accelerator_counts.pop(gpu_type),
+                    total_capacity[gpu_type],
+                    total_available[gpu_type],
+                ))
+        return realtime_gpu_availability_list
 
-
-
-
-
-
-
-
-
-
+    parallel_queried = subprocess_utils.run_in_parallel(
+        realtime_slurm_gpu_availability_single, slurm_cluster_names)
+    availability_lists: List[Tuple[str,
+                                   List[models.RealtimeGpuAvailability]]] = []
+    for slurm_cluster_name, queried in zip(slurm_cluster_names,
+                                           parallel_queried):
+        if len(queried) == 0:
+            logger.debug(f'No gpus found in Slurm cluster {slurm_cluster_name}')
+            continue
+        availability_lists.append((slurm_cluster_name, queried))
+    return availability_lists
 
 
+# =================
+# = Local Cluster =
+# =================
 @usage_lib.entrypoint
-def
-
+def local_up(gpus: bool,
+             name: Optional[str] = None,
+             port_start: Optional[int] = None) -> None:
+    """Creates a local cluster."""
+    kubernetes_deploy_utils.deploy_local_cluster(name, port_start, gpus)
 
-    Args:
-        context_name: The SSH context name (e.g., 'ssh-my-cluster')
 
-
-
-
-        - reason: Explanation of the status
-    """
-    try:
-        is_ready, reason = clouds.SSH.check_single_context(context_name)
-        return is_ready, reason
-    except Exception as e:  # pylint: disable=broad-except
-        return False, ('Failed to check SSH context: '
-                       f'{common_utils.format_exception(e)}')
+def local_down(name: Optional[str] = None) -> None:
+    """Tears down the Kubernetes cluster started by local_up."""
+    kubernetes_deploy_utils.teardown_local_cluster(name)
 
 
 def get_all_contexts() -> List[str]: