skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/azure.py
CHANGED
|
@@ -15,6 +15,7 @@ from sky import exceptions
|
|
|
15
15
|
from sky import sky_logging
|
|
16
16
|
from sky import skypilot_config
|
|
17
17
|
from sky.adaptors import azure
|
|
18
|
+
from sky.adaptors import common as adaptors_common
|
|
18
19
|
from sky.clouds.utils import azure_utils
|
|
19
20
|
from sky.utils import annotations
|
|
20
21
|
from sky.utils import common_utils
|
|
@@ -86,7 +87,9 @@ class Azure(clouds.Cloud):
|
|
|
86
87
|
|
|
87
88
|
@classmethod
|
|
88
89
|
def _unsupported_features_for_resources(
|
|
89
|
-
cls,
|
|
90
|
+
cls,
|
|
91
|
+
resources: 'resources.Resources',
|
|
92
|
+
region: Optional[str] = None,
|
|
90
93
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
91
94
|
features = {
|
|
92
95
|
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
|
@@ -263,10 +266,15 @@ class Azure(clouds.Cloud):
|
|
|
263
266
|
return _DEFAULT_GPU_IMAGE_ID
|
|
264
267
|
|
|
265
268
|
@classmethod
|
|
266
|
-
def regions_with_offering(
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
269
|
+
def regions_with_offering(
|
|
270
|
+
cls,
|
|
271
|
+
instance_type: str,
|
|
272
|
+
accelerators: Optional[Dict[str, int]],
|
|
273
|
+
use_spot: bool,
|
|
274
|
+
region: Optional[str],
|
|
275
|
+
zone: Optional[str],
|
|
276
|
+
resources: Optional['resources.Resources'] = None,
|
|
277
|
+
) -> List[clouds.Region]:
|
|
270
278
|
del accelerators # unused
|
|
271
279
|
assert zone is None, 'Azure does not support zones'
|
|
272
280
|
regions = catalog.get_region_zones_for_instance_type(
|
|
@@ -546,6 +554,7 @@ class Azure(clouds.Cloud):
|
|
|
546
554
|
@classmethod
|
|
547
555
|
def _check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
|
548
556
|
"""Checks if the user has access credentials to this cloud."""
|
|
557
|
+
|
|
549
558
|
help_str = (
|
|
550
559
|
' Run the following commands:'
|
|
551
560
|
f'\n{cls._INDENT_PREFIX} $ az login'
|
|
@@ -561,6 +570,16 @@ class Azure(clouds.Cloud):
|
|
|
561
570
|
return (False,
|
|
562
571
|
f'{azure_token_cache_file} does not exist.' + help_str)
|
|
563
572
|
|
|
573
|
+
dependency_installation_hints = (
|
|
574
|
+
'Azure dependencies are not installed. '
|
|
575
|
+
'Run the following commands:'
|
|
576
|
+
f'\n{cls._INDENT_PREFIX} $ pip install skypilot[azure]'
|
|
577
|
+
f'\n{cls._INDENT_PREFIX}Credentials may also need to be set.')
|
|
578
|
+
# Check if the azure blob storage dependencies are installed.
|
|
579
|
+
if not adaptors_common.can_import_modules(
|
|
580
|
+
['azure.storage.blob', 'msgraph']):
|
|
581
|
+
return False, dependency_installation_hints
|
|
582
|
+
|
|
564
583
|
try:
|
|
565
584
|
_run_output('az --version')
|
|
566
585
|
except subprocess.CalledProcessError as e:
|
|
@@ -580,19 +599,6 @@ class Azure(clouds.Cloud):
|
|
|
580
599
|
return False, (f'Getting user\'s Azure identity failed.{help_str}\n'
|
|
581
600
|
f'{cls._INDENT_PREFIX}Details: '
|
|
582
601
|
f'{common_utils.format_exception(e)}')
|
|
583
|
-
|
|
584
|
-
# Check if the azure blob storage dependencies are installed.
|
|
585
|
-
try:
|
|
586
|
-
# pylint: disable=redefined-outer-name, import-outside-toplevel, unused-import
|
|
587
|
-
from azure.storage import blob
|
|
588
|
-
import msgraph
|
|
589
|
-
except ImportError as e:
|
|
590
|
-
return False, (
|
|
591
|
-
f'Azure blob storage depdencies are not installed. '
|
|
592
|
-
'Run the following commands:'
|
|
593
|
-
f'\n{cls._INDENT_PREFIX} $ pip install skypilot[azure]'
|
|
594
|
-
f'\n{cls._INDENT_PREFIX}Details: '
|
|
595
|
-
f'{common_utils.format_exception(e)}')
|
|
596
602
|
return True, None
|
|
597
603
|
|
|
598
604
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
sky/clouds/cloud.py
CHANGED
|
@@ -185,10 +185,15 @@ class Cloud:
|
|
|
185
185
|
#### Regions/Zones ####
|
|
186
186
|
|
|
187
187
|
@classmethod
|
|
188
|
-
def regions_with_offering(
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
188
|
+
def regions_with_offering(
|
|
189
|
+
cls,
|
|
190
|
+
instance_type: str,
|
|
191
|
+
accelerators: Optional[Dict[str, int]],
|
|
192
|
+
use_spot: bool,
|
|
193
|
+
region: Optional[str],
|
|
194
|
+
zone: Optional[str],
|
|
195
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
196
|
+
) -> List[Region]:
|
|
192
197
|
"""Returns the regions that offer the specified resources.
|
|
193
198
|
|
|
194
199
|
The order of the regions follow the order of the regions returned by
|
|
@@ -340,6 +345,14 @@ class Cloud:
|
|
|
340
345
|
"""Returns {acc: acc_count} held by 'instance_type', if any."""
|
|
341
346
|
raise NotImplementedError
|
|
342
347
|
|
|
348
|
+
@classmethod
|
|
349
|
+
def get_arch_from_instance_type(
|
|
350
|
+
cls,
|
|
351
|
+
instance_type: str,
|
|
352
|
+
) -> Optional[str]:
|
|
353
|
+
"""Returns the arch of the instance type, if any."""
|
|
354
|
+
raise NotImplementedError
|
|
355
|
+
|
|
343
356
|
@classmethod
|
|
344
357
|
def get_default_instance_type(cls,
|
|
345
358
|
cpus: Optional[str] = None,
|
|
@@ -666,8 +679,11 @@ class Cloud:
|
|
|
666
679
|
|
|
667
680
|
@classmethod
|
|
668
681
|
def check_features_are_supported(
|
|
669
|
-
|
|
670
|
-
|
|
682
|
+
cls,
|
|
683
|
+
resources: 'resources_lib.Resources',
|
|
684
|
+
requested_features: Set[CloudImplementationFeatures],
|
|
685
|
+
region: Optional[str] = None,
|
|
686
|
+
) -> None:
|
|
671
687
|
"""Errors out if the cloud does not support all requested features.
|
|
672
688
|
|
|
673
689
|
For instance, Lambda Cloud does not support stop, so
|
|
@@ -685,7 +701,7 @@ class Cloud:
|
|
|
685
701
|
requested features.
|
|
686
702
|
"""
|
|
687
703
|
unsupported_features2reason = cls._unsupported_features_for_resources(
|
|
688
|
-
resources)
|
|
704
|
+
resources, region)
|
|
689
705
|
|
|
690
706
|
# Docker image is not compatible with ssh proxy command.
|
|
691
707
|
if skypilot_config.get_effective_region_config(
|
|
@@ -715,7 +731,9 @@ class Cloud:
|
|
|
715
731
|
|
|
716
732
|
@classmethod
|
|
717
733
|
def _unsupported_features_for_resources(
|
|
718
|
-
cls,
|
|
734
|
+
cls,
|
|
735
|
+
resources: 'resources_lib.Resources',
|
|
736
|
+
region: Optional[str] = None,
|
|
719
737
|
) -> Dict[CloudImplementationFeatures, str]:
|
|
720
738
|
"""The features not supported based on the resources provided.
|
|
721
739
|
|
|
@@ -726,7 +744,7 @@ class Cloud:
|
|
|
726
744
|
A dict of {feature: reason} for the features not supported by the
|
|
727
745
|
cloud implementation.
|
|
728
746
|
"""
|
|
729
|
-
del resources
|
|
747
|
+
del resources, region
|
|
730
748
|
raise NotImplementedError
|
|
731
749
|
|
|
732
750
|
@classmethod
|
|
@@ -800,12 +818,21 @@ class Cloud:
|
|
|
800
818
|
if acc_from_instance_type is None:
|
|
801
819
|
return False
|
|
802
820
|
|
|
803
|
-
for
|
|
804
|
-
|
|
821
|
+
for requested_acc in acc_requested:
|
|
822
|
+
for instance_acc in acc_from_instance_type:
|
|
823
|
+
# The requested accelerator can be canonicalized based on
|
|
824
|
+
# the accelerator registry, which may not has the same case
|
|
825
|
+
# as the cloud's catalog, e.g., 'RTXPro6000' in Shadeform
|
|
826
|
+
# catalog, and 'RTXPRO6000' in RunPod catalog.
|
|
827
|
+
if requested_acc.lower() == instance_acc.lower():
|
|
828
|
+
# Found the requested accelerator in the instance type.
|
|
829
|
+
break
|
|
830
|
+
else:
|
|
831
|
+
# Requested accelerator not found in instance type.
|
|
805
832
|
return False
|
|
806
833
|
# Avoid float point precision issue.
|
|
807
|
-
if not math.isclose(acc_requested[
|
|
808
|
-
acc_from_instance_type[
|
|
834
|
+
if not math.isclose(acc_requested[requested_acc],
|
|
835
|
+
acc_from_instance_type[instance_acc]):
|
|
809
836
|
return False
|
|
810
837
|
return True
|
|
811
838
|
|
sky/clouds/cudo.py
CHANGED
|
@@ -5,6 +5,7 @@ from typing import Dict, Iterator, List, Optional, Tuple, Union
|
|
|
5
5
|
|
|
6
6
|
from sky import catalog
|
|
7
7
|
from sky import clouds
|
|
8
|
+
from sky.adaptors import common
|
|
8
9
|
from sky.utils import common_utils
|
|
9
10
|
from sky.utils import registry
|
|
10
11
|
from sky.utils import resources_utils
|
|
@@ -86,7 +87,9 @@ class Cudo(clouds.Cloud):
|
|
|
86
87
|
|
|
87
88
|
@classmethod
|
|
88
89
|
def _unsupported_features_for_resources(
|
|
89
|
-
cls,
|
|
90
|
+
cls,
|
|
91
|
+
resources: 'resources_lib.Resources',
|
|
92
|
+
region: Optional[str] = None,
|
|
90
93
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
91
94
|
"""The features not supported based on the resources provided.
|
|
92
95
|
|
|
@@ -105,10 +108,15 @@ class Cudo(clouds.Cloud):
|
|
|
105
108
|
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
|
106
109
|
|
|
107
110
|
@classmethod
|
|
108
|
-
def regions_with_offering(
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
111
|
+
def regions_with_offering(
|
|
112
|
+
cls,
|
|
113
|
+
instance_type,
|
|
114
|
+
accelerators: Optional[Dict[str, int]],
|
|
115
|
+
use_spot: bool,
|
|
116
|
+
region: Optional[str],
|
|
117
|
+
zone: Optional[str],
|
|
118
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
119
|
+
) -> List[clouds.Region]:
|
|
112
120
|
assert zone is None, 'Cudo does not support zones.'
|
|
113
121
|
del accelerators, zone # unused
|
|
114
122
|
if use_spot:
|
|
@@ -287,14 +295,9 @@ class Cudo(clouds.Cloud):
|
|
|
287
295
|
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
288
296
|
"""Checks if the user has access credentials to
|
|
289
297
|
Cudo's compute service."""
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
except (ImportError, subprocess.CalledProcessError) as e:
|
|
294
|
-
return False, (
|
|
295
|
-
f'{cls._DEPENDENCY_HINT}\n'
|
|
296
|
-
f'{cls._INDENT_PREFIX}'
|
|
297
|
-
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
298
|
+
if not common.can_import_modules(['cudo_compute']):
|
|
299
|
+
return False, (f'{cls._DEPENDENCY_HINT}\n'
|
|
300
|
+
f'{cls._INDENT_PREFIX}')
|
|
298
301
|
|
|
299
302
|
try:
|
|
300
303
|
_run_output('cudoctl --version')
|
sky/clouds/do.py
CHANGED
|
@@ -57,7 +57,9 @@ class DO(clouds.Cloud):
|
|
|
57
57
|
|
|
58
58
|
@classmethod
|
|
59
59
|
def _unsupported_features_for_resources(
|
|
60
|
-
cls,
|
|
60
|
+
cls,
|
|
61
|
+
resources: 'resources_lib.Resources',
|
|
62
|
+
region: Optional[str] = None,
|
|
61
63
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
62
64
|
"""The features not supported based on the resources provided.
|
|
63
65
|
|
|
@@ -83,6 +85,7 @@ class DO(clouds.Cloud):
|
|
|
83
85
|
use_spot: bool,
|
|
84
86
|
region: Optional[str],
|
|
85
87
|
zone: Optional[str],
|
|
88
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
86
89
|
) -> List[clouds.Region]:
|
|
87
90
|
assert zone is None, 'DO does not support zones.'
|
|
88
91
|
del accelerators, zone # unused
|
|
@@ -283,18 +286,17 @@ class DO(clouds.Cloud):
|
|
|
283
286
|
"""Verify that the user has valid credentials for
|
|
284
287
|
DO's compute service."""
|
|
285
288
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
return False, str(err)
|
|
289
|
+
installed, err_msg = do.check_exceptions_dependencies_installed()
|
|
290
|
+
if not installed:
|
|
291
|
+
return False, err_msg
|
|
290
292
|
|
|
291
293
|
try:
|
|
292
294
|
# attempt to make a CURL request for listing instances
|
|
293
295
|
do_utils.client().droplets.list()
|
|
294
|
-
except do.exceptions().HttpResponseError as err:
|
|
295
|
-
return False, str(err)
|
|
296
296
|
except do_utils.DigitalOceanError as err:
|
|
297
297
|
return False, str(err)
|
|
298
|
+
except do.exceptions().HttpResponseError as err:
|
|
299
|
+
return False, str(err)
|
|
298
300
|
|
|
299
301
|
return True, None
|
|
300
302
|
|
sky/clouds/fluidstack.py
CHANGED
|
@@ -73,7 +73,9 @@ class Fluidstack(clouds.Cloud):
|
|
|
73
73
|
|
|
74
74
|
@classmethod
|
|
75
75
|
def _unsupported_features_for_resources(
|
|
76
|
-
cls,
|
|
76
|
+
cls,
|
|
77
|
+
resources: 'resources_lib.Resources',
|
|
78
|
+
region: Optional[str] = None,
|
|
77
79
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
78
80
|
"""The features not supported based on the resources provided.
|
|
79
81
|
|
|
@@ -92,10 +94,15 @@ class Fluidstack(clouds.Cloud):
|
|
|
92
94
|
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
|
93
95
|
|
|
94
96
|
@classmethod
|
|
95
|
-
def regions_with_offering(
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
97
|
+
def regions_with_offering(
|
|
98
|
+
cls,
|
|
99
|
+
instance_type: str,
|
|
100
|
+
accelerators: Optional[Dict[str, int]],
|
|
101
|
+
use_spot: bool,
|
|
102
|
+
region: Optional[str],
|
|
103
|
+
zone: Optional[str],
|
|
104
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
105
|
+
) -> List[clouds.Region]:
|
|
99
106
|
assert zone is None, 'FluidStack does not support zones.'
|
|
100
107
|
del accelerators, zone # unused
|
|
101
108
|
if use_spot:
|
sky/clouds/gcp.py
CHANGED
|
@@ -211,7 +211,9 @@ class GCP(clouds.Cloud):
|
|
|
211
211
|
|
|
212
212
|
@classmethod
|
|
213
213
|
def _unsupported_features_for_resources(
|
|
214
|
-
cls,
|
|
214
|
+
cls,
|
|
215
|
+
resources: 'resources.Resources',
|
|
216
|
+
region: Optional[str] = None,
|
|
215
217
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
216
218
|
unsupported = {}
|
|
217
219
|
if gcp_utils.is_tpu_vm_pod(resources):
|
|
@@ -255,10 +257,15 @@ class GCP(clouds.Cloud):
|
|
|
255
257
|
|
|
256
258
|
#### Regions/Zones ####
|
|
257
259
|
@classmethod
|
|
258
|
-
def regions_with_offering(
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
260
|
+
def regions_with_offering(
|
|
261
|
+
cls,
|
|
262
|
+
instance_type: str,
|
|
263
|
+
accelerators: Optional[Dict[str, int]],
|
|
264
|
+
use_spot: bool,
|
|
265
|
+
region: Optional[str],
|
|
266
|
+
zone: Optional[str],
|
|
267
|
+
resources: Optional['resources.Resources'] = None,
|
|
268
|
+
) -> List[clouds.Region]:
|
|
262
269
|
if accelerators is None:
|
|
263
270
|
regions = catalog.get_region_zones_for_instance_type(instance_type,
|
|
264
271
|
use_spot,
|
|
@@ -1179,8 +1186,8 @@ class GCP(clouds.Cloud):
|
|
|
1179
1186
|
# These series don't support pd-standard, use pd-balanced for LOW.
|
|
1180
1187
|
_propagate_disk_type(
|
|
1181
1188
|
lowest=tier2name[resources_utils.DiskTier.MEDIUM])
|
|
1182
|
-
if instance_type.startswith('a3-ultragpu') or series
|
|
1183
|
-
# a3-ultragpu instances only support hyperdisk-balanced.
|
|
1189
|
+
if instance_type.startswith('a3-ultragpu') or series in ('n4', 'a4'):
|
|
1190
|
+
# a3-ultragpu, n4, and a4 instances only support hyperdisk-balanced.
|
|
1184
1191
|
_propagate_disk_type(all='hyperdisk-balanced')
|
|
1185
1192
|
|
|
1186
1193
|
# Series specific handling
|
sky/clouds/hyperbolic.py
CHANGED
|
@@ -65,7 +65,9 @@ class Hyperbolic(clouds.Cloud):
|
|
|
65
65
|
|
|
66
66
|
@classmethod
|
|
67
67
|
def _unsupported_features_for_resources(
|
|
68
|
-
cls,
|
|
68
|
+
cls,
|
|
69
|
+
resources: 'resources_lib.Resources',
|
|
70
|
+
region: Optional[str] = None,
|
|
69
71
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
70
72
|
del resources
|
|
71
73
|
return cls._CLOUD_UNSUPPORTED_FEATURES
|
|
@@ -78,10 +80,15 @@ class Hyperbolic(clouds.Cloud):
|
|
|
78
80
|
return catalog.instance_type_exists(instance_type, 'hyperbolic')
|
|
79
81
|
|
|
80
82
|
@classmethod
|
|
81
|
-
def regions_with_offering(
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
83
|
+
def regions_with_offering(
|
|
84
|
+
cls,
|
|
85
|
+
instance_type: str,
|
|
86
|
+
accelerators: Optional[Dict[str, int]],
|
|
87
|
+
use_spot: bool,
|
|
88
|
+
region: Optional[str],
|
|
89
|
+
zone: Optional[str],
|
|
90
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
91
|
+
) -> List[clouds.Region]:
|
|
85
92
|
assert zone is None, 'Hyperbolic does not support zones.'
|
|
86
93
|
del accelerators, zone # unused
|
|
87
94
|
|
sky/clouds/ibm.py
CHANGED
|
@@ -37,7 +37,9 @@ class IBM(clouds.Cloud):
|
|
|
37
37
|
|
|
38
38
|
@classmethod
|
|
39
39
|
def _unsupported_features_for_resources(
|
|
40
|
-
cls,
|
|
40
|
+
cls,
|
|
41
|
+
resources: 'resources_lib.Resources',
|
|
42
|
+
region: Optional[str] = None,
|
|
41
43
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
42
44
|
features = {
|
|
43
45
|
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
|
@@ -68,10 +70,15 @@ class IBM(clouds.Cloud):
|
|
|
68
70
|
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
|
69
71
|
|
|
70
72
|
@classmethod
|
|
71
|
-
def regions_with_offering(
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
73
|
+
def regions_with_offering(
|
|
74
|
+
cls,
|
|
75
|
+
instance_type: str,
|
|
76
|
+
accelerators: Optional[Dict[str, int]],
|
|
77
|
+
use_spot: bool,
|
|
78
|
+
region: Optional[str],
|
|
79
|
+
zone: Optional[str],
|
|
80
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
81
|
+
) -> List[clouds.Region]:
|
|
75
82
|
del accelerators # unused
|
|
76
83
|
if use_spot:
|
|
77
84
|
return []
|
sky/clouds/kubernetes.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Kubernetes."""
|
|
2
|
+
import concurrent.futures
|
|
2
3
|
import os
|
|
3
4
|
import re
|
|
4
5
|
import subprocess
|
|
@@ -25,6 +26,7 @@ from sky.provision.kubernetes.utils import normalize_tpu_accelerator_name
|
|
|
25
26
|
from sky.skylet import constants
|
|
26
27
|
from sky.utils import annotations
|
|
27
28
|
from sky.utils import common_utils
|
|
29
|
+
from sky.utils import env_options
|
|
28
30
|
from sky.utils import kubernetes_enums
|
|
29
31
|
from sky.utils import registry
|
|
30
32
|
from sky.utils import resources_utils
|
|
@@ -47,9 +49,6 @@ _FUSERMOUNT_SHARED_DIR = '/var/run/fusermount'
|
|
|
47
49
|
class Kubernetes(clouds.Cloud):
|
|
48
50
|
"""Kubernetes."""
|
|
49
51
|
|
|
50
|
-
SKY_SSH_KEY_SECRET_NAME = 'sky-ssh-keys'
|
|
51
|
-
SKY_SSH_JUMP_NAME = 'sky-ssh-jump-pod'
|
|
52
|
-
|
|
53
52
|
# Limit the length of the cluster name to avoid exceeding the limit of 63
|
|
54
53
|
# characters for Kubernetes resources. We limit to 42 characters (63-21) to
|
|
55
54
|
# allow additional characters for creating ingress services to expose ports.
|
|
@@ -62,6 +61,7 @@ class Kubernetes(clouds.Cloud):
|
|
|
62
61
|
_SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True
|
|
63
62
|
|
|
64
63
|
_DEFAULT_NUM_VCPUS = 2
|
|
64
|
+
_DEFAULT_NUM_VCPUS_WITH_GPU = 4
|
|
65
65
|
_DEFAULT_MEMORY_CPU_RATIO = 1
|
|
66
66
|
_DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4 # Allocate more memory for GPU tasks
|
|
67
67
|
_REPR = 'Kubernetes'
|
|
@@ -97,44 +97,52 @@ class Kubernetes(clouds.Cloud):
|
|
|
97
97
|
# Set of contexts that has logged as temporarily unreachable
|
|
98
98
|
logged_unreachable_contexts: Set[str] = set()
|
|
99
99
|
|
|
100
|
-
@property
|
|
101
|
-
def ssh_key_secret_field_name(self):
|
|
102
|
-
# Use a fresh user hash to avoid conflicts in the secret object naming.
|
|
103
|
-
# This can happen when the controller is reusing the same user hash
|
|
104
|
-
# through USER_ID_ENV_VAR but has a different SSH key.
|
|
105
|
-
fresh_user_hash = common_utils.generate_user_hash()
|
|
106
|
-
return f'ssh-publickey-{fresh_user_hash}'
|
|
107
|
-
|
|
108
100
|
@classmethod
|
|
109
101
|
def _unsupported_features_for_resources(
|
|
110
|
-
cls,
|
|
102
|
+
cls,
|
|
103
|
+
resources: 'resources_lib.Resources',
|
|
104
|
+
region: Optional[str] = None,
|
|
111
105
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
112
106
|
# TODO(aylei): features need to be regional (per context) to make
|
|
113
107
|
# multi-kubernetes selection/failover work.
|
|
114
108
|
unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
|
|
115
|
-
context = resources.region
|
|
109
|
+
context = region if region is not None else resources.region
|
|
116
110
|
if context is None:
|
|
117
|
-
|
|
111
|
+
contexts = cls.existing_allowed_contexts()
|
|
112
|
+
else:
|
|
113
|
+
contexts = [context]
|
|
118
114
|
unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
|
|
119
115
|
'Stopping clusters is not supported on Kubernetes.')
|
|
120
116
|
unsupported_features[clouds.CloudImplementationFeatures.AUTOSTOP] = (
|
|
121
117
|
'Auto-stop is not supported on Kubernetes.')
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
118
|
+
for context in contexts:
|
|
119
|
+
# Allow spot instances if supported by the cluster
|
|
120
|
+
try:
|
|
121
|
+
# Run spot label check and network type detection concurrently
|
|
122
|
+
# as they are independent operations
|
|
123
|
+
with concurrent.futures.ThreadPoolExecutor(
|
|
124
|
+
max_workers=2) as executor:
|
|
125
|
+
spot_future = executor.submit(
|
|
126
|
+
kubernetes_utils.get_spot_label, context)
|
|
127
|
+
network_future = executor.submit(cls._detect_network_type,
|
|
128
|
+
context,
|
|
129
|
+
resources.network_tier)
|
|
130
|
+
|
|
131
|
+
spot_label_key, _ = spot_future.result()
|
|
132
|
+
if spot_label_key is not None:
|
|
133
|
+
unsupported_features.pop(
|
|
134
|
+
clouds.CloudImplementationFeatures.SPOT_INSTANCE,
|
|
135
|
+
None)
|
|
136
|
+
|
|
137
|
+
# Allow custom network tier if supported by the cluster
|
|
138
|
+
# (e.g., Nebius clusters with high performance networking)
|
|
139
|
+
network_type, _ = network_future.result()
|
|
140
|
+
if network_type.supports_high_performance_networking():
|
|
141
|
+
unsupported_features.pop(
|
|
142
|
+
clouds.CloudImplementationFeatures.
|
|
143
|
+
CUSTOM_NETWORK_TIER, None)
|
|
144
|
+
except exceptions.KubeAPIUnreachableError as e:
|
|
145
|
+
cls._log_unreachable_context(context, str(e))
|
|
138
146
|
return unsupported_features
|
|
139
147
|
|
|
140
148
|
@classmethod
|
|
@@ -187,6 +195,12 @@ class Kubernetes(clouds.Cloud):
|
|
|
187
195
|
ctx for ctx in all_contexts if not ctx.startswith('ssh-')
|
|
188
196
|
]
|
|
189
197
|
|
|
198
|
+
allow_all_contexts = allowed_contexts == 'all' or (
|
|
199
|
+
allowed_contexts is None and
|
|
200
|
+
env_options.Options.ALLOW_ALL_KUBERNETES_CONTEXTS.get())
|
|
201
|
+
if allow_all_contexts:
|
|
202
|
+
allowed_contexts = all_contexts
|
|
203
|
+
|
|
190
204
|
if allowed_contexts is None:
|
|
191
205
|
# Try kubeconfig if present
|
|
192
206
|
current_context = (
|
|
@@ -244,10 +258,15 @@ class Kubernetes(clouds.Cloud):
|
|
|
244
258
|
'refresh Kubernetes availability if permanent.')
|
|
245
259
|
|
|
246
260
|
@classmethod
|
|
247
|
-
def regions_with_offering(
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
261
|
+
def regions_with_offering(
|
|
262
|
+
cls,
|
|
263
|
+
instance_type: Optional[str],
|
|
264
|
+
accelerators: Optional[Dict[str, int]],
|
|
265
|
+
use_spot: bool,
|
|
266
|
+
region: Optional[str],
|
|
267
|
+
zone: Optional[str],
|
|
268
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
269
|
+
) -> List[clouds.Region]:
|
|
251
270
|
del accelerators, zone, use_spot # unused
|
|
252
271
|
existing_contexts = cls.existing_allowed_contexts()
|
|
253
272
|
|
|
@@ -257,6 +276,19 @@ class Kubernetes(clouds.Cloud):
|
|
|
257
276
|
|
|
258
277
|
if region is not None:
|
|
259
278
|
regions = [r for r in regions if r.name == region]
|
|
279
|
+
if resources is not None:
|
|
280
|
+
filtered_regions = []
|
|
281
|
+
resources_required_features = resources.get_required_cloud_features(
|
|
282
|
+
)
|
|
283
|
+
for r in regions:
|
|
284
|
+
try:
|
|
285
|
+
cls.check_features_are_supported(
|
|
286
|
+
resources, resources_required_features, r.name)
|
|
287
|
+
filtered_regions.append(r)
|
|
288
|
+
except exceptions.NotSupportedError as e:
|
|
289
|
+
logger.info(f'Filter out context: {r.name}, reason: {e}')
|
|
290
|
+
continue
|
|
291
|
+
regions = filtered_regions
|
|
260
292
|
|
|
261
293
|
# Check if requested instance type will fit in the cluster.
|
|
262
294
|
# TODO(zhwu,romilb): autoscaler type needs to be regional (per
|
|
@@ -516,9 +548,6 @@ class Kubernetes(clouds.Cloud):
|
|
|
516
548
|
return image_id
|
|
517
549
|
|
|
518
550
|
image_id = _get_image_id(resources)
|
|
519
|
-
# TODO(romilb): Create a lightweight image for SSH jump host
|
|
520
|
-
ssh_jump_image = catalog.get_image_id_from_tag(self.IMAGE_CPU,
|
|
521
|
-
clouds='kubernetes')
|
|
522
551
|
|
|
523
552
|
# Set environment variables for the pod. Note that SkyPilot env vars
|
|
524
553
|
# are set separately when the task is run. These env vars are
|
|
@@ -566,6 +595,7 @@ class Kubernetes(clouds.Cloud):
|
|
|
566
595
|
port_mode = network_utils.get_port_mode(None, context)
|
|
567
596
|
|
|
568
597
|
remote_identity = skypilot_config.get_effective_region_config(
|
|
598
|
+
# TODO(kyuds): Support SSH node pools as well.
|
|
569
599
|
cloud='kubernetes',
|
|
570
600
|
region=context,
|
|
571
601
|
keys=('remote_identity',),
|
|
@@ -640,6 +670,7 @@ class Kubernetes(clouds.Cloud):
|
|
|
640
670
|
|
|
641
671
|
k8s_kueue_local_queue_name = (
|
|
642
672
|
skypilot_config.get_effective_region_config(
|
|
673
|
+
# TODO(kyuds): Support SSH node pools as well.
|
|
643
674
|
cloud='kubernetes',
|
|
644
675
|
region=context,
|
|
645
676
|
keys=('kueue', 'local_queue_name'),
|
|
@@ -654,6 +685,7 @@ class Kubernetes(clouds.Cloud):
|
|
|
654
685
|
if enable_flex_start_queued_provisioning or enable_flex_start:
|
|
655
686
|
# DWS is only supported in GKE, check the autoscaler type.
|
|
656
687
|
autoscaler_type = skypilot_config.get_effective_region_config(
|
|
688
|
+
# TODO(kyuds): Support SSH node pools as well.
|
|
657
689
|
cloud='kubernetes',
|
|
658
690
|
region=context,
|
|
659
691
|
keys=('autoscaler',),
|
|
@@ -677,8 +709,12 @@ class Kubernetes(clouds.Cloud):
|
|
|
677
709
|
timeout = self._calculate_provision_timeout(
|
|
678
710
|
num_nodes, volume_mounts, enable_flex_start or
|
|
679
711
|
enable_flex_start_queued_provisioning)
|
|
712
|
+
|
|
713
|
+
# Use _REPR, instead of directly using 'kubernetes' as the config key,
|
|
714
|
+
# because it could be SSH node pool as well.
|
|
715
|
+
cloud_config_str = self._REPR.lower()
|
|
680
716
|
timeout = skypilot_config.get_effective_region_config(
|
|
681
|
-
cloud=
|
|
717
|
+
cloud=cloud_config_str,
|
|
682
718
|
region=context,
|
|
683
719
|
keys=('provision_timeout',),
|
|
684
720
|
default_value=timeout,
|
|
@@ -692,13 +728,8 @@ class Kubernetes(clouds.Cloud):
|
|
|
692
728
|
'accelerator_count': str(acc_count),
|
|
693
729
|
'timeout': str(timeout),
|
|
694
730
|
'k8s_port_mode': port_mode.value,
|
|
695
|
-
'k8s_networking_mode': network_utils.get_networking_mode(
|
|
696
|
-
None, context=context).value,
|
|
697
|
-
'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
|
|
698
731
|
'k8s_acc_label_key': k8s_acc_label_key,
|
|
699
732
|
'k8s_acc_label_values': k8s_acc_label_values,
|
|
700
|
-
'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
|
|
701
|
-
'k8s_ssh_jump_image': ssh_jump_image,
|
|
702
733
|
'k8s_service_account_name': k8s_service_account_name,
|
|
703
734
|
'k8s_automount_sa_token': 'true',
|
|
704
735
|
'k8s_fuse_device_required': fuse_device_required,
|
|
@@ -796,7 +827,8 @@ class Kubernetes(clouds.Cloud):
|
|
|
796
827
|
accelerators=resources.accelerators,
|
|
797
828
|
use_spot=resources.use_spot,
|
|
798
829
|
region=resources.region,
|
|
799
|
-
zone=resources.zone
|
|
830
|
+
zone=resources.zone,
|
|
831
|
+
resources=resources)
|
|
800
832
|
if not regions:
|
|
801
833
|
return resources_utils.FeasibleResources([], [], None)
|
|
802
834
|
resources = resources.copy(accelerators=None)
|
|
@@ -841,6 +873,8 @@ class Kubernetes(clouds.Cloud):
|
|
|
841
873
|
from_instance_type(default_instance_type))
|
|
842
874
|
|
|
843
875
|
gpu_task_cpus = k8s_instance_type.cpus
|
|
876
|
+
if resources.cpus is None:
|
|
877
|
+
gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
|
|
844
878
|
# Special handling to bump up memory multiplier for GPU instances
|
|
845
879
|
gpu_task_memory = (float(resources.memory.strip('+')) if
|
|
846
880
|
resources.memory is not None else gpu_task_cpus *
|
|
@@ -854,7 +888,8 @@ class Kubernetes(clouds.Cloud):
|
|
|
854
888
|
accelerators=None,
|
|
855
889
|
use_spot=resources.use_spot,
|
|
856
890
|
region=resources.region,
|
|
857
|
-
zone=resources.zone
|
|
891
|
+
zone=resources.zone,
|
|
892
|
+
resources=resources)
|
|
858
893
|
if not available_regions:
|
|
859
894
|
return resources_utils.FeasibleResources([], [], None)
|
|
860
895
|
# No fuzzy lists for Kubernetes
|