skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0

sky/provision/shadeform/__init__.py
ADDED
@@ -0,0 +1,11 @@
+"""Shadeform provisioner."""
+
+from sky.provision.shadeform.config import bootstrap_instances
+from sky.provision.shadeform.instance import cleanup_ports
+from sky.provision.shadeform.instance import get_cluster_info
+from sky.provision.shadeform.instance import open_ports
+from sky.provision.shadeform.instance import query_instances
+from sky.provision.shadeform.instance import run_instances
+from sky.provision.shadeform.instance import stop_instances
+from sky.provision.shadeform.instance import terminate_instances
+from sky.provision.shadeform.instance import wait_instances
sky/provision/shadeform/config.py
ADDED
@@ -0,0 +1,12 @@
+"""Shadeform configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+
+    return config
sky/provision/shadeform/instance.py
ADDED
@@ -0,0 +1,351 @@
+"""Shadeform instance provisioning."""
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+import requests
+
+from sky import sky_logging
+from sky.provision import common
+from sky.provision.shadeform import shadeform_utils
+from sky.utils import status_lib
+
+POLL_INTERVAL = 10
+INSTANCE_READY_TIMEOUT = 3600
+
+logger = sky_logging.init_logger(__name__)
+
+# Status mapping from Shadeform to SkyPilot
+SHADEFORM_STATUS_MAP = {
+    'creating': status_lib.ClusterStatus.INIT,
+    'pending_provider': status_lib.ClusterStatus.INIT,
+    'pending': status_lib.ClusterStatus.INIT,
+    'active': status_lib.ClusterStatus.UP,
+    'deleted': status_lib.ClusterStatus.STOPPED,
+}
+
+
+def _get_cluster_instances(cluster_name_on_cloud: str) -> Dict[str, Any]:
+    """Get all instances belonging to a cluster."""
+    try:
+        response = shadeform_utils.get_instances()
+        instances = response.get('instances', [])
+
+        cluster_instances = {}
+        possible_names = [
+            f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker'
+        ]
+
+        for instance in instances:
+            if instance.get('name') in possible_names:
+                cluster_instances[instance['id']] = instance
+
+        return cluster_instances
+    except (ValueError, KeyError, requests.exceptions.RequestException) as e:
+        logger.warning(f'Failed to get instances: {e}')
+        return {}
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    """Get the head instance ID from a list of instances."""
+    for instance_id, instance in instances.items():
+        if instance.get('name', '').endswith('-head'):
+            return instance_id
+    return None
+
+
+def _wait_for_instances_ready(cluster_name_on_cloud: str,
+                              expected_count: int,
+                              timeout: int = INSTANCE_READY_TIMEOUT) -> bool:
+    """Wait for instances to be ready (active state with SSH access)."""
+    start_time = time.time()
+
+    while time.time() - start_time < timeout:
+        instances = _get_cluster_instances(cluster_name_on_cloud)
+        ready_count = 0
+
+        for instance in instances.values():
+            if (instance.get('status') == 'active' and
+                    instance.get('ip') is not None and
+                    instance.get('ssh_port') is not None):
+                ready_count += 1
+
+        logger.info(f'Waiting for instances to be ready: '
+                    f'({ready_count}/{expected_count})')
+
+        if ready_count >= expected_count:
+            return True
+
+        time.sleep(POLL_INTERVAL)
+
+    return False
+
+
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Run instances for the given cluster."""
+    del cluster_name  # unused - we use cluster_name_on_cloud
+    logger.info(f'Running instances for cluster {cluster_name_on_cloud} '
+                f'in region {region}')
+    logger.debug(f'DEBUG: region type={type(region)}, value={region!r}')
+    logger.debug(f'DEBUG: config node_config={config.node_config}')
+
+    # Check existing instances
+    existing_instances = _get_cluster_instances(cluster_name_on_cloud)
+    head_instance_id = _get_head_instance_id(existing_instances)
+
+    # Filter active instances
+    active_instances = {
+        iid: inst
+        for iid, inst in existing_instances.items()
+        if inst.get('status') == 'active'
+    }
+
+    current_count = len(active_instances)
+    target_count = config.count
+
+    logger.info(f'Current instances: {current_count}, target: {target_count}')
+
+    if current_count >= target_count:
+        if head_instance_id is None:
+            raise RuntimeError(
+                f'Cluster {cluster_name_on_cloud} has no head node')
+        logger.info(f'Cluster already has {current_count} instances, '
+                    f'no need to start more')
+        return common.ProvisionRecord(
+            provider_name='shadeform',
+            cluster_name=cluster_name_on_cloud,
+            region=region,
+            zone=None,  # Shadeform doesn't use separate zones
+            head_instance_id=head_instance_id,
+            resumed_instance_ids=[],
+            created_instance_ids=[])
+
+    # Create new instances
+    to_create = target_count - current_count
+    created_instance_ids = []
+
+    for _ in range(to_create):
+        node_type = 'head' if head_instance_id is None else 'worker'
+        instance_name = f'{cluster_name_on_cloud}-{node_type}'
+
+        # Extract configuration from node_config
+
+        # The node_config contains instance specs including InstanceType
+        # which follows the format: {cloud_provider}_{instance_type}
+        # (e.g., "massedcompute_A6000_basex2")
+        node_config = config.node_config
+        assert 'InstanceType' in node_config, \
+            'InstanceType must be present in node_config'
+
+        # Parse the instance type to extract cloud provider and instance specs
+        # Expected format: "{cloud}_{instance_type}" where cloud is provider
+        # (massedcompute, scaleway, lambda, etc.)
+        instance_type_full = node_config['InstanceType']
+        assert (isinstance(instance_type_full, str) and
+                '_' in instance_type_full), \
+            f'InstanceType must be in format cloud_instance_type, got: ' \
+            f'{instance_type_full}'
+
+        instance_type_split = instance_type_full.split('_')
+        assert len(instance_type_split) >= 2, \
+            f'InstanceType must contain at least one underscore, got: ' \
+            f'{instance_type_full}'
+
+        # Extract cloud provider (first part) and instance type (remaining)
+        # Example: "massedcompute_A6000-basex2" -> cloud="massedcompute",
+        # instance_type="A6000-basex2"
+        cloud = instance_type_split[0]
+        instance_type = '_'.join(instance_type_split[1:])
+
+        # Shadeform uses underscores instead of hyphens
+        instance_type = instance_type.replace('-', '_')
+
+        if instance_type.endswith('B'):
+            instance_type = instance_type[:-1]
+
+        # Replace "GBx" with "Gx" (case sensitive)
+        if 'GBx' in instance_type:
+            instance_type = instance_type.replace('GBx', 'Gx')
+
+        assert cloud, 'Cloud provider cannot be empty'
+        assert instance_type, 'Instance type cannot be empty'
+
+        # Get SSH key ID for authentication - this is optional and may be None
+        ssh_key_id = config.authentication_config.get('ssh_key_id')
+
+        create_config = {
+            'cloud': cloud,
+            'region': region,
+            'shade_instance_type': instance_type,
+            'name': instance_name,
+            'ssh_key_id': ssh_key_id
+        }
+
+        try:
+            logger.info(f'Creating {node_type} instance: {instance_name}')
+            response = shadeform_utils.create_instance(create_config)
+            instance_id = response['id']
+            created_instance_ids.append(instance_id)
+
+            if head_instance_id is None:
+                head_instance_id = instance_id
+
+            logger.info(f'Created instance {instance_id} ({node_type})')
+
+        except Exception as e:
+            logger.error(f'Failed to create instance: {e}')
+            # Clean up any created instances
+            for iid in created_instance_ids:
+                try:
+                    shadeform_utils.delete_instance(iid)
+                except requests.exceptions.RequestException as cleanup_e:
+                    logger.warning(
+                        f'Failed to cleanup instance {iid}: {cleanup_e}')
+            raise
+
+    # Wait for all instances to be ready
+    logger.info('Waiting for instances to become ready...')
+    if not _wait_for_instances_ready(cluster_name_on_cloud, target_count):
+        raise RuntimeError('Timed out waiting for instances to be ready')
+
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+
+    return common.ProvisionRecord(provider_name='shadeform',
+                                  cluster_name=cluster_name_on_cloud,
+                                  region=region,
+                                  zone=region,
+                                  head_instance_id=head_instance_id,
+                                  resumed_instance_ids=[],
+                                  created_instance_ids=created_instance_ids)
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    """Wait for instances to reach the specified state."""
+    del region, cluster_name_on_cloud, state  # unused
+    # For Shadeform, instances are ready when they reach 'active' status
+    # This is already handled in run_instances
+
+
+def stop_instances(cluster_name_on_cloud: str,
+                   provider_config: Optional[Dict[str, Any]] = None,
+                   worker_only: bool = False) -> None:
+    """Stop instances (not supported by Shadeform)."""
+    del cluster_name_on_cloud, provider_config, worker_only  # unused
+    raise NotImplementedError(
+        'Stopping instances is not supported by Shadeform')
+
+
+def terminate_instances(cluster_name_on_cloud: str,
+                        provider_config: Optional[Dict[str, Any]] = None,
+                        worker_only: bool = False) -> None:
+    """Terminate instances."""
+    del provider_config  # unused
+    logger.info(f'Terminating instances for cluster {cluster_name_on_cloud}')
+
+    instances = _get_cluster_instances(cluster_name_on_cloud)
+
+    if not instances:
+        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
+        return
+
+    instances_to_delete = instances
+    if worker_only:
+        # Only delete worker nodes, not head
+        instances_to_delete = {
+            iid: inst
+            for iid, inst in instances.items()
+            if not inst.get('name', '').endswith('-head')
+        }
+
+    for instance_id, instance in instances_to_delete.items():
+        try:
+            logger.info(
+                f'Terminating instance {instance_id} ({instance.get("name")})')
+            shadeform_utils.delete_instance(instance_id)
+        except requests.exceptions.RequestException as e:
+            logger.warning(f'Failed to terminate instance {instance_id}: {e}')
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+    """Get cluster information."""
+    del region, provider_config  # unused
+    instances = _get_cluster_instances(cluster_name_on_cloud)
+
+    if not instances:
+        return common.ClusterInfo(instances={},
+                                  head_instance_id=None,
+                                  provider_name='shadeform')
+
+    head_instance_id = _get_head_instance_id(instances)
+
+    # Convert instance format for ClusterInfo
+    cluster_instances = {}
+    for instance_id, instance in instances.items():
+        instance_info = common.InstanceInfo(
+            instance_id=instance_id,
+            internal_ip=instance.get('ip', ''),
+            external_ip=instance.get('ip', ''),
+            ssh_port=instance.get('ssh_port', 22),
+            tags={},
+        )
+        # ClusterInfo expects Dict[InstanceId, List[InstanceInfo]]
+        cluster_instances[instance_id] = [instance_info]
+
+    ssh_user = 'shadeform'  # default
+    if head_instance_id is not None:
+        ssh_user = instances.get(head_instance_id,
+                                 {}).get('ssh_user', 'shadeform')
+
+    return common.ClusterInfo(instances=cluster_instances,
+                              head_instance_id=head_instance_id,
+                              provider_name='shadeform',
+                              ssh_user=ssh_user)
+
+
+def query_instances(
+    cluster_name: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    """Query the status of instances."""
+    del cluster_name, provider_config  # unused
+    instances = _get_cluster_instances(cluster_name_on_cloud)
+
+    if not instances:
+        return {}
+
+    status_map: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                                Optional[str]]] = {}
+    for instance_id, instance in instances.items():
+        shadeform_status = instance.get('status', 'unknown')
+        sky_status = SHADEFORM_STATUS_MAP.get(shadeform_status,
+                                              status_lib.ClusterStatus.INIT)
+
+        if (non_terminated_only and
+                sky_status == status_lib.ClusterStatus.STOPPED):
+            continue
+
+        status_map[instance_id] = (sky_status, None)
+
+    return status_map
+
+
+def open_ports(cluster_name_on_cloud: str,
+               ports: List[str],
+               provider_config: Optional[Dict[str, Any]] = None) -> None:
+    """Open ports (not supported by Shadeform)."""
+    del cluster_name_on_cloud, ports, provider_config  # unused
+    raise NotImplementedError()
+
+
+def cleanup_ports(cluster_name_on_cloud: str,
+                  ports: List[str],
+                  provider_config: Optional[Dict[str, Any]] = None) -> None:
+    """Cleanup ports (not supported by Shadeform)."""
+    del cluster_name_on_cloud, ports, provider_config  # unused
+    # Nothing to cleanup since we don't support dynamic port opening
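The `InstanceType` handling above encodes both the underlying provider and Shadeform's own type name in a single string. A minimal standalone sketch of that convention, mirroring the parsing steps in `run_instances` (the example value is illustrative, not taken from the catalog):

    from typing import Tuple

    def split_shadeform_instance_type(instance_type_full: str) -> Tuple[str, str]:
        # Same steps as in run_instances() above: '<cloud>_<type>', hyphens
        # become underscores, a trailing 'B' is dropped, 'GBx' collapses to 'Gx'.
        cloud, shade_type = instance_type_full.split('_', 1)
        shade_type = shade_type.replace('-', '_')
        if shade_type.endswith('B'):
            shade_type = shade_type[:-1]
        shade_type = shade_type.replace('GBx', 'Gx')
        return cloud, shade_type

    # e.g. split_shadeform_instance_type('massedcompute_A6000-basex2')
    #      -> ('massedcompute', 'A6000_basex2')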
sky/provision/shadeform/shadeform_utils.py
ADDED
@@ -0,0 +1,83 @@
+"""Shadeform API utilities."""
+
+import os
+from typing import Any, Dict
+
+from sky.adaptors import common
+
+# Lazy import to avoid dependency on external packages
+requests = common.LazyImport('requests')
+
+# Shadeform API configuration
+SHADEFORM_API_BASE = 'https://api.shadeform.ai/v1'
+SHADEFORM_API_KEY_PATH = '~/.shadeform/api_key'
+
+
+def get_api_key() -> str:
+    """Get Shadeform API key from file."""
+    api_key_path = os.path.expanduser(SHADEFORM_API_KEY_PATH)
+    if not os.path.exists(api_key_path):
+        raise FileNotFoundError(
+            f'Shadeform API key not found at {api_key_path}. '
+            'Please save your API key to this file.')
+
+    with open(api_key_path, 'r', encoding='utf-8') as f:
+        api_key = f.read().strip()
+
+    if not api_key:
+        raise ValueError(f'Shadeform API key is empty in {api_key_path}')
+
+    return api_key
+
+
+def make_request(method: str, endpoint: str, **kwargs) -> Any:
+    """Make a request to the Shadeform API."""
+    url = f'{SHADEFORM_API_BASE}/{endpoint.lstrip("/")}'
+    headers = {
+        'X-API-KEY': get_api_key(),
+        'Content-Type': 'application/json',
+    }
+
+    response = requests.request(method, url, headers=headers, **kwargs)
+    response.raise_for_status()
+
+    # Some APIs (like delete) return empty responses with just 200 status
+    if response.text.strip():
+        return response.json()
+    else:
+        # Return empty dict for empty responses (e.g., delete operations)
+        return {}
+
+
+def get_instances() -> Dict[str, Any]:
+    """Get all instances."""
+    return make_request('GET', '/instances')
+
+
+def get_instance_info(instance_id: str) -> Dict[str, Any]:
+    """Get information about a specific instance."""
+    return make_request('GET', f'/instances/{instance_id}/info')
+
+
+def create_instance(config: Dict[str, Any]) -> Dict[str, Any]:
+    """Create a new instance."""
+    return make_request('POST', '/instances/create', json=config)
+
+
+def delete_instance(instance_id: str) -> Dict[str, Any]:
+    """Delete an instance.
+
+    Note: Shadeform delete API returns empty response with 200 status.
+    """
+    return make_request('POST', f'/instances/{instance_id}/delete')
+
+
+def get_ssh_keys() -> Dict[str, Any]:
+    """Get all SSH keys."""
+    return make_request('GET', '/sshkeys')
+
+
+def add_ssh_key(name: str, public_key: str) -> Dict[str, Any]:
+    """Add a new SSH key."""
+    config = {'name': name, 'public_key': public_key}
+    return make_request('POST', '/sshkeys/add', json=config)
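The helper module above reads the API key from `~/.shadeform/api_key` and funnels every call through `make_request`, which raises on non-2xx responses and returns `{}` for empty bodies. A hedged usage sketch built only on the functions defined in this file (the response fields echoed here follow the accesses in instance.py; the exact schema is Shadeform's and is not verified here):

    from sky.provision.shadeform import shadeform_utils

    # List instances; get_instances() wraps GET /instances.
    instances = shadeform_utils.get_instances().get('instances', [])
    for inst in instances:
        print(inst.get('name'), inst.get('status'), inst.get('ip'))

    # create_instance() takes the payload assembled in run_instances(), e.g.
    # {'cloud': ..., 'region': ..., 'shade_instance_type': ..., 'name': ...,
    #  'ssh_key_id': ...}; delete_instance(instance_id) tears one down again.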
sky/provision/vast/instance.py
CHANGED
@@ -39,14 +39,15 @@ def _filter_instances(cluster_name_on_cloud: str,
 
 def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     for inst_id, inst in instances.items():
-        if inst['name'].endswith('-head'):
+        if inst.get('name') and inst['name'].endswith('-head'):
             return inst_id
     return None
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
+    del cluster_name  # unused
     pending_status = ['CREATED', 'RESTARTING']
 
     created_instance_ids = []
@@ -220,9 +221,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
     # "running", "frozen", "stopped", "unknown", "loading"
sky/provision/volume.py
ADDED
@@ -0,0 +1,164 @@
+"""Volume functions for provisioning and deleting ephemeral volumes."""
+
+import copy
+from typing import Any, Dict, Optional
+
+from sky import clouds
+from sky import global_user_state
+from sky import models
+from sky import sky_logging
+from sky.provision import common as provision_common
+from sky.provision import constants as provision_constants
+from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.utils import volume as volume_utils
+from sky.volumes import volume as volume_lib
+from sky.volumes.server import core as volume_server_core
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _resolve_volume_type(cloud: clouds.Cloud,
+                         volume_type: Optional[str]) -> str:
+    if not volume_type:
+        volume_types = None
+        for cloud_key, vol_types in volume_lib.CLOUD_TO_VOLUME_TYPE.items():
+            if cloud.is_same_cloud(cloud_key):
+                volume_types = vol_types
+                break
+        if volume_types is None:
+            raise ValueError(f'No default volume type found for cloud {cloud}')
+        if len(volume_types) != 1:
+            raise ValueError(
+                f'Found multiple volume types for cloud {cloud}: {volume_types}'
+            )
+        return volume_types[0].value
+    supported_volume_types = [
+        volume_type.value for volume_type in volume_utils.VolumeType
+    ]
+    volume_type = volume_type.lower()
+    if volume_type not in supported_volume_types:
+        raise ValueError(
+            f'Invalid volume type: {volume_type} for cloud {cloud}')
+    return volume_type
+
+
+def _resolve_pvc_volume_config(cloud: clouds.Cloud,
+                               config: provision_common.ProvisionConfig,
+                               volume_config: Dict[str, Any]) -> Dict[str, Any]:
+    provider_config = config.provider_config
+    if not cloud.is_same_cloud(clouds.Kubernetes()):
+        raise ValueError(
+            f'PVC volume type is only supported on Kubernetes not on {cloud}')
+    supported_access_modes = [
+        access_mode.value for access_mode in volume_utils.VolumeAccessMode
+    ]
+    access_mode = volume_config.get('access_mode')
+    if access_mode is None:
+        access_mode = volume_utils.VolumeAccessMode.READ_WRITE_ONCE.value
+        volume_config['access_mode'] = access_mode
+    elif access_mode not in supported_access_modes:
+        raise ValueError(f'Invalid access mode: {access_mode} for PVC')
+    if (access_mode == volume_utils.VolumeAccessMode.READ_WRITE_ONCE.value and
+            config.count > 1):
+        raise ValueError(
+            'Access mode ReadWriteOnce is not supported for multi-node'
+            ' clusters.')
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    volume_config['namespace'] = namespace
+    return volume_config
+
+
+def _create_ephemeral_volume(
+        cloud: clouds.Cloud, region: str, cluster_name_on_cloud: str,
+        config: provision_common.ProvisionConfig,
+        volume_mount: volume_utils.VolumeMount
+) -> Optional[volume_utils.VolumeInfo]:
+    provider_name = repr(cloud)
+    path = volume_mount.path
+    volume_config = volume_mount.volume_config
+    volume_type = _resolve_volume_type(cloud, volume_config.type)
+    labels = volume_config.labels
+    if volume_type == volume_utils.VolumeType.PVC.value:
+        internal_volume_config = _resolve_pvc_volume_config(
+            cloud, config, volume_config.config)
+        if labels:
+            for key, value in labels.items():
+                valid, err_msg = cloud.is_label_valid(key, value)
+                if not valid:
+                    raise ValueError(f'{err_msg}')
+        else:
+            labels = {}
+        labels.update({
+            provision_constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud
+        })
+    else:
+        logger.warning(f'Skipping unsupported ephemeral volume type: '
+                       f'{volume_type} for cloud {cloud}.')
+        return None
+    volume_name = volume_config.name
+    volume_server_core.volume_apply(
+        name=volume_name,
+        volume_type=volume_type,
+        cloud=provider_name,
+        region=region,
+        zone=None,
+        size=volume_config.size,
+        config=internal_volume_config,
+        labels=labels,
+        is_ephemeral=True,
+    )
+    volume = global_user_state.get_volume_by_name(volume_name)
+    if volume is None:
+        raise ValueError(f'Failed to get record for volume: {volume_name}')
+    assert 'handle' in volume, 'Volume handle is None.'
+    volume_config: models.VolumeConfig = volume['handle']
+    volume_info = volume_utils.VolumeInfo(
+        name=volume_name,
+        path=path,
+        volume_name_on_cloud=volume_config.name_on_cloud,
+        volume_id_on_cloud=volume_config.id_on_cloud,
+    )
+    return volume_info
+
+
+def provision_ephemeral_volumes(
+    cloud: clouds.Cloud,
+    region: str,
+    cluster_name_on_cloud: str,
+    config: provision_common.ProvisionConfig,
+) -> None:
+    """Provision ephemeral volumes for a cluster."""
+    provider_config = config.provider_config
+    ephemeral_volume_mounts = provider_config.get('ephemeral_volume_specs')
+    if not ephemeral_volume_mounts:
+        return
+    volume_infos = []
+    try:
+        for ephemeral_volume_mount in ephemeral_volume_mounts:
+            mount_copy = copy.deepcopy(ephemeral_volume_mount)
+            volume_mount = volume_utils.VolumeMount.from_yaml_config(mount_copy)
+            volume_info = _create_ephemeral_volume(cloud, region,
+                                                   cluster_name_on_cloud,
+                                                   config, volume_mount)
+            if volume_info is None:
+                continue
+            volume_infos.append(volume_info)
+        provider_config['ephemeral_volume_infos'] = volume_infos
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        logger.error(f'Failed to provision ephemeral volumes: {e}')
+        raise e
+
+
+def delete_ephemeral_volumes(provider_config: Dict[str, Any],) -> None:
+    """Provision ephemeral volumes for a cluster."""
+    ephemeral_volume_mounts = provider_config.get('ephemeral_volume_specs')
+    if not ephemeral_volume_mounts:
+        return
+    ephemeral_volume_names = []
+    for ephemeral_volume_mount in ephemeral_volume_mounts:
+        mount_copy = copy.deepcopy(ephemeral_volume_mount)
+        volume_mount = volume_utils.VolumeMount.from_yaml_config(mount_copy)
+        volume_name = volume_mount.volume_config.name
+        ephemeral_volume_names.append(volume_name)
+    volume_server_core.volume_delete(names=ephemeral_volume_names,
+                                     ignore_not_found=True)
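The new volume module talks to the rest of provisioning purely through `provider_config`. A sketch of that contract (key names come from the code above; what goes inside each spec is an assumption, it is simply whatever `volume_utils.VolumeMount.from_yaml_config` accepts):

    # Input: raw ephemeral volume-mount specs, one dict per mount.
    provider_config = {'ephemeral_volume_specs': []}

    # provision_ephemeral_volumes(cloud, region, cluster_name_on_cloud, config)
    # creates each volume via volume_server_core.volume_apply(is_ephemeral=True)
    # and records the results back onto the same dict:
    #     provider_config['ephemeral_volume_infos']  # list of VolumeInfo
    # delete_ephemeral_volumes(provider_config) later removes them by name.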
@@ -89,5 +89,6 @@ def create_unverified_session(session, suppress_warning=True):
     session.verify = False
     if suppress_warning:
         # Suppress unverified https request warnings
-        requests.packages.urllib3.disable_warnings(
+        requests.packages.urllib3.disable_warnings(  # type: ignore
+            InsecureRequestWarning)
     return session