skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/provisioner.py
CHANGED
|
@@ -15,15 +15,22 @@ import colorama
|
|
|
15
15
|
import sky
|
|
16
16
|
from sky import clouds
|
|
17
17
|
from sky import exceptions
|
|
18
|
+
from sky import global_user_state
|
|
19
|
+
from sky import logs
|
|
18
20
|
from sky import provision
|
|
21
|
+
from sky import resources as resources_lib
|
|
19
22
|
from sky import sky_logging
|
|
23
|
+
from sky import skypilot_config
|
|
20
24
|
from sky.adaptors import aws
|
|
21
25
|
from sky.backends import backend_utils
|
|
26
|
+
from sky.jobs.server import utils as server_jobs_utils
|
|
22
27
|
from sky.provision import common as provision_common
|
|
23
28
|
from sky.provision import instance_setup
|
|
24
29
|
from sky.provision import logging as provision_logging
|
|
25
30
|
from sky.provision import metadata_utils
|
|
31
|
+
from sky.provision import volume as provision_volume
|
|
26
32
|
from sky.skylet import constants
|
|
33
|
+
from sky.utils import common
|
|
27
34
|
from sky.utils import common_utils
|
|
28
35
|
from sky.utils import message_utils
|
|
29
36
|
from sky.utils import resources_utils
|
|
@@ -53,6 +60,11 @@ def _bulk_provision(
|
|
|
53
60
|
region_name = region.name
|
|
54
61
|
|
|
55
62
|
start = time.time()
|
|
63
|
+
|
|
64
|
+
provision_volume.provision_ephemeral_volumes(cloud, region_name,
|
|
65
|
+
cluster_name.name_on_cloud,
|
|
66
|
+
bootstrap_config)
|
|
67
|
+
|
|
56
68
|
# TODO(suquark): Should we cache the bootstrapped result?
|
|
57
69
|
# Currently it is not necessary as bootstrapping takes
|
|
58
70
|
# only ~3s, caching it seems over-engineering and could
|
|
@@ -64,6 +76,7 @@ def _bulk_provision(
|
|
|
64
76
|
|
|
65
77
|
provision_record = provision.run_instances(provider_name,
|
|
66
78
|
region_name,
|
|
79
|
+
str(cluster_name),
|
|
67
80
|
cluster_name.name_on_cloud,
|
|
68
81
|
config=config)
|
|
69
82
|
|
|
@@ -71,7 +84,8 @@ def _bulk_provision(
|
|
|
71
84
|
logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
|
|
72
85
|
rich_utils.force_update_status(
|
|
73
86
|
ux_utils.spinner_message('Launching - Checking instance status',
|
|
74
|
-
str(provision_logging.config.log_path)
|
|
87
|
+
str(provision_logging.config.log_path),
|
|
88
|
+
cluster_name=str(cluster_name)))
|
|
75
89
|
# AWS would take a very short time (<<1s) updating the state of the
|
|
76
90
|
# instance.
|
|
77
91
|
time.sleep(1)
|
|
@@ -95,6 +109,12 @@ def _bulk_provision(
|
|
|
95
109
|
f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
|
|
96
110
|
f'seconds.')
|
|
97
111
|
|
|
112
|
+
# Add cluster event for provisioning completion.
|
|
113
|
+
global_user_state.add_cluster_event(
|
|
114
|
+
str(cluster_name), status_lib.ClusterStatus.INIT,
|
|
115
|
+
f'Instances launched on {cloud.display_name()} in {region}',
|
|
116
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
117
|
+
|
|
98
118
|
return provision_record
|
|
99
119
|
|
|
100
120
|
|
|
@@ -117,7 +137,7 @@ def bulk_provision(
|
|
|
117
137
|
Cloud specific exceptions: If the provisioning process failed, cloud-
|
|
118
138
|
specific exceptions will be raised by the cloud APIs.
|
|
119
139
|
"""
|
|
120
|
-
original_config =
|
|
140
|
+
original_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
|
|
121
141
|
head_node_type = original_config['head_node_type']
|
|
122
142
|
bootstrap_config = provision_common.ProvisionConfig(
|
|
123
143
|
provider_config=original_config['provider'],
|
|
@@ -155,7 +175,7 @@ def bulk_provision(
|
|
|
155
175
|
# This error is a user error instead of a provisioning failure.
|
|
156
176
|
# And there is no possibility to fix it by teardown.
|
|
157
177
|
raise
|
|
158
|
-
except Exception: # pylint: disable=broad-except
|
|
178
|
+
except Exception as exc: # pylint: disable=broad-except
|
|
159
179
|
zone_str = 'all zones'
|
|
160
180
|
if zones:
|
|
161
181
|
zone_str = ','.join(zone.name for zone in zones)
|
|
@@ -177,14 +197,18 @@ def bulk_provision(
|
|
|
177
197
|
provider_config=original_config['provider'])
|
|
178
198
|
break
|
|
179
199
|
except NotImplementedError as e:
|
|
180
|
-
|
|
200
|
+
assert not terminate, (
|
|
201
|
+
'Terminating must be supported by all clouds')
|
|
202
|
+
exc_msg = common_utils.format_exception(exc).replace(
|
|
203
|
+
'\n', ' ')
|
|
181
204
|
# If the underlying cloud does not support stopping
|
|
182
205
|
# instances, we should stop failover as well.
|
|
183
206
|
raise provision_common.StopFailoverError(
|
|
184
|
-
'
|
|
185
|
-
f'
|
|
186
|
-
|
|
187
|
-
f'
|
|
207
|
+
f'Provisioning cluster {cluster_name.display_name} '
|
|
208
|
+
f'failed: {exc_msg}. Failover is stopped for safety '
|
|
209
|
+
'because the cluster was previously in UP state but '
|
|
210
|
+
f'{cloud} does not support stopping instances to '
|
|
211
|
+
'preserve the cluster state. Please try launching the '
|
|
188
212
|
'cluster again, or terminate it with: '
|
|
189
213
|
f'sky down {cluster_name.display_name}') from e
|
|
190
214
|
except Exception as e: # pylint: disable=broad-except
|
|
@@ -219,6 +243,7 @@ def teardown_cluster(cloud_name: str, cluster_name: resources_utils.ClusterName,
|
|
|
219
243
|
provision.terminate_instances(cloud_name, cluster_name.name_on_cloud,
|
|
220
244
|
provider_config)
|
|
221
245
|
metadata_utils.remove_cluster_metadata(cluster_name.name_on_cloud)
|
|
246
|
+
provision_volume.delete_ephemeral_volumes(provider_config)
|
|
222
247
|
else:
|
|
223
248
|
provision.stop_instances(cloud_name, cluster_name.name_on_cloud,
|
|
224
249
|
provider_config)
|
|
@@ -228,9 +253,9 @@ def _ssh_probe_command(ip: str,
|
|
|
228
253
|
ssh_port: int,
|
|
229
254
|
ssh_user: str,
|
|
230
255
|
ssh_private_key: str,
|
|
256
|
+
ssh_probe_timeout: int,
|
|
231
257
|
ssh_proxy_command: Optional[str] = None) -> List[str]:
|
|
232
|
-
# NOTE: Ray uses 'uptime' command
|
|
233
|
-
# setting here.
|
|
258
|
+
# NOTE: Ray uses 'uptime' command, we use the same setting here.
|
|
234
259
|
command = [
|
|
235
260
|
'ssh',
|
|
236
261
|
'-T',
|
|
@@ -244,7 +269,7 @@ def _ssh_probe_command(ip: str,
|
|
|
244
269
|
'-o',
|
|
245
270
|
'PasswordAuthentication=no',
|
|
246
271
|
'-o',
|
|
247
|
-
'ConnectTimeout=
|
|
272
|
+
f'ConnectTimeout={ssh_probe_timeout}s',
|
|
248
273
|
'-o',
|
|
249
274
|
f'UserKnownHostsFile={os.devnull}',
|
|
250
275
|
'-o',
|
|
@@ -277,6 +302,7 @@ def _wait_ssh_connection_direct(ip: str,
|
|
|
277
302
|
ssh_port: int,
|
|
278
303
|
ssh_user: str,
|
|
279
304
|
ssh_private_key: str,
|
|
305
|
+
ssh_probe_timeout: int,
|
|
280
306
|
ssh_control_name: Optional[str] = None,
|
|
281
307
|
ssh_proxy_command: Optional[str] = None,
|
|
282
308
|
**kwargs) -> Tuple[bool, str]:
|
|
@@ -305,6 +331,7 @@ def _wait_ssh_connection_direct(ip: str,
|
|
|
305
331
|
if success:
|
|
306
332
|
return _wait_ssh_connection_indirect(ip, ssh_port, ssh_user,
|
|
307
333
|
ssh_private_key,
|
|
334
|
+
ssh_probe_timeout,
|
|
308
335
|
ssh_control_name,
|
|
309
336
|
ssh_proxy_command)
|
|
310
337
|
except socket.timeout: # this is the most expected exception
|
|
@@ -312,7 +339,7 @@ def _wait_ssh_connection_direct(ip: str,
|
|
|
312
339
|
except Exception as e: # pylint: disable=broad-except
|
|
313
340
|
stderr = f'Error: {common_utils.format_exception(e)}'
|
|
314
341
|
command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
|
|
315
|
-
ssh_proxy_command)
|
|
342
|
+
ssh_probe_timeout, ssh_proxy_command)
|
|
316
343
|
logger.debug(f'Waiting for SSH to {ip}. Try: '
|
|
317
344
|
f'{_shlex_join(command)}. '
|
|
318
345
|
f'{stderr}')
|
|
@@ -323,6 +350,7 @@ def _wait_ssh_connection_indirect(ip: str,
|
|
|
323
350
|
ssh_port: int,
|
|
324
351
|
ssh_user: str,
|
|
325
352
|
ssh_private_key: str,
|
|
353
|
+
ssh_probe_timeout: int,
|
|
326
354
|
ssh_control_name: Optional[str] = None,
|
|
327
355
|
ssh_proxy_command: Optional[str] = None,
|
|
328
356
|
**kwargs) -> Tuple[bool, str]:
|
|
@@ -333,14 +361,14 @@ def _wait_ssh_connection_indirect(ip: str,
|
|
|
333
361
|
"""
|
|
334
362
|
del ssh_control_name, kwargs # unused
|
|
335
363
|
command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
|
|
336
|
-
ssh_proxy_command)
|
|
364
|
+
ssh_probe_timeout, ssh_proxy_command)
|
|
337
365
|
message = f'Waiting for SSH using command: {_shlex_join(command)}'
|
|
338
366
|
logger.debug(message)
|
|
339
367
|
try:
|
|
340
368
|
proc = subprocess.run(command,
|
|
341
369
|
shell=False,
|
|
342
370
|
check=False,
|
|
343
|
-
timeout=
|
|
371
|
+
timeout=ssh_probe_timeout,
|
|
344
372
|
stdout=subprocess.DEVNULL,
|
|
345
373
|
stderr=subprocess.PIPE)
|
|
346
374
|
if proc.returncode != 0:
|
|
@@ -383,8 +411,13 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
|
|
|
383
411
|
def _retry_ssh_thread(ip_ssh_port: Tuple[str, int]):
|
|
384
412
|
ip, ssh_port = ip_ssh_port
|
|
385
413
|
success = False
|
|
414
|
+
ssh_probe_timeout = skypilot_config.get_nested(
|
|
415
|
+
('provision', 'ssh_timeout'), 10)
|
|
386
416
|
while not success:
|
|
387
|
-
success, stderr = waiter(ip,
|
|
417
|
+
success, stderr = waiter(ip,
|
|
418
|
+
ssh_port,
|
|
419
|
+
**ssh_credentials,
|
|
420
|
+
ssh_probe_timeout=ssh_probe_timeout)
|
|
388
421
|
if not success and time.time() - start > timeout:
|
|
389
422
|
with ux_utils.print_exception_no_traceback():
|
|
390
423
|
raise RuntimeError(
|
|
@@ -403,16 +436,27 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
|
|
|
403
436
|
|
|
404
437
|
|
|
405
438
|
def _post_provision_setup(
|
|
406
|
-
|
|
407
|
-
|
|
439
|
+
launched_resources: resources_lib.Resources,
|
|
440
|
+
cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
|
|
441
|
+
provision_record: provision_common.ProvisionRecord,
|
|
408
442
|
custom_resource: Optional[str]) -> provision_common.ClusterInfo:
|
|
409
|
-
config_from_yaml =
|
|
443
|
+
config_from_yaml = global_user_state.get_cluster_yaml_dict(
|
|
444
|
+
handle_cluster_yaml)
|
|
410
445
|
provider_config = config_from_yaml.get('provider')
|
|
446
|
+
cloud_name = repr(launched_resources.cloud)
|
|
411
447
|
cluster_info = provision.get_cluster_info(cloud_name,
|
|
412
448
|
provision_record.region,
|
|
413
449
|
cluster_name.name_on_cloud,
|
|
414
450
|
provider_config=provider_config)
|
|
415
451
|
|
|
452
|
+
# Update cluster info in handle so cluster instance ids are set. This
|
|
453
|
+
# allows us to expose provision logs to debug nodes that failed during post
|
|
454
|
+
# provision setup.
|
|
455
|
+
handle = global_user_state.get_handle_from_cluster_name(
|
|
456
|
+
cluster_name.display_name)
|
|
457
|
+
handle.cached_cluster_info = cluster_info
|
|
458
|
+
global_user_state.update_cluster_handle(cluster_name.display_name, handle)
|
|
459
|
+
|
|
416
460
|
if cluster_info.num_instances > 1:
|
|
417
461
|
# Only worker nodes have logs in the per-instance log directory. Head
|
|
418
462
|
# node's log will be redirected to the main log file.
|
|
@@ -437,23 +481,24 @@ def _post_provision_setup(
|
|
|
437
481
|
# TODO(suquark): Move wheel build here in future PRs.
|
|
438
482
|
# We don't set docker_user here, as we are configuring the VM itself.
|
|
439
483
|
ssh_credentials = backend_utils.ssh_credential_from_yaml(
|
|
440
|
-
|
|
484
|
+
handle_cluster_yaml, ssh_user=cluster_info.ssh_user)
|
|
441
485
|
docker_config = config_from_yaml.get('docker', {})
|
|
442
486
|
|
|
443
487
|
with rich_utils.safe_status(
|
|
444
|
-
ux_utils.spinner_message(
|
|
445
|
-
|
|
446
|
-
|
|
488
|
+
ux_utils.spinner_message('Launching - Waiting for SSH access',
|
|
489
|
+
provision_logging.config.log_path,
|
|
490
|
+
cluster_name=str(cluster_name))) as status:
|
|
447
491
|
# If on Kubernetes, skip SSH check since the pods are guaranteed to be
|
|
448
492
|
# ready by the provisioner, and we use kubectl instead of SSH to run the
|
|
449
493
|
# commands and rsync on the pods. SSH will still be ready after a while
|
|
450
494
|
# for the users to SSH into the pod.
|
|
451
|
-
|
|
495
|
+
is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
|
|
496
|
+
if not is_k8s_cloud:
|
|
452
497
|
logger.debug(
|
|
453
498
|
f'\nWaiting for SSH to be available for {cluster_name!r} ...')
|
|
454
499
|
wait_for_ssh(cluster_info, ssh_credentials)
|
|
455
500
|
logger.debug(f'SSH Connection ready for {cluster_name!r}')
|
|
456
|
-
vm_str = 'Instance' if
|
|
501
|
+
vm_str = 'Instance' if not is_k8s_cloud else 'Pod'
|
|
457
502
|
plural = '' if len(cluster_info.instances) == 1 else 's'
|
|
458
503
|
verb = 'is' if len(cluster_info.instances) == 1 else 'are'
|
|
459
504
|
indent_str = (ux_utils.INDENT_SYMBOL
|
|
@@ -472,7 +517,8 @@ def _post_provision_setup(
|
|
|
472
517
|
status.update(
|
|
473
518
|
ux_utils.spinner_message(
|
|
474
519
|
'Launching - Initializing docker container',
|
|
475
|
-
provision_logging.config.log_path
|
|
520
|
+
provision_logging.config.log_path,
|
|
521
|
+
cluster_name=str(cluster_name)))
|
|
476
522
|
docker_user = instance_setup.initialize_docker(
|
|
477
523
|
cluster_name.name_on_cloud,
|
|
478
524
|
docker_config=docker_config,
|
|
@@ -489,6 +535,25 @@ def _post_provision_setup(
|
|
|
489
535
|
logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
|
|
490
536
|
f'Docker container is up.{colorama.Style.RESET_ALL}')
|
|
491
537
|
|
|
538
|
+
# Check version compatibility for jobs controller clusters
|
|
539
|
+
if cluster_name.display_name.startswith(common.JOB_CONTROLLER_PREFIX):
|
|
540
|
+
# TODO(zeping): remove this in v0.12.0
|
|
541
|
+
# This only happens in upgrade from <0.9.3 to > 0.10.0
|
|
542
|
+
# After 0.10.0 no incompatibility issue
|
|
543
|
+
# See https://github.com/skypilot-org/skypilot/pull/6096
|
|
544
|
+
# For more details
|
|
545
|
+
status.update(
|
|
546
|
+
ux_utils.spinner_message(
|
|
547
|
+
'Checking controller version compatibility'))
|
|
548
|
+
|
|
549
|
+
try:
|
|
550
|
+
server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
|
|
551
|
+
except exceptions.ClusterNotUpError:
|
|
552
|
+
# Controller is not up yet during initial provisioning, that
|
|
553
|
+
# also means no non-terminal jobs, so no incompatibility in
|
|
554
|
+
# this case.
|
|
555
|
+
pass
|
|
556
|
+
|
|
492
557
|
# We mount the metadata with sky wheel for speedup.
|
|
493
558
|
# NOTE: currently we mount all credentials for all nodes, because
|
|
494
559
|
# (1) jobs controllers need permission to launch/down nodes of
|
|
@@ -502,7 +567,8 @@ def _post_provision_setup(
|
|
|
502
567
|
|
|
503
568
|
runtime_preparation_str = (ux_utils.spinner_message(
|
|
504
569
|
'Preparing SkyPilot runtime ({step}/3 - {step_name})',
|
|
505
|
-
provision_logging.config.log_path
|
|
570
|
+
provision_logging.config.log_path,
|
|
571
|
+
cluster_name=str(cluster_name)))
|
|
506
572
|
status.update(
|
|
507
573
|
runtime_preparation_str.format(step=1, step_name='initializing'))
|
|
508
574
|
instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
|
|
@@ -636,19 +702,32 @@ def _post_provision_setup(
|
|
|
636
702
|
logger.debug('Ray cluster is ready. Skip starting ray cluster on '
|
|
637
703
|
'worker nodes.')
|
|
638
704
|
|
|
639
|
-
|
|
640
|
-
|
|
705
|
+
logging_agent = logs.get_logging_agent()
|
|
706
|
+
if logging_agent:
|
|
707
|
+
status.update(
|
|
708
|
+
ux_utils.spinner_message('Setting up logging agent',
|
|
709
|
+
provision_logging.config.log_path,
|
|
710
|
+
cluster_name=str(cluster_name)))
|
|
711
|
+
instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
|
|
712
|
+
cluster_info,
|
|
713
|
+
ssh_credentials)
|
|
714
|
+
|
|
715
|
+
instance_setup.start_skylet_on_head_node(cluster_name, cluster_info,
|
|
716
|
+
ssh_credentials,
|
|
717
|
+
launched_resources)
|
|
641
718
|
|
|
642
719
|
logger.info(
|
|
643
720
|
ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
|
|
644
|
-
provision_logging.config.log_path
|
|
721
|
+
provision_logging.config.log_path,
|
|
722
|
+
cluster_name=str(cluster_name)))
|
|
645
723
|
return cluster_info
|
|
646
724
|
|
|
647
725
|
|
|
648
726
|
@timeline.event
|
|
649
727
|
def post_provision_runtime_setup(
|
|
650
|
-
|
|
651
|
-
|
|
728
|
+
launched_resources: resources_lib.Resources,
|
|
729
|
+
cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
|
|
730
|
+
provision_record: provision_common.ProvisionRecord,
|
|
652
731
|
custom_resource: Optional[str],
|
|
653
732
|
log_dir: str) -> provision_common.ClusterInfo:
|
|
654
733
|
"""Run internal setup commands after provisioning and before user setup.
|
|
@@ -659,6 +738,7 @@ def post_provision_runtime_setup(
|
|
|
659
738
|
and other necessary files to the VM.
|
|
660
739
|
3. Run setup commands to install dependencies.
|
|
661
740
|
4. Start ray cluster and skylet.
|
|
741
|
+
5. (Optional) Setup logging agent.
|
|
662
742
|
|
|
663
743
|
Raises:
|
|
664
744
|
RuntimeError: If the setup process encounters any error.
|
|
@@ -666,11 +746,12 @@ def post_provision_runtime_setup(
|
|
|
666
746
|
with provision_logging.setup_provision_logging(log_dir):
|
|
667
747
|
try:
|
|
668
748
|
logger.debug(_TITLE.format('System Setup After Provision'))
|
|
669
|
-
return _post_provision_setup(
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
749
|
+
return _post_provision_setup(
|
|
750
|
+
launched_resources,
|
|
751
|
+
cluster_name,
|
|
752
|
+
handle_cluster_yaml=handle_cluster_yaml,
|
|
753
|
+
provision_record=provision_record,
|
|
754
|
+
custom_resource=custom_resource)
|
|
674
755
|
except Exception: # pylint: disable=broad-except
|
|
675
756
|
logger.error(
|
|
676
757
|
ux_utils.error_message(
|
sky/provision/runpod/__init__.py
CHANGED
|
@@ -9,3 +9,8 @@ from sky.provision.runpod.instance import run_instances
|
|
|
9
9
|
from sky.provision.runpod.instance import stop_instances
|
|
10
10
|
from sky.provision.runpod.instance import terminate_instances
|
|
11
11
|
from sky.provision.runpod.instance import wait_instances
|
|
12
|
+
from sky.provision.runpod.volume import apply_volume
|
|
13
|
+
from sky.provision.runpod.volume import delete_volume
|
|
14
|
+
from sky.provision.runpod.volume import get_all_volumes_usedby
|
|
15
|
+
from sky.provision.runpod.volume import get_volume_usedby
|
|
16
|
+
from sky.provision.runpod.volume import map_all_volumes_usedby
|
sky/provision/runpod/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""RunPod instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -44,10 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
44
44
|
return head_instance_id
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
47
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
48
48
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
49
49
|
"""Runs instances for the given cluster."""
|
|
50
|
-
|
|
50
|
+
del cluster_name # unused
|
|
51
51
|
pending_status = ['CREATED', 'RESTARTING']
|
|
52
52
|
|
|
53
53
|
while True:
|
|
@@ -80,6 +80,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
80
80
|
created_instance_ids=[])
|
|
81
81
|
|
|
82
82
|
created_instance_ids = []
|
|
83
|
+
volume_mounts = config.node_config.get('VolumeMounts', [])
|
|
84
|
+
network_volume_id = None
|
|
85
|
+
volume_mount_path = None
|
|
86
|
+
if volume_mounts:
|
|
87
|
+
if len(volume_mounts) > 1:
|
|
88
|
+
logger.warning(
|
|
89
|
+
f'RunPod only supports one network volume mount, '
|
|
90
|
+
f'but {len(volume_mounts)} are specified. Only the first one '
|
|
91
|
+
f'will be used.')
|
|
92
|
+
volume_mount = volume_mounts[0]
|
|
93
|
+
network_volume_id = volume_mount.get('VolumeIdOnCloud')
|
|
94
|
+
volume_mount_path = volume_mount.get('MountPath')
|
|
95
|
+
if network_volume_id is None or volume_mount_path is None:
|
|
96
|
+
raise RuntimeError(
|
|
97
|
+
'Network volume ID and mount path must be specified.')
|
|
83
98
|
for _ in range(to_start_count):
|
|
84
99
|
node_type = 'head' if head_instance_id is None else 'worker'
|
|
85
100
|
try:
|
|
@@ -97,6 +112,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
97
112
|
bid_per_gpu=config.node_config['BidPerGPU'],
|
|
98
113
|
docker_login_config=config.provider_config.get(
|
|
99
114
|
'docker_login_config'),
|
|
115
|
+
network_volume_id=network_volume_id,
|
|
116
|
+
volume_mount_path=volume_mount_path,
|
|
100
117
|
)
|
|
101
118
|
except Exception as e: # pylint: disable=broad-except
|
|
102
119
|
logger.warning(f'run_instances error: {e}')
|
|
@@ -201,11 +218,14 @@ def get_cluster_info(
|
|
|
201
218
|
|
|
202
219
|
|
|
203
220
|
def query_instances(
|
|
221
|
+
cluster_name: str,
|
|
204
222
|
cluster_name_on_cloud: str,
|
|
205
223
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
206
224
|
non_terminated_only: bool = True,
|
|
207
|
-
|
|
225
|
+
retry_if_missing: bool = False,
|
|
226
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
208
227
|
"""See sky/provision/__init__.py"""
|
|
228
|
+
del cluster_name, retry_if_missing # unused
|
|
209
229
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
210
230
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
211
231
|
|
|
@@ -215,12 +235,13 @@ def query_instances(
|
|
|
215
235
|
'PAUSED': status_lib.ClusterStatus.INIT,
|
|
216
236
|
'RUNNING': status_lib.ClusterStatus.UP,
|
|
217
237
|
}
|
|
218
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
238
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
239
|
+
Optional[str]]] = {}
|
|
219
240
|
for inst_id, inst in instances.items():
|
|
220
241
|
status = status_map[inst['status']]
|
|
221
242
|
if non_terminated_only and status is None:
|
|
222
243
|
continue
|
|
223
|
-
statuses[inst_id] = status
|
|
244
|
+
statuses[inst_id] = (status, None)
|
|
224
245
|
return statuses
|
|
225
246
|
|
|
226
247
|
|
sky/provision/runpod/utils.py
CHANGED
|
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
7
7
|
from sky import sky_logging
|
|
8
8
|
from sky.adaptors import runpod
|
|
9
9
|
from sky.provision import docker_utils
|
|
10
|
-
|
|
10
|
+
from sky.provision.runpod.api import commands as runpod_commands
|
|
11
11
|
from sky.skylet import constants
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
|
|
@@ -263,25 +263,36 @@ def _create_template_for_docker_login(
|
|
|
263
263
|
return login_config.format_image(image_name), create_template_resp['id']
|
|
264
264
|
|
|
265
265
|
|
|
266
|
-
def launch(
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
266
|
+
def launch(
|
|
267
|
+
cluster_name: str,
|
|
268
|
+
node_type: str,
|
|
269
|
+
instance_type: str,
|
|
270
|
+
region: str,
|
|
271
|
+
zone: str,
|
|
272
|
+
disk_size: int,
|
|
273
|
+
image_name: str,
|
|
274
|
+
ports: Optional[List[int]],
|
|
275
|
+
public_key: str,
|
|
276
|
+
preemptible: Optional[bool],
|
|
277
|
+
bid_per_gpu: float,
|
|
278
|
+
docker_login_config: Optional[Dict[str, str]],
|
|
279
|
+
*,
|
|
280
|
+
network_volume_id: Optional[str] = None,
|
|
281
|
+
volume_mount_path: Optional[str] = None,
|
|
282
|
+
) -> str:
|
|
271
283
|
"""Launches an instance with the given parameters.
|
|
272
284
|
|
|
273
|
-
|
|
274
|
-
|
|
285
|
+
For CPU instances, we directly use the instance_type for launching the
|
|
286
|
+
instance.
|
|
287
|
+
|
|
288
|
+
For GPU instances, we convert the instance_type to the RunPod GPU name,
|
|
289
|
+
and finds the specs for the GPU, before launching the instance.
|
|
275
290
|
|
|
276
291
|
Returns:
|
|
277
292
|
instance_id: The instance ID.
|
|
278
293
|
"""
|
|
279
294
|
name = f'{cluster_name}-{node_type}'
|
|
280
|
-
gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
|
|
281
|
-
gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
|
|
282
|
-
cloud_type = instance_type.split('_')[2]
|
|
283
295
|
|
|
284
|
-
gpu_specs = runpod.runpod.get_gpu(gpu_type)
|
|
285
296
|
# TODO(zhwu): keep this align with setups in
|
|
286
297
|
# `provision.kuberunetes.instance.py`
|
|
287
298
|
setup_cmd = (
|
|
@@ -329,12 +340,7 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
|
|
|
329
340
|
params = {
|
|
330
341
|
'name': name,
|
|
331
342
|
'image_name': image_name_formatted,
|
|
332
|
-
'gpu_type_id': gpu_type,
|
|
333
|
-
'cloud_type': cloud_type,
|
|
334
343
|
'container_disk_in_gb': disk_size,
|
|
335
|
-
'min_vcpu_count': 4 * gpu_quantity,
|
|
336
|
-
'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
|
|
337
|
-
'gpu_count': gpu_quantity,
|
|
338
344
|
'country_code': region,
|
|
339
345
|
'data_center_id': zone,
|
|
340
346
|
'ports': ports_str,
|
|
@@ -343,12 +349,39 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
|
|
|
343
349
|
'template_id': template_id,
|
|
344
350
|
}
|
|
345
351
|
|
|
352
|
+
# Optional network volume mount.
|
|
353
|
+
if volume_mount_path is not None:
|
|
354
|
+
params['volume_mount_path'] = volume_mount_path
|
|
355
|
+
if network_volume_id is not None:
|
|
356
|
+
params['network_volume_id'] = network_volume_id
|
|
357
|
+
|
|
358
|
+
# GPU instance types start with f'{gpu_count}x',
|
|
359
|
+
# CPU instance types start with 'cpu'.
|
|
360
|
+
is_cpu_instance = instance_type.startswith('cpu')
|
|
361
|
+
if is_cpu_instance:
|
|
362
|
+
# RunPod CPU instances can be uniquely identified by the instance_id.
|
|
363
|
+
params.update({
|
|
364
|
+
'instance_id': instance_type,
|
|
365
|
+
})
|
|
366
|
+
else:
|
|
367
|
+
gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
|
|
368
|
+
gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
|
|
369
|
+
cloud_type = instance_type.split('_')[2]
|
|
370
|
+
gpu_specs = runpod.runpod.get_gpu(gpu_type)
|
|
371
|
+
params.update({
|
|
372
|
+
'gpu_type_id': gpu_type,
|
|
373
|
+
'cloud_type': cloud_type,
|
|
374
|
+
'min_vcpu_count': 4 * gpu_quantity,
|
|
375
|
+
'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
|
|
376
|
+
'gpu_count': gpu_quantity,
|
|
377
|
+
})
|
|
378
|
+
|
|
346
379
|
if preemptible is None or not preemptible:
|
|
347
380
|
new_instance = runpod.runpod.create_pod(**params)
|
|
348
381
|
else:
|
|
349
382
|
new_instance = runpod_commands.create_spot_pod(
|
|
350
383
|
bid_per_gpu=bid_per_gpu,
|
|
351
|
-
**params,
|
|
384
|
+
**params, # type: ignore[arg-type]
|
|
352
385
|
)
|
|
353
386
|
|
|
354
387
|
return new_instance['id']
|