skypilot-nightly 1.0.0.dev20250509-py3-none-any.whl → 1.0.0.dev20251107-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/provision/provisioner.py
CHANGED
|
@@ -15,15 +15,21 @@ import colorama
|
|
|
15
15
|
import sky
|
|
16
16
|
from sky import clouds
|
|
17
17
|
from sky import exceptions
|
|
18
|
+
from sky import global_user_state
|
|
19
|
+
from sky import logs
|
|
18
20
|
from sky import provision
|
|
21
|
+
from sky import resources as resources_lib
|
|
19
22
|
from sky import sky_logging
|
|
23
|
+
from sky import skypilot_config
|
|
20
24
|
from sky.adaptors import aws
|
|
21
25
|
from sky.backends import backend_utils
|
|
26
|
+
from sky.jobs.server import utils as server_jobs_utils
|
|
22
27
|
from sky.provision import common as provision_common
|
|
23
28
|
from sky.provision import instance_setup
|
|
24
29
|
from sky.provision import logging as provision_logging
|
|
25
30
|
from sky.provision import metadata_utils
|
|
26
31
|
from sky.skylet import constants
|
|
32
|
+
from sky.utils import common
|
|
27
33
|
from sky.utils import common_utils
|
|
28
34
|
from sky.utils import message_utils
|
|
29
35
|
from sky.utils import resources_utils
|
|
@@ -64,6 +70,7 @@ def _bulk_provision(
|
|
|
64
70
|
|
|
65
71
|
provision_record = provision.run_instances(provider_name,
|
|
66
72
|
region_name,
|
|
73
|
+
str(cluster_name),
|
|
67
74
|
cluster_name.name_on_cloud,
|
|
68
75
|
config=config)
|
|
69
76
|
|
|
@@ -71,7 +78,8 @@ def _bulk_provision(
|
|
|
71
78
|
logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
|
|
72
79
|
rich_utils.force_update_status(
|
|
73
80
|
ux_utils.spinner_message('Launching - Checking instance status',
|
|
74
|
-
str(provision_logging.config.log_path)
|
|
81
|
+
str(provision_logging.config.log_path),
|
|
82
|
+
cluster_name=str(cluster_name)))
|
|
75
83
|
# AWS would take a very short time (<<1s) updating the state of the
|
|
76
84
|
# instance.
|
|
77
85
|
time.sleep(1)
|
|
@@ -95,6 +103,12 @@ def _bulk_provision(
|
|
|
95
103
|
f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
|
|
96
104
|
f'seconds.')
|
|
97
105
|
|
|
106
|
+
# Add cluster event for provisioning completion.
|
|
107
|
+
global_user_state.add_cluster_event(
|
|
108
|
+
str(cluster_name), status_lib.ClusterStatus.INIT,
|
|
109
|
+
f'Instances launched on {cloud.display_name()} in {region}',
|
|
110
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
111
|
+
|
|
98
112
|
return provision_record
|
|
99
113
|
|
|
100
114
|
|
|
@@ -117,7 +131,7 @@ def bulk_provision(
|
|
|
117
131
|
Cloud specific exceptions: If the provisioning process failed, cloud-
|
|
118
132
|
specific exceptions will be raised by the cloud APIs.
|
|
119
133
|
"""
|
|
120
|
-
original_config =
|
|
134
|
+
original_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
|
|
121
135
|
head_node_type = original_config['head_node_type']
|
|
122
136
|
bootstrap_config = provision_common.ProvisionConfig(
|
|
123
137
|
provider_config=original_config['provider'],
|
|
@@ -155,7 +169,7 @@ def bulk_provision(
|
|
|
155
169
|
# This error is a user error instead of a provisioning failure.
|
|
156
170
|
# And there is no possibility to fix it by teardown.
|
|
157
171
|
raise
|
|
158
|
-
except Exception: # pylint: disable=broad-except
|
|
172
|
+
except Exception as exc: # pylint: disable=broad-except
|
|
159
173
|
zone_str = 'all zones'
|
|
160
174
|
if zones:
|
|
161
175
|
zone_str = ','.join(zone.name for zone in zones)
|
|
@@ -177,14 +191,18 @@ def bulk_provision(
|
|
|
177
191
|
provider_config=original_config['provider'])
|
|
178
192
|
break
|
|
179
193
|
except NotImplementedError as e:
|
|
180
|
-
|
|
194
|
+
assert not terminate, (
|
|
195
|
+
'Terminating must be supported by all clouds')
|
|
196
|
+
exc_msg = common_utils.format_exception(exc).replace(
|
|
197
|
+
'\n', ' ')
|
|
181
198
|
# If the underlying cloud does not support stopping
|
|
182
199
|
# instances, we should stop failover as well.
|
|
183
200
|
raise provision_common.StopFailoverError(
|
|
184
|
-
'
|
|
185
|
-
f'
|
|
186
|
-
|
|
187
|
-
f'
|
|
201
|
+
f'Provisioning cluster {cluster_name.display_name} '
|
|
202
|
+
f'failed: {exc_msg}. Failover is stopped for safety '
|
|
203
|
+
'because the cluster was previously in UP state but '
|
|
204
|
+
f'{cloud} does not support stopping instances to '
|
|
205
|
+
'preserve the cluster state. Please try launching the '
|
|
188
206
|
'cluster again, or terminate it with: '
|
|
189
207
|
f'sky down {cluster_name.display_name}') from e
|
|
190
208
|
except Exception as e: # pylint: disable=broad-except
|
|
@@ -228,9 +246,9 @@ def _ssh_probe_command(ip: str,
|
|
|
228
246
|
ssh_port: int,
|
|
229
247
|
ssh_user: str,
|
|
230
248
|
ssh_private_key: str,
|
|
249
|
+
ssh_probe_timeout: int,
|
|
231
250
|
ssh_proxy_command: Optional[str] = None) -> List[str]:
|
|
232
|
-
# NOTE: Ray uses 'uptime' command
|
|
233
|
-
# setting here.
|
|
251
|
+
# NOTE: Ray uses 'uptime' command, we use the same setting here.
|
|
234
252
|
command = [
|
|
235
253
|
'ssh',
|
|
236
254
|
'-T',
|
|
@@ -244,7 +262,7 @@ def _ssh_probe_command(ip: str,
|
|
|
244
262
|
'-o',
|
|
245
263
|
'PasswordAuthentication=no',
|
|
246
264
|
'-o',
|
|
247
|
-
'ConnectTimeout=
|
|
265
|
+
f'ConnectTimeout={ssh_probe_timeout}s',
|
|
248
266
|
'-o',
|
|
249
267
|
f'UserKnownHostsFile={os.devnull}',
|
|
250
268
|
'-o',
|
|
@@ -277,6 +295,7 @@ def _wait_ssh_connection_direct(ip: str,
|
|
|
277
295
|
ssh_port: int,
|
|
278
296
|
ssh_user: str,
|
|
279
297
|
ssh_private_key: str,
|
|
298
|
+
ssh_probe_timeout: int,
|
|
280
299
|
ssh_control_name: Optional[str] = None,
|
|
281
300
|
ssh_proxy_command: Optional[str] = None,
|
|
282
301
|
**kwargs) -> Tuple[bool, str]:
|
|
@@ -305,6 +324,7 @@ def _wait_ssh_connection_direct(ip: str,
|
|
|
305
324
|
if success:
|
|
306
325
|
return _wait_ssh_connection_indirect(ip, ssh_port, ssh_user,
|
|
307
326
|
ssh_private_key,
|
|
327
|
+
ssh_probe_timeout,
|
|
308
328
|
ssh_control_name,
|
|
309
329
|
ssh_proxy_command)
|
|
310
330
|
except socket.timeout: # this is the most expected exception
|
|
@@ -312,7 +332,7 @@ def _wait_ssh_connection_direct(ip: str,
|
|
|
312
332
|
except Exception as e: # pylint: disable=broad-except
|
|
313
333
|
stderr = f'Error: {common_utils.format_exception(e)}'
|
|
314
334
|
command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
|
|
315
|
-
ssh_proxy_command)
|
|
335
|
+
ssh_probe_timeout, ssh_proxy_command)
|
|
316
336
|
logger.debug(f'Waiting for SSH to {ip}. Try: '
|
|
317
337
|
f'{_shlex_join(command)}. '
|
|
318
338
|
f'{stderr}')
|
|
@@ -323,6 +343,7 @@ def _wait_ssh_connection_indirect(ip: str,
|
|
|
323
343
|
ssh_port: int,
|
|
324
344
|
ssh_user: str,
|
|
325
345
|
ssh_private_key: str,
|
|
346
|
+
ssh_probe_timeout: int,
|
|
326
347
|
ssh_control_name: Optional[str] = None,
|
|
327
348
|
ssh_proxy_command: Optional[str] = None,
|
|
328
349
|
**kwargs) -> Tuple[bool, str]:
|
|
@@ -333,14 +354,14 @@ def _wait_ssh_connection_indirect(ip: str,
|
|
|
333
354
|
"""
|
|
334
355
|
del ssh_control_name, kwargs # unused
|
|
335
356
|
command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
|
|
336
|
-
ssh_proxy_command)
|
|
357
|
+
ssh_probe_timeout, ssh_proxy_command)
|
|
337
358
|
message = f'Waiting for SSH using command: {_shlex_join(command)}'
|
|
338
359
|
logger.debug(message)
|
|
339
360
|
try:
|
|
340
361
|
proc = subprocess.run(command,
|
|
341
362
|
shell=False,
|
|
342
363
|
check=False,
|
|
343
|
-
timeout=
|
|
364
|
+
timeout=ssh_probe_timeout,
|
|
344
365
|
stdout=subprocess.DEVNULL,
|
|
345
366
|
stderr=subprocess.PIPE)
|
|
346
367
|
if proc.returncode != 0:
|
|
@@ -383,8 +404,13 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
|
|
|
383
404
|
def _retry_ssh_thread(ip_ssh_port: Tuple[str, int]):
|
|
384
405
|
ip, ssh_port = ip_ssh_port
|
|
385
406
|
success = False
|
|
407
|
+
ssh_probe_timeout = skypilot_config.get_nested(
|
|
408
|
+
('provision', 'ssh_timeout'), 10)
|
|
386
409
|
while not success:
|
|
387
|
-
success, stderr = waiter(ip,
|
|
410
|
+
success, stderr = waiter(ip,
|
|
411
|
+
ssh_port,
|
|
412
|
+
**ssh_credentials,
|
|
413
|
+
ssh_probe_timeout=ssh_probe_timeout)
|
|
388
414
|
if not success and time.time() - start > timeout:
|
|
389
415
|
with ux_utils.print_exception_no_traceback():
|
|
390
416
|
raise RuntimeError(
|
|
@@ -403,16 +429,27 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
|
|
|
403
429
|
|
|
404
430
|
|
|
405
431
|
def _post_provision_setup(
|
|
406
|
-
|
|
407
|
-
|
|
432
|
+
launched_resources: resources_lib.Resources,
|
|
433
|
+
cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
|
|
434
|
+
provision_record: provision_common.ProvisionRecord,
|
|
408
435
|
custom_resource: Optional[str]) -> provision_common.ClusterInfo:
|
|
409
|
-
config_from_yaml =
|
|
436
|
+
config_from_yaml = global_user_state.get_cluster_yaml_dict(
|
|
437
|
+
handle_cluster_yaml)
|
|
410
438
|
provider_config = config_from_yaml.get('provider')
|
|
439
|
+
cloud_name = repr(launched_resources.cloud)
|
|
411
440
|
cluster_info = provision.get_cluster_info(cloud_name,
|
|
412
441
|
provision_record.region,
|
|
413
442
|
cluster_name.name_on_cloud,
|
|
414
443
|
provider_config=provider_config)
|
|
415
444
|
|
|
445
|
+
# Update cluster info in handle so cluster instance ids are set. This
|
|
446
|
+
# allows us to expose provision logs to debug nodes that failed during post
|
|
447
|
+
# provision setup.
|
|
448
|
+
handle = global_user_state.get_handle_from_cluster_name(
|
|
449
|
+
cluster_name.display_name)
|
|
450
|
+
handle.cached_cluster_info = cluster_info
|
|
451
|
+
global_user_state.update_cluster_handle(cluster_name.display_name, handle)
|
|
452
|
+
|
|
416
453
|
if cluster_info.num_instances > 1:
|
|
417
454
|
# Only worker nodes have logs in the per-instance log directory. Head
|
|
418
455
|
# node's log will be redirected to the main log file.
|
|
@@ -437,23 +474,24 @@ def _post_provision_setup(
|
|
|
437
474
|
# TODO(suquark): Move wheel build here in future PRs.
|
|
438
475
|
# We don't set docker_user here, as we are configuring the VM itself.
|
|
439
476
|
ssh_credentials = backend_utils.ssh_credential_from_yaml(
|
|
440
|
-
|
|
477
|
+
handle_cluster_yaml, ssh_user=cluster_info.ssh_user)
|
|
441
478
|
docker_config = config_from_yaml.get('docker', {})
|
|
442
479
|
|
|
443
480
|
with rich_utils.safe_status(
|
|
444
|
-
ux_utils.spinner_message(
|
|
445
|
-
|
|
446
|
-
|
|
481
|
+
ux_utils.spinner_message('Launching - Waiting for SSH access',
|
|
482
|
+
provision_logging.config.log_path,
|
|
483
|
+
cluster_name=str(cluster_name))) as status:
|
|
447
484
|
# If on Kubernetes, skip SSH check since the pods are guaranteed to be
|
|
448
485
|
# ready by the provisioner, and we use kubectl instead of SSH to run the
|
|
449
486
|
# commands and rsync on the pods. SSH will still be ready after a while
|
|
450
487
|
# for the users to SSH into the pod.
|
|
451
|
-
|
|
488
|
+
is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
|
|
489
|
+
if not is_k8s_cloud:
|
|
452
490
|
logger.debug(
|
|
453
491
|
f'\nWaiting for SSH to be available for {cluster_name!r} ...')
|
|
454
492
|
wait_for_ssh(cluster_info, ssh_credentials)
|
|
455
493
|
logger.debug(f'SSH Connection ready for {cluster_name!r}')
|
|
456
|
-
vm_str = 'Instance' if
|
|
494
|
+
vm_str = 'Instance' if not is_k8s_cloud else 'Pod'
|
|
457
495
|
plural = '' if len(cluster_info.instances) == 1 else 's'
|
|
458
496
|
verb = 'is' if len(cluster_info.instances) == 1 else 'are'
|
|
459
497
|
indent_str = (ux_utils.INDENT_SYMBOL
|
|
@@ -472,7 +510,8 @@ def _post_provision_setup(
|
|
|
472
510
|
status.update(
|
|
473
511
|
ux_utils.spinner_message(
|
|
474
512
|
'Launching - Initializing docker container',
|
|
475
|
-
provision_logging.config.log_path
|
|
513
|
+
provision_logging.config.log_path,
|
|
514
|
+
cluster_name=str(cluster_name)))
|
|
476
515
|
docker_user = instance_setup.initialize_docker(
|
|
477
516
|
cluster_name.name_on_cloud,
|
|
478
517
|
docker_config=docker_config,
|
|
@@ -489,6 +528,25 @@ def _post_provision_setup(
|
|
|
489
528
|
logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
|
|
490
529
|
f'Docker container is up.{colorama.Style.RESET_ALL}')
|
|
491
530
|
|
|
531
|
+
# Check version compatibility for jobs controller clusters
|
|
532
|
+
if cluster_name.display_name.startswith(common.JOB_CONTROLLER_PREFIX):
|
|
533
|
+
# TODO(zeping): remove this in v0.12.0
|
|
534
|
+
# This only happens when upgrading from <0.9.3 to >0.10.0
|
|
535
|
+
# After 0.10.0 no incompatibility issue
|
|
536
|
+
# See https://github.com/skypilot-org/skypilot/pull/6096
|
|
537
|
+
# For more details
|
|
538
|
+
status.update(
|
|
539
|
+
ux_utils.spinner_message(
|
|
540
|
+
'Checking controller version compatibility'))
|
|
541
|
+
|
|
542
|
+
try:
|
|
543
|
+
server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
|
|
544
|
+
except exceptions.ClusterNotUpError:
|
|
545
|
+
# Controller is not up yet during initial provisioning, that
|
|
546
|
+
# also means no non-terminal jobs, so no incompatibility in
|
|
547
|
+
# this case.
|
|
548
|
+
pass
|
|
549
|
+
|
|
492
550
|
# We mount the metadata with sky wheel for speedup.
|
|
493
551
|
# NOTE: currently we mount all credentials for all nodes, because
|
|
494
552
|
# (1) jobs controllers need permission to launch/down nodes of
|
|
@@ -502,7 +560,8 @@ def _post_provision_setup(
|
|
|
502
560
|
|
|
503
561
|
runtime_preparation_str = (ux_utils.spinner_message(
|
|
504
562
|
'Preparing SkyPilot runtime ({step}/3 - {step_name})',
|
|
505
|
-
provision_logging.config.log_path
|
|
563
|
+
provision_logging.config.log_path,
|
|
564
|
+
cluster_name=str(cluster_name)))
|
|
506
565
|
status.update(
|
|
507
566
|
runtime_preparation_str.format(step=1, step_name='initializing'))
|
|
508
567
|
instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
|
|
@@ -636,19 +695,32 @@ def _post_provision_setup(
|
|
|
636
695
|
logger.debug('Ray cluster is ready. Skip starting ray cluster on '
|
|
637
696
|
'worker nodes.')
|
|
638
697
|
|
|
639
|
-
|
|
640
|
-
|
|
698
|
+
logging_agent = logs.get_logging_agent()
|
|
699
|
+
if logging_agent:
|
|
700
|
+
status.update(
|
|
701
|
+
ux_utils.spinner_message('Setting up logging agent',
|
|
702
|
+
provision_logging.config.log_path,
|
|
703
|
+
cluster_name=str(cluster_name)))
|
|
704
|
+
instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
|
|
705
|
+
cluster_info,
|
|
706
|
+
ssh_credentials)
|
|
707
|
+
|
|
708
|
+
instance_setup.start_skylet_on_head_node(cluster_name, cluster_info,
|
|
709
|
+
ssh_credentials,
|
|
710
|
+
launched_resources)
|
|
641
711
|
|
|
642
712
|
logger.info(
|
|
643
713
|
ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
|
|
644
|
-
provision_logging.config.log_path
|
|
714
|
+
provision_logging.config.log_path,
|
|
715
|
+
cluster_name=str(cluster_name)))
|
|
645
716
|
return cluster_info
|
|
646
717
|
|
|
647
718
|
|
|
648
719
|
@timeline.event
|
|
649
720
|
def post_provision_runtime_setup(
|
|
650
|
-
|
|
651
|
-
|
|
721
|
+
launched_resources: resources_lib.Resources,
|
|
722
|
+
cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
|
|
723
|
+
provision_record: provision_common.ProvisionRecord,
|
|
652
724
|
custom_resource: Optional[str],
|
|
653
725
|
log_dir: str) -> provision_common.ClusterInfo:
|
|
654
726
|
"""Run internal setup commands after provisioning and before user setup.
|
|
@@ -659,6 +731,7 @@ def post_provision_runtime_setup(
|
|
|
659
731
|
and other necessary files to the VM.
|
|
660
732
|
3. Run setup commands to install dependencies.
|
|
661
733
|
4. Start ray cluster and skylet.
|
|
734
|
+
5. (Optional) Setup logging agent.
|
|
662
735
|
|
|
663
736
|
Raises:
|
|
664
737
|
RuntimeError: If the setup process encounters any error.
|
|
@@ -666,11 +739,12 @@ def post_provision_runtime_setup(
|
|
|
666
739
|
with provision_logging.setup_provision_logging(log_dir):
|
|
667
740
|
try:
|
|
668
741
|
logger.debug(_TITLE.format('System Setup After Provision'))
|
|
669
|
-
return _post_provision_setup(
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
742
|
+
return _post_provision_setup(
|
|
743
|
+
launched_resources,
|
|
744
|
+
cluster_name,
|
|
745
|
+
handle_cluster_yaml=handle_cluster_yaml,
|
|
746
|
+
provision_record=provision_record,
|
|
747
|
+
custom_resource=custom_resource)
|
|
674
748
|
except Exception: # pylint: disable=broad-except
|
|
675
749
|
logger.error(
|
|
676
750
|
ux_utils.error_message(
|
sky/provision/runpod/__init__.py
CHANGED
|
@@ -9,3 +9,8 @@ from sky.provision.runpod.instance import run_instances
|
|
|
9
9
|
from sky.provision.runpod.instance import stop_instances
|
|
10
10
|
from sky.provision.runpod.instance import terminate_instances
|
|
11
11
|
from sky.provision.runpod.instance import wait_instances
|
|
12
|
+
from sky.provision.runpod.volume import apply_volume
|
|
13
|
+
from sky.provision.runpod.volume import delete_volume
|
|
14
|
+
from sky.provision.runpod.volume import get_all_volumes_usedby
|
|
15
|
+
from sky.provision.runpod.volume import get_volume_usedby
|
|
16
|
+
from sky.provision.runpod.volume import map_all_volumes_usedby
|
sky/provision/runpod/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""RunPod instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -44,10 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
44
44
|
return head_instance_id
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
47
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
48
48
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
49
49
|
"""Runs instances for the given cluster."""
|
|
50
|
-
|
|
50
|
+
del cluster_name # unused
|
|
51
51
|
pending_status = ['CREATED', 'RESTARTING']
|
|
52
52
|
|
|
53
53
|
while True:
|
|
@@ -80,6 +80,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
80
80
|
created_instance_ids=[])
|
|
81
81
|
|
|
82
82
|
created_instance_ids = []
|
|
83
|
+
volume_mounts = config.node_config.get('VolumeMounts', [])
|
|
84
|
+
network_volume_id = None
|
|
85
|
+
volume_mount_path = None
|
|
86
|
+
if volume_mounts:
|
|
87
|
+
if len(volume_mounts) > 1:
|
|
88
|
+
logger.warning(
|
|
89
|
+
f'RunPod only supports one network volume mount, '
|
|
90
|
+
f'but {len(volume_mounts)} are specified. Only the first one '
|
|
91
|
+
f'will be used.')
|
|
92
|
+
volume_mount = volume_mounts[0]
|
|
93
|
+
network_volume_id = volume_mount.get('VolumeIdOnCloud')
|
|
94
|
+
volume_mount_path = volume_mount.get('MountPath')
|
|
95
|
+
if network_volume_id is None or volume_mount_path is None:
|
|
96
|
+
raise RuntimeError(
|
|
97
|
+
'Network volume ID and mount path must be specified.')
|
|
83
98
|
for _ in range(to_start_count):
|
|
84
99
|
node_type = 'head' if head_instance_id is None else 'worker'
|
|
85
100
|
try:
|
|
@@ -97,6 +112,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
97
112
|
bid_per_gpu=config.node_config['BidPerGPU'],
|
|
98
113
|
docker_login_config=config.provider_config.get(
|
|
99
114
|
'docker_login_config'),
|
|
115
|
+
network_volume_id=network_volume_id,
|
|
116
|
+
volume_mount_path=volume_mount_path,
|
|
100
117
|
)
|
|
101
118
|
except Exception as e: # pylint: disable=broad-except
|
|
102
119
|
logger.warning(f'run_instances error: {e}')
|
|
@@ -201,11 +218,14 @@ def get_cluster_info(
|
|
|
201
218
|
|
|
202
219
|
|
|
203
220
|
def query_instances(
|
|
221
|
+
cluster_name: str,
|
|
204
222
|
cluster_name_on_cloud: str,
|
|
205
223
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
206
224
|
non_terminated_only: bool = True,
|
|
207
|
-
|
|
225
|
+
retry_if_missing: bool = False,
|
|
226
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
208
227
|
"""See sky/provision/__init__.py"""
|
|
228
|
+
del cluster_name, retry_if_missing # unused
|
|
209
229
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
210
230
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
211
231
|
|
|
@@ -215,12 +235,13 @@ def query_instances(
|
|
|
215
235
|
'PAUSED': status_lib.ClusterStatus.INIT,
|
|
216
236
|
'RUNNING': status_lib.ClusterStatus.UP,
|
|
217
237
|
}
|
|
218
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
238
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
239
|
+
Optional[str]]] = {}
|
|
219
240
|
for inst_id, inst in instances.items():
|
|
220
241
|
status = status_map[inst['status']]
|
|
221
242
|
if non_terminated_only and status is None:
|
|
222
243
|
continue
|
|
223
|
-
statuses[inst_id] = status
|
|
244
|
+
statuses[inst_id] = (status, None)
|
|
224
245
|
return statuses
|
|
225
246
|
|
|
226
247
|
|
sky/provision/runpod/utils.py
CHANGED
|
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
7
7
|
from sky import sky_logging
|
|
8
8
|
from sky.adaptors import runpod
|
|
9
9
|
from sky.provision import docker_utils
|
|
10
|
-
|
|
10
|
+
from sky.provision.runpod.api import commands as runpod_commands
|
|
11
11
|
from sky.skylet import constants
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
|
|
@@ -263,25 +263,36 @@ def _create_template_for_docker_login(
|
|
|
263
263
|
return login_config.format_image(image_name), create_template_resp['id']
|
|
264
264
|
|
|
265
265
|
|
|
266
|
-
def launch(
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
266
|
+
def launch(
|
|
267
|
+
cluster_name: str,
|
|
268
|
+
node_type: str,
|
|
269
|
+
instance_type: str,
|
|
270
|
+
region: str,
|
|
271
|
+
zone: str,
|
|
272
|
+
disk_size: int,
|
|
273
|
+
image_name: str,
|
|
274
|
+
ports: Optional[List[int]],
|
|
275
|
+
public_key: str,
|
|
276
|
+
preemptible: Optional[bool],
|
|
277
|
+
bid_per_gpu: float,
|
|
278
|
+
docker_login_config: Optional[Dict[str, str]],
|
|
279
|
+
*,
|
|
280
|
+
network_volume_id: Optional[str] = None,
|
|
281
|
+
volume_mount_path: Optional[str] = None,
|
|
282
|
+
) -> str:
|
|
271
283
|
"""Launches an instance with the given parameters.
|
|
272
284
|
|
|
273
|
-
|
|
274
|
-
|
|
285
|
+
For CPU instances, we directly use the instance_type for launching the
|
|
286
|
+
instance.
|
|
287
|
+
|
|
288
|
+
For GPU instances, we convert the instance_type to the RunPod GPU name,
|
|
289
|
+
and find the specs for the GPU, before launching the instance.
|
|
275
290
|
|
|
276
291
|
Returns:
|
|
277
292
|
instance_id: The instance ID.
|
|
278
293
|
"""
|
|
279
294
|
name = f'{cluster_name}-{node_type}'
|
|
280
|
-
gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
|
|
281
|
-
gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
|
|
282
|
-
cloud_type = instance_type.split('_')[2]
|
|
283
295
|
|
|
284
|
-
gpu_specs = runpod.runpod.get_gpu(gpu_type)
|
|
285
296
|
# TODO(zhwu): keep this align with setups in
|
|
286
297
|
# `provision.kubernetes.instance.py`
|
|
287
298
|
setup_cmd = (
|
|
@@ -329,12 +340,7 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
|
|
|
329
340
|
params = {
|
|
330
341
|
'name': name,
|
|
331
342
|
'image_name': image_name_formatted,
|
|
332
|
-
'gpu_type_id': gpu_type,
|
|
333
|
-
'cloud_type': cloud_type,
|
|
334
343
|
'container_disk_in_gb': disk_size,
|
|
335
|
-
'min_vcpu_count': 4 * gpu_quantity,
|
|
336
|
-
'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
|
|
337
|
-
'gpu_count': gpu_quantity,
|
|
338
344
|
'country_code': region,
|
|
339
345
|
'data_center_id': zone,
|
|
340
346
|
'ports': ports_str,
|
|
@@ -343,12 +349,39 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
|
|
|
343
349
|
'template_id': template_id,
|
|
344
350
|
}
|
|
345
351
|
|
|
352
|
+
# Optional network volume mount.
|
|
353
|
+
if volume_mount_path is not None:
|
|
354
|
+
params['volume_mount_path'] = volume_mount_path
|
|
355
|
+
if network_volume_id is not None:
|
|
356
|
+
params['network_volume_id'] = network_volume_id
|
|
357
|
+
|
|
358
|
+
# GPU instance types start with f'{gpu_count}x',
|
|
359
|
+
# CPU instance types start with 'cpu'.
|
|
360
|
+
is_cpu_instance = instance_type.startswith('cpu')
|
|
361
|
+
if is_cpu_instance:
|
|
362
|
+
# RunPod CPU instances can be uniquely identified by the instance_id.
|
|
363
|
+
params.update({
|
|
364
|
+
'instance_id': instance_type,
|
|
365
|
+
})
|
|
366
|
+
else:
|
|
367
|
+
gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
|
|
368
|
+
gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
|
|
369
|
+
cloud_type = instance_type.split('_')[2]
|
|
370
|
+
gpu_specs = runpod.runpod.get_gpu(gpu_type)
|
|
371
|
+
params.update({
|
|
372
|
+
'gpu_type_id': gpu_type,
|
|
373
|
+
'cloud_type': cloud_type,
|
|
374
|
+
'min_vcpu_count': 4 * gpu_quantity,
|
|
375
|
+
'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
|
|
376
|
+
'gpu_count': gpu_quantity,
|
|
377
|
+
})
|
|
378
|
+
|
|
346
379
|
if preemptible is None or not preemptible:
|
|
347
380
|
new_instance = runpod.runpod.create_pod(**params)
|
|
348
381
|
else:
|
|
349
382
|
new_instance = runpod_commands.create_spot_pod(
|
|
350
383
|
bid_per_gpu=bid_per_gpu,
|
|
351
|
-
**params,
|
|
384
|
+
**params, # type: ignore[arg-type]
|
|
352
385
|
)
|
|
353
386
|
|
|
354
387
|
return new_instance['id']
|