skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/provision/common.py
CHANGED
|
@@ -6,6 +6,7 @@ import os
|
|
|
6
6
|
from typing import Any, Dict, List, Optional, Tuple
|
|
7
7
|
|
|
8
8
|
from sky import sky_logging
|
|
9
|
+
from sky.utils import env_options
|
|
9
10
|
from sky.utils import resources_utils
|
|
10
11
|
|
|
11
12
|
# NOTE: we can use pydantic instead of dataclasses or namedtuples, because
|
|
@@ -96,6 +97,8 @@ class InstanceInfo:
|
|
|
96
97
|
external_ip: Optional[str]
|
|
97
98
|
tags: Dict[str, str]
|
|
98
99
|
ssh_port: int = 22
|
|
100
|
+
# The internal service address of the instance on Kubernetes.
|
|
101
|
+
internal_svc: Optional[str] = None
|
|
99
102
|
|
|
100
103
|
def get_feasible_ip(self) -> str:
|
|
101
104
|
"""Get the most feasible IPs of the instance. This function returns
|
|
@@ -238,12 +241,21 @@ class Endpoint:
|
|
|
238
241
|
|
|
239
242
|
@dataclasses.dataclass
|
|
240
243
|
class SocketEndpoint(Endpoint):
|
|
241
|
-
"""Socket endpoint
|
|
244
|
+
"""Socket endpoint accessible via a host and a port."""
|
|
242
245
|
port: Optional[int]
|
|
243
246
|
host: str = ''
|
|
244
247
|
|
|
245
248
|
def url(self, override_ip: Optional[str] = None) -> str:
|
|
246
249
|
host = override_ip if override_ip else self.host
|
|
250
|
+
if env_options.Options.RUNNING_IN_BUILDKITE.get(
|
|
251
|
+
) and 'localhost' in host:
|
|
252
|
+
# In Buildkite CI, we run a kind (Kubernetes in Docker) cluster.
|
|
253
|
+
# The controller pod runs inside this kind cluster, which itself
|
|
254
|
+
# runs in a container. When the pod tries to access 'localhost',
|
|
255
|
+
# it can't reach the host machine's localhost. Using
|
|
256
|
+
# 'host.docker.internal' allows the pod to properly communicate
|
|
257
|
+
# with services running on the host machine's localhost.
|
|
258
|
+
host = 'host.docker.internal'
|
|
247
259
|
return f'{host}{":" + str(self.port) if self.port else ""}'
|
|
248
260
|
|
|
249
261
|
|
sky/provision/cudo/cudo_utils.py
CHANGED
|
@@ -1,22 +1,28 @@
|
|
|
1
1
|
"""Cudo catalog helper."""
|
|
2
2
|
|
|
3
3
|
cudo_gpu_model = {
|
|
4
|
-
'
|
|
5
|
-
'
|
|
6
|
-
'
|
|
7
|
-
'
|
|
8
|
-
'
|
|
4
|
+
'H100 NVL': 'H100',
|
|
5
|
+
'H100 SXM': 'H100-SXM',
|
|
6
|
+
'L40S (compute mode)': 'L40S',
|
|
7
|
+
'L40S (graphics mode)': 'L40S',
|
|
8
|
+
'A40 (compute mode)': 'A40',
|
|
9
|
+
'A40 (graphics mode)': 'A40',
|
|
9
10
|
'RTX A5000': 'RTXA5000',
|
|
10
11
|
'RTX A6000': 'RTXA6000',
|
|
12
|
+
'A100 80GB PCIe': 'A100',
|
|
13
|
+
'A800 PCIe': 'A800',
|
|
14
|
+
'V100': 'V100',
|
|
11
15
|
}
|
|
12
16
|
|
|
13
17
|
cudo_gpu_mem = {
|
|
14
|
-
'
|
|
18
|
+
'H100': 94,
|
|
19
|
+
'H100-SXM': 80,
|
|
20
|
+
'L40S': 48,
|
|
15
21
|
'A40': 48,
|
|
16
|
-
'RTXA4000': 16,
|
|
17
|
-
'RTXA4500': 20,
|
|
18
22
|
'RTXA5000': 24,
|
|
19
23
|
'RTXA6000': 48,
|
|
24
|
+
'A100': 80,
|
|
25
|
+
'A800': 80,
|
|
20
26
|
'V100': 16,
|
|
21
27
|
}
|
|
22
28
|
|
|
@@ -4,7 +4,7 @@ from typing import Dict
|
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.adaptors import cudo
|
|
7
|
-
|
|
7
|
+
from sky.provision.cudo import cudo_utils as utils
|
|
8
8
|
|
|
9
9
|
logger = sky_logging.init_logger(__name__)
|
|
10
10
|
|
|
@@ -28,12 +28,10 @@ def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
|
|
|
28
28
|
size_gib=disk_size),
|
|
29
29
|
metadata=tags)
|
|
30
30
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
except cudo.cudo.rest.ApiException as e:
|
|
36
|
-
raise e
|
|
31
|
+
api = cudo.cudo.cudo_api.virtual_machines()
|
|
32
|
+
vm = api.create_vm(cudo.cudo.cudo_api.project_id_throwable(), request)
|
|
33
|
+
|
|
34
|
+
return vm.to_dict()['id']
|
|
37
35
|
|
|
38
36
|
|
|
39
37
|
def remove(instance_id: str):
|
|
@@ -54,11 +52,8 @@ def remove(instance_id: str):
|
|
|
54
52
|
state = 'unknown'
|
|
55
53
|
project_id = cudo.cudo.cudo_api.project_id_throwable()
|
|
56
54
|
while retry_count < max_retries:
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
state = vm.to_dict()['vm']['short_state']
|
|
60
|
-
except cudo.cudo.rest.ApiException as e:
|
|
61
|
-
raise e
|
|
55
|
+
vm = api.get_vm(project_id, instance_id)
|
|
56
|
+
state = vm.to_dict()['vm']['short_state']
|
|
62
57
|
|
|
63
58
|
if state in terminate_ok:
|
|
64
59
|
break
|
|
@@ -69,76 +64,82 @@ def remove(instance_id: str):
|
|
|
69
64
|
'Timeout error, could not terminate due to VM state: {}'.format(
|
|
70
65
|
state))
|
|
71
66
|
|
|
72
|
-
|
|
73
|
-
api.terminate_vm(project_id, instance_id)
|
|
74
|
-
except cudo.cudo.rest.ApiException as e:
|
|
75
|
-
raise e
|
|
67
|
+
api.terminate_vm(project_id, instance_id)
|
|
76
68
|
|
|
77
69
|
|
|
78
70
|
def set_tags(instance_id: str, tags: Dict):
|
|
79
71
|
"""Sets the tags for the given instance."""
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
merge=True)) # TODO (skypilot team) merge or overwrite?
|
|
87
|
-
except cudo.cudo.rest.ApiException as e:
|
|
88
|
-
raise e
|
|
72
|
+
api = cudo.cudo.cudo_api.virtual_machines()
|
|
73
|
+
api.update_vm_metadata(
|
|
74
|
+
cudo.cudo.cudo_api.project_id(), instance_id,
|
|
75
|
+
cudo.cudo.UpdateVMMetadataBody(
|
|
76
|
+
metadata=tags,
|
|
77
|
+
merge=True)) # TODO (skypilot team) merge or overwrite?
|
|
89
78
|
|
|
90
79
|
|
|
91
80
|
def get_instance(vm_id):
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
return vm_dict
|
|
97
|
-
except cudo.cudo.rest.ApiException as e:
|
|
98
|
-
raise e
|
|
81
|
+
api = cudo.cudo.cudo_api.virtual_machines()
|
|
82
|
+
vm = api.get_vm(cudo.cudo.cudo_api.project_id_throwable(), vm_id)
|
|
83
|
+
vm_dict = vm.to_dict()
|
|
84
|
+
return vm_dict
|
|
99
85
|
|
|
100
86
|
|
|
101
87
|
def list_instances():
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
return instances
|
|
122
|
-
except cudo.cudo.rest.ApiException as e:
|
|
123
|
-
raise e
|
|
88
|
+
api = cudo.cudo.cudo_api.virtual_machines()
|
|
89
|
+
vms = api.list_vms(cudo.cudo.cudo_api.project_id_throwable())
|
|
90
|
+
instances = {}
|
|
91
|
+
for vm in vms.to_dict()['vms']:
|
|
92
|
+
ex_ip = vm['external_ip_address']
|
|
93
|
+
in_ip = vm['internal_ip_address']
|
|
94
|
+
if not in_ip:
|
|
95
|
+
in_ip = ex_ip
|
|
96
|
+
instance = {
|
|
97
|
+
# active_state, init_state, lcm_state, short_state
|
|
98
|
+
'status': vm['short_state'],
|
|
99
|
+
'tags': vm['metadata'],
|
|
100
|
+
'name': vm['id'],
|
|
101
|
+
'ip': ex_ip,
|
|
102
|
+
'external_ip': ex_ip,
|
|
103
|
+
'internal_ip': in_ip
|
|
104
|
+
}
|
|
105
|
+
instances[vm['id']] = instance
|
|
106
|
+
return instances
|
|
124
107
|
|
|
125
108
|
|
|
126
109
|
def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
|
|
127
110
|
cpus):
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
111
|
+
gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
|
|
112
|
+
api = cudo.cudo.cudo_api.virtual_machines()
|
|
113
|
+
types = api.list_vm_machine_types2()
|
|
114
|
+
types_dict = types.to_dict()
|
|
115
|
+
machine_types = types_dict['machine_types']
|
|
116
|
+
|
|
117
|
+
# Filter machine types based on requirements
|
|
118
|
+
matching_types = []
|
|
119
|
+
for machine_type in machine_types:
|
|
120
|
+
# Check if this machine type matches our requirements
|
|
121
|
+
if (machine_type['data_center_id'] == data_center_id and
|
|
122
|
+
machine_type['gpu_model'] == gpu_model and
|
|
123
|
+
machine_type['min_vcpu'] <= cpus <= machine_type.get(
|
|
124
|
+
'max_vcpu_free', float('inf')) and
|
|
125
|
+
machine_type['min_memory_gib'] <= mem <= machine_type.get(
|
|
126
|
+
'max_memory_gib_free', float('inf'))):
|
|
127
|
+
|
|
128
|
+
# Calculate available VMs based on resource constraints
|
|
129
|
+
max_vms_by_vcpu = machine_type[
|
|
130
|
+
'total_vcpu_free'] // cpus if cpus > 0 else float('inf')
|
|
131
|
+
max_vms_by_memory = machine_type[
|
|
132
|
+
'total_memory_gib_free'] // mem if mem > 0 else float('inf')
|
|
133
|
+
max_vms_by_gpu = machine_type[
|
|
134
|
+
'total_gpu_free'] // gpu_count if gpu_count > 0 else float(
|
|
135
|
+
'inf')
|
|
136
|
+
|
|
137
|
+
available_vms = min(max_vms_by_vcpu, max_vms_by_memory,
|
|
138
|
+
max_vms_by_gpu)
|
|
139
|
+
matching_types.append(available_vms)
|
|
140
|
+
|
|
141
|
+
total_count = sum(matching_types)
|
|
142
|
+
if total_count < to_start_count:
|
|
143
|
+
raise Exception(
|
|
144
|
+
'Too many VMs requested, try another gpu type or region')
|
|
145
|
+
return total_count
|
sky/provision/cudo/instance.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Cudo Compute instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.provision import common
|
|
@@ -40,10 +40,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
40
40
|
return head_instance_id
|
|
41
41
|
|
|
42
42
|
|
|
43
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
43
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
44
44
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
45
45
|
"""Runs instances for the given cluster."""
|
|
46
|
-
|
|
46
|
+
del cluster_name # unused
|
|
47
47
|
pending_status = ['pend', 'init', 'prol', 'boot']
|
|
48
48
|
|
|
49
49
|
while True:
|
|
@@ -191,11 +191,14 @@ def get_cluster_info(
|
|
|
191
191
|
|
|
192
192
|
|
|
193
193
|
def query_instances(
|
|
194
|
+
cluster_name: str,
|
|
194
195
|
cluster_name_on_cloud: str,
|
|
195
196
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
196
197
|
non_terminated_only: bool = True,
|
|
197
|
-
|
|
198
|
+
retry_if_missing: bool = False,
|
|
199
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
198
200
|
"""See sky/provision/__init__.py"""
|
|
201
|
+
del cluster_name, retry_if_missing # unused
|
|
199
202
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
200
203
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
201
204
|
|
|
@@ -210,12 +213,13 @@ def query_instances(
|
|
|
210
213
|
'done': status_lib.ClusterStatus.STOPPED,
|
|
211
214
|
'poff': status_lib.ClusterStatus.STOPPED,
|
|
212
215
|
}
|
|
213
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
216
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
217
|
+
Optional[str]]] = {}
|
|
214
218
|
for inst_id, inst in instances.items():
|
|
215
219
|
status = status_map[inst['status']]
|
|
216
220
|
if non_terminated_only and status is None:
|
|
217
221
|
continue
|
|
218
|
-
statuses[inst_id] = status
|
|
222
|
+
statuses[inst_id] = (status, None)
|
|
219
223
|
return statuses
|
|
220
224
|
|
|
221
225
|
|
sky/provision/do/instance.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""DigitalOcean instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
import uuid
|
|
6
6
|
|
|
7
7
|
from sky import sky_logging
|
|
@@ -26,10 +26,10 @@ def _get_head_instance(
|
|
|
26
26
|
return None
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
29
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
30
30
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
31
31
|
"""Runs instances for the given cluster."""
|
|
32
|
-
|
|
32
|
+
del cluster_name # unused
|
|
33
33
|
pending_status = ['new']
|
|
34
34
|
newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
|
|
35
35
|
pending_status + ['off'])
|
|
@@ -242,11 +242,14 @@ def get_cluster_info(
|
|
|
242
242
|
|
|
243
243
|
|
|
244
244
|
def query_instances(
|
|
245
|
+
cluster_name: str,
|
|
245
246
|
cluster_name_on_cloud: str,
|
|
246
247
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
247
248
|
non_terminated_only: bool = True,
|
|
248
|
-
|
|
249
|
+
retry_if_missing: bool = False,
|
|
250
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
249
251
|
"""See sky/provision/__init__.py"""
|
|
252
|
+
del cluster_name, retry_if_missing # unused
|
|
250
253
|
# terminated instances are not retrieved by the
|
|
251
254
|
# API making `non_terminated_only` argument moot.
|
|
252
255
|
del non_terminated_only
|
|
@@ -260,10 +263,11 @@ def query_instances(
|
|
|
260
263
|
'active': status_lib.ClusterStatus.UP,
|
|
261
264
|
'off': status_lib.ClusterStatus.STOPPED,
|
|
262
265
|
}
|
|
263
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
266
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
267
|
+
Optional[str]]] = {}
|
|
264
268
|
for instance_meta in instances.values():
|
|
265
269
|
status = status_map[instance_meta['status']]
|
|
266
|
-
statuses[instance_meta['name']] = status
|
|
270
|
+
statuses[instance_meta['name']] = (status, None)
|
|
267
271
|
return statuses
|
|
268
272
|
|
|
269
273
|
|
sky/provision/do/utils.py
CHANGED
|
@@ -17,6 +17,7 @@ from sky.provision import constants as provision_constants
|
|
|
17
17
|
from sky.provision.do import constants
|
|
18
18
|
from sky.utils import annotations
|
|
19
19
|
from sky.utils import common_utils
|
|
20
|
+
from sky.utils import yaml_utils
|
|
20
21
|
|
|
21
22
|
logger = sky_logging.init_logger(__name__)
|
|
22
23
|
|
|
@@ -30,7 +31,7 @@ POSSIBLE_CREDENTIALS_PATHS = [
|
|
|
30
31
|
INITIAL_BACKOFF_SECONDS = 10
|
|
31
32
|
MAX_BACKOFF_FACTOR = 10
|
|
32
33
|
MAX_ATTEMPTS = 6
|
|
33
|
-
|
|
34
|
+
SSH_KEY_NAME_ON_DO_PREFIX = 'sky-key-'
|
|
34
35
|
|
|
35
36
|
_client = None
|
|
36
37
|
_ssh_key_id = None
|
|
@@ -61,7 +62,7 @@ def _init_client():
|
|
|
61
62
|
if get_credentials_path() is None:
|
|
62
63
|
raise DigitalOceanError(
|
|
63
64
|
'No credentials found, please run `doctl auth init`')
|
|
64
|
-
credentials =
|
|
65
|
+
credentials = yaml_utils.read_yaml(get_credentials_path())
|
|
65
66
|
default_token = credentials.get('access-token', None)
|
|
66
67
|
if default_token is not None:
|
|
67
68
|
try:
|
|
@@ -125,7 +126,7 @@ def ssh_key_id(public_key: str):
|
|
|
125
126
|
|
|
126
127
|
request = {
|
|
127
128
|
'public_key': public_key,
|
|
128
|
-
'name':
|
|
129
|
+
'name': SSH_KEY_NAME_ON_DO_PREFIX + common_utils.get_user_hash(),
|
|
129
130
|
}
|
|
130
131
|
_ssh_key_id = client().ssh_keys.create(body=request)['ssh_key']
|
|
131
132
|
return _ssh_key_id
|
sky/provision/docker_utils.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import shlex
|
|
5
5
|
import time
|
|
6
|
-
from typing import Any, Dict, List
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
7
|
|
|
8
8
|
from sky import sky_logging
|
|
9
9
|
from sky.skylet import constants
|
|
@@ -15,10 +15,14 @@ logger = sky_logging.init_logger(__name__)
|
|
|
15
15
|
# Configure environment variables. A docker image can have environment variables
|
|
16
16
|
# set in the Dockerfile with `ENV``. We need to export these variables to the
|
|
17
17
|
# shell environment, so that our ssh session can access them.
|
|
18
|
+
# Filter out RAY_RUNTIME_ENV_HOOK to prevent Ray version conflicts.
|
|
19
|
+
# Docker images with Ray 2.48.0+ set this for UV package manager support,
|
|
20
|
+
# but it causes FAILED_DRIVER errors with SkyPilot's Ray 2.9.3.
|
|
21
|
+
# See: https://github.com/skypilot-org/skypilot/pull/7181
|
|
18
22
|
SETUP_ENV_VARS_CMD = (
|
|
19
23
|
'prefix_cmd() '
|
|
20
24
|
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
|
|
21
|
-
'export -p > ~/container_env_var.sh && '
|
|
25
|
+
'export -p | grep -v RAY_RUNTIME_ENV_HOOK > ~/container_env_var.sh && '
|
|
22
26
|
'$(prefix_cmd) '
|
|
23
27
|
'mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;')
|
|
24
28
|
|
|
@@ -32,6 +36,30 @@ DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
|
|
|
32
36
|
|
|
33
37
|
_DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
|
|
34
38
|
|
|
39
|
+
# Install AWS CLI v2 (not v1 from pip) as it's required for ECR authentication
|
|
40
|
+
# AWS CLI v2 is installed as a standalone binary, not a Python package. See:
|
|
41
|
+
# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
|
|
42
|
+
INSTALL_AWS_CLI_CMD = (
|
|
43
|
+
'which aws || ((command -v unzip >/dev/null 2>&1 || '
|
|
44
|
+
'(sudo apt-get update && sudo apt-get install -y unzip)) && '
|
|
45
|
+
'curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" '
|
|
46
|
+
'-o "/tmp/awscliv2.zip" && '
|
|
47
|
+
'unzip -q /tmp/awscliv2.zip -d /tmp && sudo /tmp/aws/install '
|
|
48
|
+
'&& rm -rf /tmp/awscliv2.zip /tmp/aws)')
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _extract_region_from_ecr_server(server: str) -> str:
|
|
52
|
+
"""Extract AWS region from ECR server URL.
|
|
53
|
+
|
|
54
|
+
ECR server format: <account-id>.dkr.ecr.<region>.amazonaws.com
|
|
55
|
+
Returns the region part from the URL.
|
|
56
|
+
"""
|
|
57
|
+
# Split: ['<account-id>', 'dkr', 'ecr', '<region>', 'amazonaws', 'com']
|
|
58
|
+
parts = server.split('.')
|
|
59
|
+
if len(parts) >= 6 and parts[1] == 'dkr' and parts[2] == 'ecr':
|
|
60
|
+
return parts[3]
|
|
61
|
+
raise ValueError(f'Invalid ECR server format: {server}')
|
|
62
|
+
|
|
35
63
|
|
|
36
64
|
@dataclasses.dataclass
|
|
37
65
|
class DockerLoginConfig:
|
|
@@ -83,6 +111,21 @@ def check_docker_image(cname, docker_cmd):
|
|
|
83
111
|
return _check_helper(cname, '.Config.Image', docker_cmd)
|
|
84
112
|
|
|
85
113
|
|
|
114
|
+
def maybe_remove_container_cmds(container_name, docker_cmd):
|
|
115
|
+
"""Remove the container if it exists. If not, it will be a no-op.
|
|
116
|
+
"""
|
|
117
|
+
docker_rm = [
|
|
118
|
+
docker_cmd,
|
|
119
|
+
'rm',
|
|
120
|
+
'-f',
|
|
121
|
+
container_name,
|
|
122
|
+
'2>/dev/null',
|
|
123
|
+
'||',
|
|
124
|
+
'true',
|
|
125
|
+
]
|
|
126
|
+
return ' '.join(docker_rm)
|
|
127
|
+
|
|
128
|
+
|
|
86
129
|
def docker_start_cmds(
|
|
87
130
|
image,
|
|
88
131
|
container_name,
|
|
@@ -149,12 +192,16 @@ class DockerInitializer:
|
|
|
149
192
|
self.docker_cmd = 'podman' if use_podman else 'docker'
|
|
150
193
|
self.log_path = log_path
|
|
151
194
|
|
|
152
|
-
def _run(
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
195
|
+
def _run(
|
|
196
|
+
self,
|
|
197
|
+
cmd,
|
|
198
|
+
run_env='host',
|
|
199
|
+
wait_for_docker_daemon: bool = False,
|
|
200
|
+
separate_stderr: bool = False,
|
|
201
|
+
log_err_when_fail: bool = True,
|
|
202
|
+
flock_name: Optional[str] = None,
|
|
203
|
+
flock_args: Optional[str] = None,
|
|
204
|
+
) -> str:
|
|
158
205
|
|
|
159
206
|
if run_env == 'docker':
|
|
160
207
|
cmd = self._docker_expand_user(cmd, any_char=True)
|
|
@@ -163,8 +210,13 @@ class DockerInitializer:
|
|
|
163
210
|
# an error: `the input device is not a TTY`, and it works without
|
|
164
211
|
# `-it` flag.
|
|
165
212
|
# TODO(zhwu): ray use the `-it` flag, we need to check why.
|
|
166
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
167
|
-
f' {shlex.quote(cmd)} ')
|
|
213
|
+
cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
|
|
214
|
+
f' /bin/bash -c {shlex.quote(cmd)} ')
|
|
215
|
+
|
|
216
|
+
if flock_name is not None:
|
|
217
|
+
flock_args = flock_args or ''
|
|
218
|
+
cmd = (f'flock {flock_args} /tmp/{flock_name} '
|
|
219
|
+
f'-c {shlex.quote(cmd)}')
|
|
168
220
|
|
|
169
221
|
logger.debug(f'+ {cmd}')
|
|
170
222
|
start = time.time()
|
|
@@ -216,14 +268,17 @@ class DockerInitializer:
|
|
|
216
268
|
if self._check_container_exited():
|
|
217
269
|
self.initialized = True
|
|
218
270
|
self._run(f'{self.docker_cmd} start {self.container_name}')
|
|
219
|
-
self._run('sudo service ssh start',
|
|
271
|
+
self._run('sudo service ssh start',
|
|
272
|
+
run_env='docker',
|
|
273
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
274
|
+
flock_args='-s -w 1')
|
|
220
275
|
return self._run('whoami', run_env='docker')
|
|
221
276
|
|
|
222
277
|
# SkyPilot: Docker login if user specified a private docker registry.
|
|
223
278
|
if 'docker_login_config' in self.docker_config:
|
|
224
|
-
# TODO(tian): Maybe support a command to get the login password?
|
|
225
279
|
docker_login_config = DockerLoginConfig(
|
|
226
280
|
**self.docker_config['docker_login_config'])
|
|
281
|
+
|
|
227
282
|
if docker_login_config.password:
|
|
228
283
|
# Password is allowed to be empty, in that case, we will not run
|
|
229
284
|
# the login command, and assume that the image pulling is
|
|
@@ -234,6 +289,25 @@ class DockerInitializer:
|
|
|
234
289
|
f'--password {shlex.quote(docker_login_config.password)} '
|
|
235
290
|
f'{shlex.quote(docker_login_config.server)}',
|
|
236
291
|
wait_for_docker_daemon=True)
|
|
292
|
+
elif (docker_login_config.server.endswith('.amazonaws.com') and
|
|
293
|
+
'.dkr.ecr.' in docker_login_config.server):
|
|
294
|
+
# AWS ECR: Use aws ecr get-login-password for authentication
|
|
295
|
+
# ECR format: <account-id>.dkr.ecr.<region>.amazonaws.com
|
|
296
|
+
# This command uses the IAM credentials from the EC2 instance
|
|
297
|
+
# Ref: https://docs.aws.amazon.com/AmazonECR/latest/userguide/registry_auth.html # pylint: disable=line-too-long
|
|
298
|
+
region = _extract_region_from_ecr_server(
|
|
299
|
+
docker_login_config.server)
|
|
300
|
+
|
|
301
|
+
# AWS CLI is not pre-installed on AWS instances, unlike gcloud
|
|
302
|
+
# on GCP instances, so we need to install it first
|
|
303
|
+
self._run(INSTALL_AWS_CLI_CMD, wait_for_docker_daemon=False)
|
|
304
|
+
|
|
305
|
+
self._run(
|
|
306
|
+
f'aws ecr get-login-password --region {region} | '
|
|
307
|
+
f'{self.docker_cmd} login --username AWS '
|
|
308
|
+
f'--password-stdin '
|
|
309
|
+
f'{shlex.quote(docker_login_config.server)}',
|
|
310
|
+
wait_for_docker_daemon=True)
|
|
237
311
|
elif docker_login_config.server.endswith('-docker.pkg.dev'):
|
|
238
312
|
# Docker image server is on GCR, we need to do additional setup
|
|
239
313
|
# to pull the image.
|
|
@@ -285,6 +359,10 @@ class DockerInitializer:
|
|
|
285
359
|
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
|
|
286
360
|
'sudo systemctl restart docker; } || true')
|
|
287
361
|
user_docker_run_options = self.docker_config.get('run_options', [])
|
|
362
|
+
remove_container_cmd = maybe_remove_container_cmds(
|
|
363
|
+
self.container_name,
|
|
364
|
+
self.docker_cmd,
|
|
365
|
+
)
|
|
288
366
|
start_command = docker_start_cmds(
|
|
289
367
|
specific_image,
|
|
290
368
|
self.container_name,
|
|
@@ -292,7 +370,9 @@ class DockerInitializer:
|
|
|
292
370
|
self._auto_configure_shm(user_docker_run_options)),
|
|
293
371
|
self.docker_cmd,
|
|
294
372
|
)
|
|
295
|
-
self._run(start_command
|
|
373
|
+
self._run(f'{remove_container_cmd} && {start_command}',
|
|
374
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
375
|
+
flock_args='-x -w 10')
|
|
296
376
|
|
|
297
377
|
# SkyPilot: Setup Commands.
|
|
298
378
|
# TODO(zhwu): the following setups should be aligned with the kubernetes
|
|
@@ -310,14 +390,18 @@ class DockerInitializer:
|
|
|
310
390
|
'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
|
|
311
391
|
run_env='docker')
|
|
312
392
|
# Install dependencies.
|
|
313
|
-
|
|
314
|
-
'
|
|
393
|
+
cmd = (
|
|
394
|
+
'bash -lc \''
|
|
395
|
+
'exec 200>/var/tmp/sky_apt.lock; '
|
|
396
|
+
'flock -x -w 120 200 || exit 1; '
|
|
397
|
+
'export DEBIAN_FRONTEND=noninteractive; '
|
|
398
|
+
'apt-get -yq update && '
|
|
315
399
|
# Our mount script will install gcsfuse without fuse package.
|
|
316
400
|
# We need to install fuse package first to enable storage mount.
|
|
317
401
|
# The dpkg option is to suppress the prompt for fuse installation.
|
|
318
|
-
'
|
|
319
|
-
'rsync curl wget patch openssh-server python3-pip fuse
|
|
320
|
-
|
|
402
|
+
'apt-get -o DPkg::Options::=--force-confnew install -y '
|
|
403
|
+
'rsync curl wget patch openssh-server python3-pip fuse\'')
|
|
404
|
+
self._run(cmd, run_env='docker')
|
|
321
405
|
|
|
322
406
|
# Copy local authorized_keys to docker container.
|
|
323
407
|
# Stop and disable jupyter service. This is to avoid port conflict on
|
|
@@ -343,13 +427,16 @@ class DockerInitializer:
|
|
|
343
427
|
# `mesg: ttyname failed: inappropriate ioctl for device`.
|
|
344
428
|
# see https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
|
|
345
429
|
port = constants.DEFAULT_DOCKER_PORT
|
|
430
|
+
# In case the port is already configured in the sshd_config file
|
|
431
|
+
# in some images, we delete it first and then append the new one.
|
|
346
432
|
# pylint: disable=anomalous-backslash-in-string
|
|
347
433
|
self._run(
|
|
348
|
-
|
|
434
|
+
'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
|
|
435
|
+
f'echo "Port {port}" | sudo tee -a /etc/ssh/sshd_config > /dev/null;'
|
|
349
436
|
'mkdir -p ~/.ssh;'
|
|
350
437
|
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
|
|
351
438
|
'sudo service ssh start;'
|
|
352
|
-
'sudo sed -i "s/mesg n/tty -s
|
|
439
|
+
'sudo sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;'
|
|
353
440
|
f'{SETUP_ENV_VARS_CMD}',
|
|
354
441
|
run_env='docker')
|
|
355
442
|
|
|
@@ -390,9 +477,13 @@ class DockerInitializer:
|
|
|
390
477
|
user_pos = string.find('~')
|
|
391
478
|
if user_pos > -1:
|
|
392
479
|
if self.home_dir is None:
|
|
393
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
394
|
-
'printenv HOME')
|
|
395
|
-
self.home_dir = self._run(
|
|
480
|
+
cmd = (f'{self.docker_cmd} exec {self.container_name}'
|
|
481
|
+
' printenv HOME')
|
|
482
|
+
self.home_dir = self._run(
|
|
483
|
+
cmd,
|
|
484
|
+
separate_stderr=True,
|
|
485
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
486
|
+
flock_args='-s -w 1')
|
|
396
487
|
# Check for unexpected newline in home directory, which can be
|
|
397
488
|
# a common issue when the output is mixed with stderr.
|
|
398
489
|
assert '\n' not in self.home_dir, (
|