skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/optimizer.py
CHANGED
|
@@ -14,6 +14,7 @@ from sky import clouds
|
|
|
14
14
|
from sky import exceptions
|
|
15
15
|
from sky import resources as resources_lib
|
|
16
16
|
from sky import sky_logging
|
|
17
|
+
from sky import skypilot_config
|
|
17
18
|
from sky import task as task_lib
|
|
18
19
|
from sky.adaptors import common as adaptors_common
|
|
19
20
|
from sky.clouds import cloud as sky_cloud
|
|
@@ -21,6 +22,7 @@ from sky.usage import usage_lib
|
|
|
21
22
|
from sky.utils import common
|
|
22
23
|
from sky.utils import env_options
|
|
23
24
|
from sky.utils import log_utils
|
|
25
|
+
from sky.utils import registry
|
|
24
26
|
from sky.utils import resources_utils
|
|
25
27
|
from sky.utils import rich_utils
|
|
26
28
|
from sky.utils import subprocess_utils
|
|
@@ -73,8 +75,8 @@ class Optimizer:
|
|
|
73
75
|
def _egress_cost(src_cloud: clouds.Cloud, dst_cloud: clouds.Cloud,
|
|
74
76
|
gigabytes: float) -> float:
|
|
75
77
|
"""Returns estimated egress cost."""
|
|
76
|
-
if isinstance(src_cloud, DummyCloud) or isinstance(
|
|
77
|
-
dst_cloud, DummyCloud):
|
|
78
|
+
if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
|
|
79
|
+
dst_cloud, clouds.DummyCloud):
|
|
78
80
|
return 0.0
|
|
79
81
|
|
|
80
82
|
if not src_cloud.is_same_cloud(dst_cloud):
|
|
@@ -88,8 +90,8 @@ class Optimizer:
|
|
|
88
90
|
gigabytes: float) -> float:
|
|
89
91
|
"""Returns estimated egress time in seconds."""
|
|
90
92
|
# FIXME: estimate bandwidth between each cloud-region pair.
|
|
91
|
-
if isinstance(src_cloud, DummyCloud) or isinstance(
|
|
92
|
-
dst_cloud, DummyCloud):
|
|
93
|
+
if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
|
|
94
|
+
dst_cloud, clouds.DummyCloud):
|
|
93
95
|
return 0.0
|
|
94
96
|
if not src_cloud.is_same_cloud(dst_cloud):
|
|
95
97
|
# 10Gbps is close to the average of observed b/w from S3
|
|
@@ -167,7 +169,7 @@ class Optimizer:
|
|
|
167
169
|
|
|
168
170
|
def make_dummy(name):
|
|
169
171
|
dummy = task_lib.Task(name)
|
|
170
|
-
dummy.set_resources({DummyResources(DummyCloud()
|
|
172
|
+
dummy.set_resources({DummyResources(cloud=clouds.DummyCloud())})
|
|
171
173
|
dummy.set_time_estimator(lambda _: 0)
|
|
172
174
|
return dummy
|
|
173
175
|
|
|
@@ -197,7 +199,7 @@ class Optimizer:
|
|
|
197
199
|
node: task_lib.Task,
|
|
198
200
|
resources: resources_lib.Resources,
|
|
199
201
|
) -> Tuple[Optional[clouds.Cloud], Optional[clouds.Cloud], Optional[float]]:
|
|
200
|
-
if isinstance(parent_resources.cloud, DummyCloud):
|
|
202
|
+
if isinstance(parent_resources.cloud, clouds.DummyCloud):
|
|
201
203
|
# Special case. The current 'node' is a real
|
|
202
204
|
# source node, and its input may be on a different
|
|
203
205
|
# cloud from 'resources'.
|
|
@@ -321,10 +323,10 @@ class Optimizer:
|
|
|
321
323
|
estimated_runtime = 1 * 3600
|
|
322
324
|
else:
|
|
323
325
|
# We assume the time estimator takes in a partial resource
|
|
324
|
-
# Resources('V100')
|
|
326
|
+
# Resources(accelerators='V100')
|
|
325
327
|
# and treats their launchable versions
|
|
326
|
-
# Resources(
|
|
327
|
-
# Resources(
|
|
328
|
+
# Resources(infra='aws', instance_type='p3.2xlarge'),
|
|
329
|
+
# Resources(infra='gcp', accelerators='V100'),
|
|
328
330
|
# ...
|
|
329
331
|
# as having the same run time.
|
|
330
332
|
# FIXME(zongheng): take 'num_nodes' as an arg/into
|
|
@@ -376,6 +378,10 @@ class Optimizer:
|
|
|
376
378
|
if any(orig_resources.cloud is None
|
|
377
379
|
for orig_resources in node.resources):
|
|
378
380
|
source_hint = 'catalog and kubernetes cluster'
|
|
381
|
+
elif all(
|
|
382
|
+
isinstance(orig_resources.cloud, clouds.SSH)
|
|
383
|
+
for orig_resources in node.resources):
|
|
384
|
+
source_hint = 'node pool'
|
|
379
385
|
elif all(
|
|
380
386
|
isinstance(orig_resources.cloud, clouds.Kubernetes)
|
|
381
387
|
for orig_resources in node.resources):
|
|
@@ -671,7 +677,7 @@ class Optimizer:
|
|
|
671
677
|
plan: Dict[task_lib.Task, resources_lib.Resources],
|
|
672
678
|
) -> float:
|
|
673
679
|
"""Estimates the total cost of running the DAG by the plan."""
|
|
674
|
-
total_cost = 0
|
|
680
|
+
total_cost = 0.
|
|
675
681
|
for node in topo_order:
|
|
676
682
|
resources = plan[node]
|
|
677
683
|
if node.time_estimator_func is None:
|
|
@@ -772,15 +778,27 @@ class Optimizer:
|
|
|
772
778
|
f'{colorama.Style.BRIGHT}Estimated total cost: '
|
|
773
779
|
f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
|
|
774
780
|
|
|
781
|
+
def _instance_type_str(resources: 'resources_lib.Resources') -> str:
|
|
782
|
+
instance_type = resources.instance_type
|
|
783
|
+
assert instance_type is not None, 'Instance type must be specified'
|
|
784
|
+
if isinstance(resources.cloud, clouds.Kubernetes):
|
|
785
|
+
instance_type = '-'
|
|
786
|
+
if resources.use_spot:
|
|
787
|
+
instance_type = ''
|
|
788
|
+
return instance_type
|
|
789
|
+
|
|
775
790
|
def _get_resources_element_list(
|
|
776
791
|
resources: 'resources_lib.Resources') -> List[str]:
|
|
777
792
|
accelerators = resources.get_accelerators_str()
|
|
778
793
|
spot = resources.get_spot_str()
|
|
779
794
|
cloud = resources.cloud
|
|
780
|
-
|
|
795
|
+
assert cloud is not None, 'Cloud must be specified'
|
|
796
|
+
assert (resources.instance_type is not None), \
|
|
797
|
+
'Instance type must be specified'
|
|
798
|
+
vcpus_, mem_ = cloud.get_vcpus_mem_from_instance_type(
|
|
781
799
|
resources.instance_type)
|
|
782
800
|
|
|
783
|
-
def format_number(x):
|
|
801
|
+
def format_number(x: Optional[float]) -> str:
|
|
784
802
|
if x is None:
|
|
785
803
|
return '-'
|
|
786
804
|
elif x.is_integer():
|
|
@@ -788,25 +806,23 @@ class Optimizer:
|
|
|
788
806
|
else:
|
|
789
807
|
return f'{x:.1f}'
|
|
790
808
|
|
|
791
|
-
vcpus = format_number(
|
|
792
|
-
mem = format_number(
|
|
809
|
+
vcpus = format_number(vcpus_)
|
|
810
|
+
mem = format_number(mem_)
|
|
811
|
+
|
|
812
|
+
# Format infra as CLOUD (REGION/ZONE)
|
|
813
|
+
infra = resources.infra.formatted_str()
|
|
793
814
|
|
|
794
|
-
if resources.zone is None:
|
|
795
|
-
region_or_zone = resources.region
|
|
796
|
-
else:
|
|
797
|
-
region_or_zone = resources.zone
|
|
798
815
|
return [
|
|
799
|
-
|
|
800
|
-
resources
|
|
816
|
+
infra,
|
|
817
|
+
_instance_type_str(resources) + spot,
|
|
801
818
|
vcpus,
|
|
802
819
|
mem,
|
|
803
820
|
str(accelerators),
|
|
804
|
-
str(region_or_zone),
|
|
805
821
|
]
|
|
806
822
|
|
|
807
823
|
Row = collections.namedtuple('Row', [
|
|
808
|
-
'
|
|
809
|
-
'
|
|
824
|
+
'infra', 'instance', 'vcpus', 'mem', 'accelerators', 'cost_str',
|
|
825
|
+
'chosen_str'
|
|
810
826
|
])
|
|
811
827
|
|
|
812
828
|
def _get_resources_named_tuple(resources: 'resources_lib.Resources',
|
|
@@ -814,11 +830,12 @@ class Optimizer:
|
|
|
814
830
|
|
|
815
831
|
accelerators = resources.get_accelerators_str()
|
|
816
832
|
spot = resources.get_spot_str()
|
|
833
|
+
resources = resources.assert_launchable()
|
|
817
834
|
cloud = resources.cloud
|
|
818
|
-
|
|
835
|
+
vcpus_, mem_ = cloud.get_vcpus_mem_from_instance_type(
|
|
819
836
|
resources.instance_type)
|
|
820
837
|
|
|
821
|
-
def format_number(x):
|
|
838
|
+
def format_number(x: Optional[float]) -> str:
|
|
822
839
|
if x is None:
|
|
823
840
|
return '-'
|
|
824
841
|
elif x.is_integer():
|
|
@@ -826,21 +843,18 @@ class Optimizer:
|
|
|
826
843
|
else:
|
|
827
844
|
return f'{x:.1f}'
|
|
828
845
|
|
|
829
|
-
vcpus = format_number(
|
|
830
|
-
mem = format_number(
|
|
846
|
+
vcpus = format_number(vcpus_)
|
|
847
|
+
mem = format_number(mem_)
|
|
831
848
|
|
|
832
|
-
|
|
833
|
-
region_or_zone = resources.region
|
|
834
|
-
else:
|
|
835
|
-
region_or_zone = resources.zone
|
|
849
|
+
infra = resources.infra.formatted_str()
|
|
836
850
|
|
|
837
851
|
chosen_str = ''
|
|
838
852
|
if chosen:
|
|
839
853
|
chosen_str = (colorama.Fore.GREEN + ' ' + '\u2714' +
|
|
840
854
|
colorama.Style.RESET_ALL)
|
|
841
|
-
row = Row(
|
|
842
|
-
|
|
843
|
-
chosen_str)
|
|
855
|
+
row = Row(infra,
|
|
856
|
+
_instance_type_str(resources) + spot, vcpus, mem,
|
|
857
|
+
str(accelerators), cost_str, chosen_str)
|
|
844
858
|
|
|
845
859
|
return row
|
|
846
860
|
|
|
@@ -850,18 +864,23 @@ class Optimizer:
|
|
|
850
864
|
'accelerators': f'{resources.accelerators}',
|
|
851
865
|
'use_spot': resources.use_spot
|
|
852
866
|
}
|
|
867
|
+
|
|
868
|
+
# Handle special case for Kubernetes and SSH clouds
|
|
853
869
|
if isinstance(resources.cloud, clouds.Kubernetes):
|
|
854
|
-
# Region for Kubernetes
|
|
855
|
-
# Kubernetes clusters. We add
|
|
856
|
-
#
|
|
870
|
+
# Region for Kubernetes-like clouds (SSH, Kubernetes) is the
|
|
871
|
+
# context name, i.e. different Kubernetes clusters. We add
|
|
872
|
+
# region to the key to show all the Kubernetes clusters in the
|
|
873
|
+
# optimizer table for better UX.
|
|
874
|
+
|
|
875
|
+
if resources.cloud.__class__.__name__ == 'SSH':
|
|
876
|
+
resource_key_dict[
|
|
877
|
+
'cloud'] = 'SSH' # Force the cloud name to be SSH
|
|
857
878
|
resource_key_dict['region'] = resources.region
|
|
879
|
+
|
|
858
880
|
return json.dumps(resource_key_dict, sort_keys=True)
|
|
859
881
|
|
|
860
882
|
# Print the list of resouces that the optimizer considered.
|
|
861
|
-
resource_fields = [
|
|
862
|
-
'CLOUD', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'ACCELERATORS',
|
|
863
|
-
'REGION/ZONE'
|
|
864
|
-
]
|
|
883
|
+
resource_fields = ['INFRA', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'GPUS']
|
|
865
884
|
if len(ordered_best_plan) > 1:
|
|
866
885
|
best_plan_rows = []
|
|
867
886
|
for t, r in ordered_best_plan.items():
|
|
@@ -978,24 +997,36 @@ class Optimizer:
|
|
|
978
997
|
@staticmethod
|
|
979
998
|
def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates):
|
|
980
999
|
for node, candidate_set in node_to_candidate_map.items():
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
accelerator = list(node.resources)[0].accelerators
|
|
1000
|
+
best_resources = node.best_resources
|
|
1001
|
+
if best_resources is None:
|
|
1002
|
+
best_resources = list(node.resources)[0]
|
|
985
1003
|
is_multi_instances = False
|
|
986
|
-
if
|
|
987
|
-
acc_name, acc_count = list(
|
|
1004
|
+
if best_resources.accelerators:
|
|
1005
|
+
acc_name, acc_count = list(
|
|
1006
|
+
best_resources.accelerators.items())[0]
|
|
988
1007
|
for cloud, candidate_list in candidate_set.items():
|
|
989
|
-
|
|
1008
|
+
# Filter only the candidates matching the best
|
|
1009
|
+
# resources chosen by the optimizer.
|
|
1010
|
+
best_resources_candidates = [
|
|
1011
|
+
res for res in candidate_list if
|
|
1012
|
+
res.get_accelerators_str() == f'{acc_name}:{acc_count}'
|
|
1013
|
+
]
|
|
1014
|
+
if len(best_resources_candidates) > 1:
|
|
990
1015
|
is_multi_instances = True
|
|
991
|
-
instance_list = [
|
|
992
|
-
res.instance_type
|
|
993
|
-
|
|
1016
|
+
instance_list = set([
|
|
1017
|
+
res.instance_type
|
|
1018
|
+
for res in best_resources_candidates
|
|
1019
|
+
if res.instance_type is not None
|
|
1020
|
+
])
|
|
1021
|
+
candidate_str = resources_utils.format_resource(
|
|
1022
|
+
best_resources, simplified_only=True)[0]
|
|
1023
|
+
|
|
994
1024
|
logger.info(
|
|
995
|
-
f'Multiple {cloud} instances
|
|
996
|
-
f'{acc_name}:{int(acc_count)}. '
|
|
997
|
-
f'The cheapest {
|
|
998
|
-
f'among
|
|
1025
|
+
f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
|
|
1026
|
+
f'satisfy {acc_name}:{int(acc_count)}. '
|
|
1027
|
+
f'The cheapest {candidate_str} is considered '
|
|
1028
|
+
f'among: {", ".join(instance_list)}.'
|
|
1029
|
+
f'{colorama.Style.RESET_ALL}')
|
|
999
1030
|
if is_multi_instances:
|
|
1000
1031
|
logger.info(
|
|
1001
1032
|
f'To list more details, run: sky show-gpus {acc_name}\n')
|
|
@@ -1147,11 +1178,6 @@ class DummyResources(resources_lib.Resources):
|
|
|
1147
1178
|
return 0
|
|
1148
1179
|
|
|
1149
1180
|
|
|
1150
|
-
class DummyCloud(clouds.Cloud):
|
|
1151
|
-
"""A dummy Cloud that has zero egress cost from/to."""
|
|
1152
|
-
pass
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
1181
|
def _filter_out_blocked_launchable_resources(
|
|
1156
1182
|
launchable_resources: Iterable[resources_lib.Resources],
|
|
1157
1183
|
blocked_resources: Iterable[resources_lib.Resources]):
|
|
@@ -1195,10 +1221,14 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
|
|
|
1195
1221
|
all_clouds_specified.add(cloud_str)
|
|
1196
1222
|
|
|
1197
1223
|
# Explicitly check again to update the enabled cloud list.
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1224
|
+
clouds_to_check_again = list(clouds_need_recheck -
|
|
1225
|
+
global_disabled_clouds)
|
|
1226
|
+
if len(clouds_to_check_again) > 0:
|
|
1227
|
+
sky_check.check_capability(
|
|
1228
|
+
sky_cloud.CloudCapability.COMPUTE,
|
|
1229
|
+
quiet=True,
|
|
1230
|
+
clouds=clouds_to_check_again,
|
|
1231
|
+
workspace=skypilot_config.get_active_workspace())
|
|
1202
1232
|
enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
|
|
1203
1233
|
capability=sky_cloud.CloudCapability.COMPUTE,
|
|
1204
1234
|
raise_if_no_cloud_access=True)
|
|
@@ -1208,7 +1238,13 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
|
|
|
1208
1238
|
if disabled_clouds:
|
|
1209
1239
|
is_or_are = 'is' if len(disabled_clouds) == 1 else 'are'
|
|
1210
1240
|
task_name = f' {task.name!r}' if task.name is not None else ''
|
|
1211
|
-
|
|
1241
|
+
disabled_display_names = []
|
|
1242
|
+
for c in disabled_clouds:
|
|
1243
|
+
cloud_obj_one = registry.CLOUD_REGISTRY.from_str(c)
|
|
1244
|
+
if cloud_obj_one is not None:
|
|
1245
|
+
disabled_display_names.append(cloud_obj_one.display_name())
|
|
1246
|
+
cloud_names = ', '.join(disabled_display_names)
|
|
1247
|
+
msg = (f'Task{task_name} requires {cloud_names} '
|
|
1212
1248
|
f'which {is_or_are} not enabled. To enable access, change '
|
|
1213
1249
|
f'the task cloud requirement or run: {colorama.Style.BRIGHT}'
|
|
1214
1250
|
f'sky check {" ".join(c.lower() for c in disabled_clouds)}'
|
|
@@ -1222,6 +1258,62 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
|
|
|
1222
1258
|
logger.warning(
|
|
1223
1259
|
f'{colorama.Fore.YELLOW}{msg}{colorama.Style.RESET_ALL}')
|
|
1224
1260
|
|
|
1261
|
+
_check_specified_regions(task)
|
|
1262
|
+
|
|
1263
|
+
|
|
1264
|
+
def _check_specified_regions(task: task_lib.Task) -> None:
|
|
1265
|
+
"""Check if specified regions (Kubernetes/SSH contexts) are enabled.
|
|
1266
|
+
|
|
1267
|
+
Args:
|
|
1268
|
+
task: The task to check.
|
|
1269
|
+
"""
|
|
1270
|
+
# Only check for Kubernetes/SSH for now
|
|
1271
|
+
# Below check works because SSH inherits Kubernetes cloud.
|
|
1272
|
+
if not all(
|
|
1273
|
+
isinstance(resources.cloud, clouds.Kubernetes)
|
|
1274
|
+
for resources in task.resources):
|
|
1275
|
+
return
|
|
1276
|
+
# Kubernetes region is a context if set
|
|
1277
|
+
for resources in task.resources:
|
|
1278
|
+
if resources.region is None:
|
|
1279
|
+
continue
|
|
1280
|
+
|
|
1281
|
+
is_ssh = isinstance(resources.cloud, clouds.SSH)
|
|
1282
|
+
if is_ssh:
|
|
1283
|
+
existing_contexts = clouds.SSH.existing_allowed_contexts()
|
|
1284
|
+
else:
|
|
1285
|
+
existing_contexts = clouds.Kubernetes.existing_allowed_contexts()
|
|
1286
|
+
|
|
1287
|
+
region = resources.region
|
|
1288
|
+
task_name = f' {task.name!r}' if task.name is not None else ''
|
|
1289
|
+
msg = f'Task{task_name} requires '
|
|
1290
|
+
if region not in existing_contexts:
|
|
1291
|
+
if is_ssh:
|
|
1292
|
+
infra_str = f'SSH/{region.lstrip("ssh-")}'
|
|
1293
|
+
else:
|
|
1294
|
+
infra_str = f'Kubernetes/{region}'
|
|
1295
|
+
logger.warning(f'{infra_str} is not enabled.')
|
|
1296
|
+
volume_mounts_str = ''
|
|
1297
|
+
if task.volume_mounts:
|
|
1298
|
+
if len(task.volume_mounts) > 1:
|
|
1299
|
+
volume_mounts_str += 'volumes '
|
|
1300
|
+
else:
|
|
1301
|
+
volume_mounts_str += 'volume '
|
|
1302
|
+
volume_mounts_str += ', '.join(
|
|
1303
|
+
[f'{v.volume_name}' for v in task.volume_mounts])
|
|
1304
|
+
volume_mounts_str += f' with infra {infra_str}'
|
|
1305
|
+
if volume_mounts_str:
|
|
1306
|
+
msg += volume_mounts_str
|
|
1307
|
+
else:
|
|
1308
|
+
msg += f'infra {infra_str}'
|
|
1309
|
+
msg += (
|
|
1310
|
+
f' which is not enabled. To enable access, change '
|
|
1311
|
+
f'the task infra requirement or run: {colorama.Style.BRIGHT}'
|
|
1312
|
+
f'sky check {colorama.Style.RESET_ALL}'
|
|
1313
|
+
f'to ensure the infra is enabled.')
|
|
1314
|
+
with ux_utils.print_exception_no_traceback():
|
|
1315
|
+
raise exceptions.ResourcesUnavailableError(msg)
|
|
1316
|
+
|
|
1225
1317
|
|
|
1226
1318
|
def _fill_in_launchable_resources(
|
|
1227
1319
|
task: task_lib.Task,
|
|
@@ -1251,8 +1343,7 @@ def _fill_in_launchable_resources(
|
|
|
1251
1343
|
launchable: Dict[resources_lib.Resources, List[resources_lib.Resources]] = (
|
|
1252
1344
|
collections.defaultdict(list))
|
|
1253
1345
|
all_fuzzy_candidates = set()
|
|
1254
|
-
cloud_candidates: _PerCloudCandidates = collections.defaultdict(
|
|
1255
|
-
List[resources_lib.Resources])
|
|
1346
|
+
cloud_candidates: _PerCloudCandidates = collections.defaultdict(list)
|
|
1256
1347
|
resource_hints: Dict[resources_lib.Resources,
|
|
1257
1348
|
List[str]] = collections.defaultdict(list)
|
|
1258
1349
|
if blocked_resources is None:
|
|
@@ -1283,13 +1374,16 @@ def _fill_in_launchable_resources(
|
|
|
1283
1374
|
if feasible_resources.resources_list:
|
|
1284
1375
|
# Assume feasible_resources is sorted by prices. Guaranteed by
|
|
1285
1376
|
# the implementation of get_feasible_launchable_resources and
|
|
1286
|
-
# the underlying
|
|
1377
|
+
# the underlying catalog filtering
|
|
1287
1378
|
cheapest = feasible_resources.resources_list[0]
|
|
1288
1379
|
# Generate region/zone-specified resources.
|
|
1289
1380
|
launchable[resources].extend(
|
|
1290
1381
|
resources_utils.make_launchables_for_valid_region_zones(
|
|
1291
1382
|
cheapest))
|
|
1292
|
-
|
|
1383
|
+
# Each cloud can occur multiple times in feasible_list,
|
|
1384
|
+
# for different region/zone.
|
|
1385
|
+
cloud_candidates[cloud].extend(
|
|
1386
|
+
feasible_resources.resources_list)
|
|
1293
1387
|
else:
|
|
1294
1388
|
all_fuzzy_candidates.update(
|
|
1295
1389
|
feasible_resources.fuzzy_candidate_list)
|
|
@@ -1299,7 +1393,7 @@ def _fill_in_launchable_resources(
|
|
|
1299
1393
|
num_node_str = ''
|
|
1300
1394
|
if task.num_nodes > 1:
|
|
1301
1395
|
num_node_str = f'{task.num_nodes}x '
|
|
1302
|
-
if not quiet:
|
|
1396
|
+
if not (quiet or resources.no_missing_accel_warnings):
|
|
1303
1397
|
logger.info(
|
|
1304
1398
|
f'No resource satisfying {num_node_str}'
|
|
1305
1399
|
f'{resources.repr_with_region_zone} on {clouds_str}.')
|
sky/provision/__init__.py
CHANGED
|
@@ -6,8 +6,9 @@ providers supported by SkyPilot need to follow.
|
|
|
6
6
|
import functools
|
|
7
7
|
import inspect
|
|
8
8
|
import typing
|
|
9
|
-
from typing import Any, Dict, List, Optional, Type
|
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Type
|
|
10
10
|
|
|
11
|
+
from sky import models
|
|
11
12
|
from sky import sky_logging
|
|
12
13
|
# These provision.<cloud> modules should never fail even if underlying cloud SDK
|
|
13
14
|
# dependencies are not installed. This is ensured by using sky.adaptors inside
|
|
@@ -18,11 +19,17 @@ from sky.provision import common
|
|
|
18
19
|
from sky.provision import cudo
|
|
19
20
|
from sky.provision import fluidstack
|
|
20
21
|
from sky.provision import gcp
|
|
22
|
+
from sky.provision import hyperbolic
|
|
21
23
|
from sky.provision import kubernetes
|
|
22
24
|
from sky.provision import lambda_cloud
|
|
23
25
|
from sky.provision import nebius
|
|
24
26
|
from sky.provision import oci
|
|
27
|
+
from sky.provision import primeintellect
|
|
25
28
|
from sky.provision import runpod
|
|
29
|
+
from sky.provision import scp
|
|
30
|
+
from sky.provision import seeweb
|
|
31
|
+
from sky.provision import shadeform
|
|
32
|
+
from sky.provision import ssh
|
|
26
33
|
from sky.provision import vast
|
|
27
34
|
from sky.provision import vsphere
|
|
28
35
|
from sky.utils import command_runner
|
|
@@ -69,16 +76,24 @@ def _route_to_cloud_impl(func):
|
|
|
69
76
|
@_route_to_cloud_impl
|
|
70
77
|
def query_instances(
|
|
71
78
|
provider_name: str,
|
|
79
|
+
cluster_name: str,
|
|
72
80
|
cluster_name_on_cloud: str,
|
|
73
81
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
74
82
|
non_terminated_only: bool = True,
|
|
75
|
-
|
|
83
|
+
retry_if_missing: bool = False,
|
|
84
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
76
85
|
"""Query instances.
|
|
77
86
|
|
|
78
|
-
Returns a dictionary of instance IDs and status
|
|
87
|
+
Returns a dictionary of instance IDs and a tuple of (status, reason for
|
|
88
|
+
being in status if any).
|
|
79
89
|
|
|
80
90
|
A None status means the instance is marked as "terminated"
|
|
81
91
|
or "terminating".
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
retry_if_missing: Whether to retry the call to the cloud api if the
|
|
95
|
+
cluster is not found when querying the live status on the cloud.
|
|
96
|
+
NOTE: This is currently only used on kubernetes.
|
|
82
97
|
"""
|
|
83
98
|
raise NotImplementedError
|
|
84
99
|
|
|
@@ -101,7 +116,67 @@ def bootstrap_instances(
|
|
|
101
116
|
|
|
102
117
|
|
|
103
118
|
@_route_to_cloud_impl
|
|
104
|
-
def
|
|
119
|
+
def apply_volume(provider_name: str,
|
|
120
|
+
volume_config: models.VolumeConfig) -> models.VolumeConfig:
|
|
121
|
+
"""Create or register a volume.
|
|
122
|
+
|
|
123
|
+
This function creates or registers a volume with the provided configuration,
|
|
124
|
+
and returns a VolumeConfig object with updated configuration.
|
|
125
|
+
"""
|
|
126
|
+
raise NotImplementedError
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@_route_to_cloud_impl
|
|
130
|
+
def delete_volume(provider_name: str,
|
|
131
|
+
volume_config: models.VolumeConfig) -> models.VolumeConfig:
|
|
132
|
+
"""Delete a volume."""
|
|
133
|
+
raise NotImplementedError
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@_route_to_cloud_impl
|
|
137
|
+
def get_volume_usedby(
|
|
138
|
+
provider_name: str,
|
|
139
|
+
volume_config: models.VolumeConfig,
|
|
140
|
+
) -> Tuple[List[str], List[str]]:
|
|
141
|
+
"""Get the usedby of a volume.
|
|
142
|
+
|
|
143
|
+
Returns:
|
|
144
|
+
usedby_pods: List of pods using the volume. These may include pods
|
|
145
|
+
not created by SkyPilot.
|
|
146
|
+
usedby_clusters: List of clusters using the volume.
|
|
147
|
+
"""
|
|
148
|
+
raise NotImplementedError
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@_route_to_cloud_impl
|
|
152
|
+
def get_all_volumes_usedby(
|
|
153
|
+
provider_name: str, configs: List[models.VolumeConfig]
|
|
154
|
+
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
|
155
|
+
"""Get the usedby of a volume.
|
|
156
|
+
|
|
157
|
+
Returns:
|
|
158
|
+
usedby_pods: List of dictionaries, each containing the config keys for
|
|
159
|
+
a volume and a key containing pods using the volume.
|
|
160
|
+
These may include pods not created by SkyPilot.
|
|
161
|
+
usedby_clusters: List of dictionaries, each containing the config keys
|
|
162
|
+
for a volume and a key containing clusters using
|
|
163
|
+
the volume.
|
|
164
|
+
"""
|
|
165
|
+
raise NotImplementedError
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@_route_to_cloud_impl
|
|
169
|
+
def map_all_volumes_usedby(
|
|
170
|
+
provider_name: str, used_by_pods: Dict[str, Any],
|
|
171
|
+
used_by_clusters: Dict[str, Any],
|
|
172
|
+
config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
|
|
173
|
+
"""Map the usedby resources of a volume."""
|
|
174
|
+
raise NotImplementedError
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@_route_to_cloud_impl
|
|
178
|
+
def run_instances(provider_name: str, region: str, cluster_name: str,
|
|
179
|
+
cluster_name_on_cloud: str,
|
|
105
180
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
106
181
|
"""Start instances with bootstrapped configuration."""
|
|
107
182
|
raise NotImplementedError
|
|
@@ -129,6 +204,17 @@ def terminate_instances(
|
|
|
129
204
|
raise NotImplementedError
|
|
130
205
|
|
|
131
206
|
|
|
207
|
+
@_route_to_cloud_impl
|
|
208
|
+
def cleanup_custom_multi_network(
|
|
209
|
+
provider_name: str,
|
|
210
|
+
cluster_name_on_cloud: str,
|
|
211
|
+
provider_config: Dict[str, Any],
|
|
212
|
+
failover: bool = False,
|
|
213
|
+
) -> None:
|
|
214
|
+
"""Cleanup custom multi-network."""
|
|
215
|
+
raise NotImplementedError
|
|
216
|
+
|
|
217
|
+
|
|
132
218
|
@_route_to_cloud_impl
|
|
133
219
|
def open_ports(
|
|
134
220
|
provider_name: str,
|