skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/cloud.py
CHANGED
|
@@ -11,13 +11,14 @@ import collections
|
|
|
11
11
|
import enum
|
|
12
12
|
import math
|
|
13
13
|
import typing
|
|
14
|
-
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple,
|
|
14
|
+
from typing import (Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple,
|
|
15
|
+
Union)
|
|
15
16
|
|
|
16
17
|
from typing_extensions import assert_never
|
|
17
18
|
|
|
19
|
+
from sky import catalog
|
|
18
20
|
from sky import exceptions
|
|
19
21
|
from sky import skypilot_config
|
|
20
|
-
from sky.clouds import service_catalog
|
|
21
22
|
from sky.utils import log_utils
|
|
22
23
|
from sky.utils import resources_utils
|
|
23
24
|
from sky.utils import timeline
|
|
@@ -26,6 +27,7 @@ from sky.utils import ux_utils
|
|
|
26
27
|
if typing.TYPE_CHECKING:
|
|
27
28
|
from sky import resources as resources_lib
|
|
28
29
|
from sky.utils import status_lib
|
|
30
|
+
from sky.utils import volume as volume_lib
|
|
29
31
|
|
|
30
32
|
|
|
31
33
|
class CloudImplementationFeatures(enum.Enum):
|
|
@@ -44,6 +46,7 @@ class CloudImplementationFeatures(enum.Enum):
|
|
|
44
46
|
DOCKER_IMAGE = 'docker_image'
|
|
45
47
|
SPOT_INSTANCE = 'spot_instance'
|
|
46
48
|
CUSTOM_DISK_TIER = 'custom_disk_tier'
|
|
49
|
+
CUSTOM_NETWORK_TIER = 'custom_network_tier'
|
|
47
50
|
OPEN_PORTS = 'open_ports'
|
|
48
51
|
STORAGE_MOUNTING = 'storage_mounting'
|
|
49
52
|
HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
|
|
@@ -52,6 +55,9 @@ class CloudImplementationFeatures(enum.Enum):
|
|
|
52
55
|
AUTO_TERMINATE = 'auto_terminate' # Pod/VM can stop or down itself
|
|
53
56
|
AUTOSTOP = 'autostop' # Pod/VM can stop itself
|
|
54
57
|
AUTODOWN = 'autodown' # Pod/VM can down itself
|
|
58
|
+
# Pod/VM can have customized multiple network interfaces
|
|
59
|
+
# e.g. GCP GPUDirect TCPX
|
|
60
|
+
CUSTOM_MULTI_NETWORK = 'custom_multi_network'
|
|
55
61
|
|
|
56
62
|
|
|
57
63
|
# Use str, enum.Enum to allow CloudCapability to be used as a string.
|
|
@@ -138,6 +144,9 @@ class Cloud:
|
|
|
138
144
|
_DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
|
|
139
145
|
_BEST_DISK_TIER = resources_utils.DiskTier.ULTRA
|
|
140
146
|
_SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST}
|
|
147
|
+
_SUPPORTED_NETWORK_TIERS = {
|
|
148
|
+
resources_utils.NetworkTier.STANDARD, resources_utils.NetworkTier.BEST
|
|
149
|
+
}
|
|
141
150
|
_SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False
|
|
142
151
|
|
|
143
152
|
# The version of provisioner and status query. This is used to determine
|
|
@@ -176,14 +185,19 @@ class Cloud:
|
|
|
176
185
|
#### Regions/Zones ####
|
|
177
186
|
|
|
178
187
|
@classmethod
|
|
179
|
-
def regions_with_offering(
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
188
|
+
def regions_with_offering(
|
|
189
|
+
cls,
|
|
190
|
+
instance_type: str,
|
|
191
|
+
accelerators: Optional[Dict[str, int]],
|
|
192
|
+
use_spot: bool,
|
|
193
|
+
region: Optional[str],
|
|
194
|
+
zone: Optional[str],
|
|
195
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
196
|
+
) -> List[Region]:
|
|
183
197
|
"""Returns the regions that offer the specified resources.
|
|
184
198
|
|
|
185
199
|
The order of the regions follow the order of the regions returned by
|
|
186
|
-
|
|
200
|
+
sky/catalog/common.py#get_region_zones().
|
|
187
201
|
When region or zone is not None, the returned value will be limited to
|
|
188
202
|
the specified region/zone.
|
|
189
203
|
|
|
@@ -302,7 +316,8 @@ class Cloud:
|
|
|
302
316
|
zones: Optional[List['Zone']],
|
|
303
317
|
num_nodes: int,
|
|
304
318
|
dryrun: bool = False,
|
|
305
|
-
|
|
319
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
|
320
|
+
) -> Dict[str, Any]:
|
|
306
321
|
"""Converts planned sky.Resources to cloud-specific resource variables.
|
|
307
322
|
|
|
308
323
|
These variables are used to fill the node type section (instance type,
|
|
@@ -331,14 +346,23 @@ class Cloud:
|
|
|
331
346
|
raise NotImplementedError
|
|
332
347
|
|
|
333
348
|
@classmethod
|
|
334
|
-
def
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
memory: Optional[str] = None,
|
|
338
|
-
disk_tier: Optional[resources_utils.DiskTier] = None
|
|
349
|
+
def get_arch_from_instance_type(
|
|
350
|
+
cls,
|
|
351
|
+
instance_type: str,
|
|
339
352
|
) -> Optional[str]:
|
|
340
|
-
"""Returns the
|
|
341
|
-
|
|
353
|
+
"""Returns the arch of the instance type, if any."""
|
|
354
|
+
raise NotImplementedError
|
|
355
|
+
|
|
356
|
+
@classmethod
|
|
357
|
+
def get_default_instance_type(cls,
|
|
358
|
+
cpus: Optional[str] = None,
|
|
359
|
+
memory: Optional[str] = None,
|
|
360
|
+
disk_tier: Optional[
|
|
361
|
+
resources_utils.DiskTier] = None,
|
|
362
|
+
region: Optional[str] = None,
|
|
363
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
364
|
+
"""Returns the default instance type with the given #vCPUs, memory,
|
|
365
|
+
disk tier, region, and zone.
|
|
342
366
|
|
|
343
367
|
For example, if cpus='4', this method returns the default instance type
|
|
344
368
|
with 4 vCPUs. If cpus='4+', this method returns the default instance
|
|
@@ -362,9 +386,9 @@ class Cloud:
|
|
|
362
386
|
@classmethod
|
|
363
387
|
def is_image_tag_valid(cls, image_tag: str, region: Optional[str]) -> bool:
|
|
364
388
|
"""Validates that the image tag is valid for this cloud."""
|
|
365
|
-
return
|
|
366
|
-
|
|
367
|
-
|
|
389
|
+
return catalog.is_image_tag_valid(image_tag,
|
|
390
|
+
region,
|
|
391
|
+
clouds=cls._REPR.lower())
|
|
368
392
|
|
|
369
393
|
@classmethod
|
|
370
394
|
def is_label_valid(cls, label_key: str,
|
|
@@ -385,6 +409,21 @@ class Cloud:
|
|
|
385
409
|
del label_key, label_value
|
|
386
410
|
return True, None
|
|
387
411
|
|
|
412
|
+
@classmethod
|
|
413
|
+
def is_volume_name_valid(cls,
|
|
414
|
+
volume_name: str) -> Tuple[bool, Optional[str]]:
|
|
415
|
+
"""Validates that the volume name is valid for this cloud.
|
|
416
|
+
|
|
417
|
+
Returns:
|
|
418
|
+
A tuple of a boolean indicating whether the volume name is valid
|
|
419
|
+
and an optional string describing the reason if the volume name
|
|
420
|
+
is invalid.
|
|
421
|
+
"""
|
|
422
|
+
# If a cloud does not support volumes, they are ignored. Only clouds
|
|
423
|
+
# that support volume implement this method.
|
|
424
|
+
del volume_name
|
|
425
|
+
return True, None
|
|
426
|
+
|
|
388
427
|
@timeline.event
|
|
389
428
|
def get_feasible_launchable_resources(
|
|
390
429
|
self,
|
|
@@ -456,12 +495,14 @@ class Cloud:
|
|
|
456
495
|
|
|
457
496
|
@classmethod
|
|
458
497
|
def check_credentials(
|
|
459
|
-
|
|
460
|
-
|
|
498
|
+
cls, cloud_capability: CloudCapability
|
|
499
|
+
) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
461
500
|
"""Checks if the user has access credentials to this cloud.
|
|
462
501
|
|
|
463
|
-
Returns a boolean of whether the user can access this cloud, and
|
|
464
|
-
|
|
502
|
+
Returns a boolean of whether the user can access this cloud, and:
|
|
503
|
+
- For SSH and Kubernetes, a dictionary that maps context names to
|
|
504
|
+
the status of the context.
|
|
505
|
+
- For others, a string describing the reason if cannot access.
|
|
465
506
|
|
|
466
507
|
Raises NotSupportedError if the capability is
|
|
467
508
|
not supported by this cloud.
|
|
@@ -473,19 +514,30 @@ class Cloud:
|
|
|
473
514
|
assert_never(cloud_capability)
|
|
474
515
|
|
|
475
516
|
@classmethod
|
|
476
|
-
def _check_compute_credentials(
|
|
517
|
+
def _check_compute_credentials(
|
|
518
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
477
519
|
"""Checks if the user has access credentials to
|
|
478
520
|
this cloud's compute service."""
|
|
479
521
|
raise exceptions.NotSupportedError(
|
|
480
522
|
f'{cls._REPR} does not support {CloudCapability.COMPUTE.value}.')
|
|
481
523
|
|
|
482
524
|
@classmethod
|
|
483
|
-
def _check_storage_credentials(
|
|
525
|
+
def _check_storage_credentials(
|
|
526
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
484
527
|
"""Checks if the user has access credentials to
|
|
485
528
|
this cloud's storage service."""
|
|
486
529
|
raise exceptions.NotSupportedError(
|
|
487
530
|
f'{cls._REPR} does not support {CloudCapability.STORAGE.value}.')
|
|
488
531
|
|
|
532
|
+
@classmethod
|
|
533
|
+
def expand_infras(cls) -> List[str]:
|
|
534
|
+
"""Returns a list of enabled infrastructures for this cloud.
|
|
535
|
+
|
|
536
|
+
For Kubernetes and SSH, return a list of resource pools.
|
|
537
|
+
For all other clouds, return a list containing only the cloud's canonical name.
|
|
538
|
+
"""
|
|
539
|
+
return [cls.canonical_name()]
|
|
540
|
+
|
|
489
541
|
# TODO(zhwu): Make the return type immutable.
|
|
490
542
|
@classmethod
|
|
491
543
|
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
|
@@ -607,13 +659,13 @@ class Cloud:
|
|
|
607
659
|
Raises:
|
|
608
660
|
ValueError: If region or zone is invalid or not supported.
|
|
609
661
|
"""
|
|
610
|
-
return
|
|
611
|
-
|
|
612
|
-
|
|
662
|
+
return catalog.validate_region_zone(region,
|
|
663
|
+
zone,
|
|
664
|
+
clouds=self._REPR.lower())
|
|
613
665
|
|
|
614
666
|
def need_cleanup_after_preemption_or_failure(
|
|
615
667
|
self, resources: 'resources_lib.Resources') -> bool:
|
|
616
|
-
"""Whether a resource needs cleanup after
|
|
668
|
+
"""Whether a resource needs cleanup after preemption or failure.
|
|
617
669
|
|
|
618
670
|
In most cases, spot resources do not need cleanup after preemption,
|
|
619
671
|
as long as the cluster can be relaunched with the same name and tag,
|
|
@@ -627,8 +679,11 @@ class Cloud:
|
|
|
627
679
|
|
|
628
680
|
@classmethod
|
|
629
681
|
def check_features_are_supported(
|
|
630
|
-
|
|
631
|
-
|
|
682
|
+
cls,
|
|
683
|
+
resources: 'resources_lib.Resources',
|
|
684
|
+
requested_features: Set[CloudImplementationFeatures],
|
|
685
|
+
region: Optional[str] = None,
|
|
686
|
+
) -> None:
|
|
632
687
|
"""Errors out if the cloud does not support all requested features.
|
|
633
688
|
|
|
634
689
|
For instance, Lambda Cloud does not support stop, so
|
|
@@ -646,11 +701,14 @@ class Cloud:
|
|
|
646
701
|
requested features.
|
|
647
702
|
"""
|
|
648
703
|
unsupported_features2reason = cls._unsupported_features_for_resources(
|
|
649
|
-
resources)
|
|
704
|
+
resources, region)
|
|
650
705
|
|
|
651
706
|
# Docker image is not compatible with ssh proxy command.
|
|
652
|
-
if skypilot_config.
|
|
653
|
-
|
|
707
|
+
if skypilot_config.get_effective_region_config(
|
|
708
|
+
cloud=str(cls).lower(),
|
|
709
|
+
region=None,
|
|
710
|
+
keys=('ssh_proxy_command',),
|
|
711
|
+
default_value=None) is not None:
|
|
654
712
|
unsupported_features2reason.update({
|
|
655
713
|
CloudImplementationFeatures.DOCKER_IMAGE: (
|
|
656
714
|
f'Docker image is currently not supported on {cls._REPR} '
|
|
@@ -673,7 +731,9 @@ class Cloud:
|
|
|
673
731
|
|
|
674
732
|
@classmethod
|
|
675
733
|
def _unsupported_features_for_resources(
|
|
676
|
-
cls,
|
|
734
|
+
cls,
|
|
735
|
+
resources: 'resources_lib.Resources',
|
|
736
|
+
region: Optional[str] = None,
|
|
677
737
|
) -> Dict[CloudImplementationFeatures, str]:
|
|
678
738
|
"""The features not supported based on the resources provided.
|
|
679
739
|
|
|
@@ -684,7 +744,7 @@ class Cloud:
|
|
|
684
744
|
A dict of {feature: reason} for the features not supported by the
|
|
685
745
|
cloud implementation.
|
|
686
746
|
"""
|
|
687
|
-
del resources
|
|
747
|
+
del resources, region
|
|
688
748
|
raise NotImplementedError
|
|
689
749
|
|
|
690
750
|
@classmethod
|
|
@@ -701,6 +761,26 @@ class Cloud:
|
|
|
701
761
|
raise exceptions.NotSupportedError(
|
|
702
762
|
f'{disk_tier} is not supported by {cls._REPR}.')
|
|
703
763
|
|
|
764
|
+
@classmethod
|
|
765
|
+
def check_network_tier_enabled(
|
|
766
|
+
cls, instance_type: Optional[str],
|
|
767
|
+
network_tier: resources_utils.NetworkTier) -> None:
|
|
768
|
+
"""Errors out if the network tier is not supported by the
|
|
769
|
+
cloud provider.
|
|
770
|
+
|
|
771
|
+
For BEST tier: always succeeds, will use best available tier.
|
|
772
|
+
|
|
773
|
+
Raises:
|
|
774
|
+
exceptions.NotSupportedError: If the network tier is not supported.
|
|
775
|
+
"""
|
|
776
|
+
del instance_type # unused
|
|
777
|
+
|
|
778
|
+
# Check whether the requested tier is in this cloud's supported set
|
|
779
|
+
if network_tier not in cls._SUPPORTED_NETWORK_TIERS:
|
|
780
|
+
with ux_utils.print_exception_no_traceback():
|
|
781
|
+
raise exceptions.NotSupportedError(
|
|
782
|
+
f'{network_tier} is not supported by {cls._REPR}.')
|
|
783
|
+
|
|
704
784
|
@classmethod
|
|
705
785
|
def _translate_disk_tier(
|
|
706
786
|
cls, disk_tier: Optional[resources_utils.DiskTier]
|
|
@@ -721,7 +801,7 @@ class Cloud:
|
|
|
721
801
|
Raises:
|
|
722
802
|
ResourcesMismatchError: If the accelerator is not supported.
|
|
723
803
|
"""
|
|
724
|
-
|
|
804
|
+
resources = resources.assert_launchable()
|
|
725
805
|
|
|
726
806
|
def _equal_accelerators(
|
|
727
807
|
acc_requested: Optional[Dict[str, Union[int, float]]],
|
|
@@ -738,12 +818,21 @@ class Cloud:
|
|
|
738
818
|
if acc_from_instance_type is None:
|
|
739
819
|
return False
|
|
740
820
|
|
|
741
|
-
for
|
|
742
|
-
|
|
821
|
+
for requested_acc in acc_requested:
|
|
822
|
+
for instance_acc in acc_from_instance_type:
|
|
823
|
+
# The requested accelerator can be canonicalized based on
|
|
824
|
+
# the accelerator registry, which may not have the same case
|
|
825
|
+
# as the cloud's catalog, e.g., 'RTXPro6000' in Shadeform
|
|
826
|
+
# catalog, and 'RTXPRO6000' in RunPod catalog.
|
|
827
|
+
if requested_acc.lower() == instance_acc.lower():
|
|
828
|
+
# Found the requested accelerator in the instance type.
|
|
829
|
+
break
|
|
830
|
+
else:
|
|
831
|
+
# Requested accelerator not found in instance type.
|
|
743
832
|
return False
|
|
744
833
|
# Avoid float point precision issue.
|
|
745
|
-
if not math.isclose(acc_requested[
|
|
746
|
-
acc_from_instance_type[
|
|
834
|
+
if not math.isclose(acc_requested[requested_acc],
|
|
835
|
+
acc_from_instance_type[instance_acc]):
|
|
747
836
|
return False
|
|
748
837
|
return True
|
|
749
838
|
|
|
@@ -877,6 +966,11 @@ class Cloud:
|
|
|
877
966
|
def canonical_name(cls) -> str:
|
|
878
967
|
return cls.__name__.lower()
|
|
879
968
|
|
|
969
|
+
@classmethod
|
|
970
|
+
def display_name(cls) -> str:
|
|
971
|
+
"""Name of the cloud used in messages displayed to the user."""
|
|
972
|
+
return cls.canonical_name()
|
|
973
|
+
|
|
880
974
|
def __repr__(self):
|
|
881
975
|
return self._REPR
|
|
882
976
|
|
|
@@ -887,6 +981,12 @@ class Cloud:
|
|
|
887
981
|
return state
|
|
888
982
|
|
|
889
983
|
|
|
984
|
+
class DummyCloud(Cloud):
|
|
985
|
+
"""A dummy Cloud that has zero egress cost from/to for optimization
|
|
986
|
+
purpose."""
|
|
987
|
+
pass
|
|
988
|
+
|
|
989
|
+
|
|
890
990
|
# === Helper functions ===
|
|
891
991
|
def cloud_in_iterable(cloud: Cloud, cloud_list: Iterable[Cloud]) -> bool:
|
|
892
992
|
"""Returns whether the cloud is in the given cloud list."""
|
sky/clouds/cudo.py
CHANGED
|
@@ -3,8 +3,9 @@ import subprocess
|
|
|
3
3
|
import typing
|
|
4
4
|
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
|
5
5
|
|
|
6
|
+
from sky import catalog
|
|
6
7
|
from sky import clouds
|
|
7
|
-
from sky.
|
|
8
|
+
from sky.adaptors import common
|
|
8
9
|
from sky.utils import common_utils
|
|
9
10
|
from sky.utils import registry
|
|
10
11
|
from sky.utils import resources_utils
|
|
@@ -12,6 +13,7 @@ from sky.utils import resources_utils
|
|
|
12
13
|
if typing.TYPE_CHECKING:
|
|
13
14
|
# Renaming to avoid shadowing variables.
|
|
14
15
|
from sky import resources as resources_lib
|
|
16
|
+
from sky.utils import volume as volume_lib
|
|
15
17
|
|
|
16
18
|
_CREDENTIAL_FILES = [
|
|
17
19
|
# credential files for Cudo,
|
|
@@ -59,6 +61,8 @@ class Cudo(clouds.Cloud):
|
|
|
59
61
|
('Spot is not supported, as Cudo API does not implement spot.'),
|
|
60
62
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
|
61
63
|
('Custom disk tier is currently not supported on Cudo Compute'),
|
|
64
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
|
65
|
+
('Custom network tier is currently not supported on Cudo Compute'),
|
|
62
66
|
clouds.CloudImplementationFeatures.IMAGE_ID:
|
|
63
67
|
('Image ID is currently not supported on Cudo. '),
|
|
64
68
|
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
|
|
@@ -70,6 +74,9 @@ class Cudo(clouds.Cloud):
|
|
|
70
74
|
),
|
|
71
75
|
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
|
72
76
|
('High availability controllers are not supported on Cudo.'),
|
|
77
|
+
clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
|
|
78
|
+
('Customized multiple network interfaces are not supported on Cudo.'
|
|
79
|
+
),
|
|
73
80
|
}
|
|
74
81
|
_MAX_CLUSTER_NAME_LEN_LIMIT = 60
|
|
75
82
|
|
|
@@ -80,7 +87,9 @@ class Cudo(clouds.Cloud):
|
|
|
80
87
|
|
|
81
88
|
@classmethod
|
|
82
89
|
def _unsupported_features_for_resources(
|
|
83
|
-
cls,
|
|
90
|
+
cls,
|
|
91
|
+
resources: 'resources_lib.Resources',
|
|
92
|
+
region: Optional[str] = None,
|
|
84
93
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
85
94
|
"""The features not supported based on the resources provided.
|
|
86
95
|
|
|
@@ -99,16 +108,21 @@ class Cudo(clouds.Cloud):
|
|
|
99
108
|
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
|
100
109
|
|
|
101
110
|
@classmethod
|
|
102
|
-
def regions_with_offering(
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
111
|
+
def regions_with_offering(
|
|
112
|
+
cls,
|
|
113
|
+
instance_type,
|
|
114
|
+
accelerators: Optional[Dict[str, int]],
|
|
115
|
+
use_spot: bool,
|
|
116
|
+
region: Optional[str],
|
|
117
|
+
zone: Optional[str],
|
|
118
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
119
|
+
) -> List[clouds.Region]:
|
|
106
120
|
assert zone is None, 'Cudo does not support zones.'
|
|
107
121
|
del accelerators, zone # unused
|
|
108
122
|
if use_spot:
|
|
109
123
|
return []
|
|
110
124
|
|
|
111
|
-
regions =
|
|
125
|
+
regions = catalog.get_region_zones_for_instance_type(
|
|
112
126
|
instance_type, use_spot, 'cudo')
|
|
113
127
|
|
|
114
128
|
if region is not None:
|
|
@@ -121,8 +135,8 @@ class Cudo(clouds.Cloud):
|
|
|
121
135
|
instance_type: str,
|
|
122
136
|
) -> Tuple[Optional[float], Optional[float]]:
|
|
123
137
|
|
|
124
|
-
return
|
|
125
|
-
|
|
138
|
+
return catalog.get_vcpus_mem_from_instance_type(instance_type,
|
|
139
|
+
clouds='cudo')
|
|
126
140
|
|
|
127
141
|
@classmethod
|
|
128
142
|
def zones_provision_loop(
|
|
@@ -149,11 +163,11 @@ class Cudo(clouds.Cloud):
|
|
|
149
163
|
use_spot: bool,
|
|
150
164
|
region: Optional[str] = None,
|
|
151
165
|
zone: Optional[str] = None) -> float:
|
|
152
|
-
return
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
166
|
+
return catalog.get_hourly_cost(instance_type,
|
|
167
|
+
use_spot=use_spot,
|
|
168
|
+
region=region,
|
|
169
|
+
zone=zone,
|
|
170
|
+
clouds='cudo')
|
|
157
171
|
|
|
158
172
|
def accelerators_to_hourly_cost(self,
|
|
159
173
|
accelerators: Dict[str, int],
|
|
@@ -169,23 +183,27 @@ class Cudo(clouds.Cloud):
|
|
|
169
183
|
return 0.0
|
|
170
184
|
|
|
171
185
|
@classmethod
|
|
172
|
-
def get_default_instance_type(
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
186
|
+
def get_default_instance_type(cls,
|
|
187
|
+
cpus: Optional[str] = None,
|
|
188
|
+
memory: Optional[str] = None,
|
|
189
|
+
disk_tier: Optional[
|
|
190
|
+
resources_utils.DiskTier] = None,
|
|
191
|
+
region: Optional[str] = None,
|
|
192
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
193
|
+
return catalog.get_default_instance_type(cpus=cpus,
|
|
194
|
+
memory=memory,
|
|
195
|
+
disk_tier=disk_tier,
|
|
196
|
+
region=region,
|
|
197
|
+
zone=zone,
|
|
198
|
+
clouds='cudo')
|
|
181
199
|
|
|
182
200
|
@classmethod
|
|
183
201
|
def get_accelerators_from_instance_type(
|
|
184
202
|
cls,
|
|
185
203
|
instance_type: str,
|
|
186
204
|
) -> Optional[Dict[str, Union[int, float]]]:
|
|
187
|
-
return
|
|
188
|
-
|
|
205
|
+
return catalog.get_accelerators_from_instance_type(instance_type,
|
|
206
|
+
clouds='cudo')
|
|
189
207
|
|
|
190
208
|
@classmethod
|
|
191
209
|
def get_zone_shell_cmd(cls) -> Optional[str]:
|
|
@@ -199,10 +217,12 @@ class Cudo(clouds.Cloud):
|
|
|
199
217
|
zones: Optional[List['clouds.Zone']],
|
|
200
218
|
num_nodes: int,
|
|
201
219
|
dryrun: bool = False,
|
|
220
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
|
202
221
|
) -> Dict[str, Optional[str]]:
|
|
203
222
|
del zones, cluster_name # unused
|
|
204
|
-
|
|
205
|
-
acc_dict = self.get_accelerators_from_instance_type(
|
|
223
|
+
resources = resources.assert_launchable()
|
|
224
|
+
acc_dict = self.get_accelerators_from_instance_type(
|
|
225
|
+
resources.instance_type)
|
|
206
226
|
custom_resources = resources_utils.make_ray_custom_resources_str(
|
|
207
227
|
acc_dict)
|
|
208
228
|
|
|
@@ -243,7 +263,9 @@ class Cudo(clouds.Cloud):
|
|
|
243
263
|
default_instance_type = Cudo.get_default_instance_type(
|
|
244
264
|
cpus=resources.cpus,
|
|
245
265
|
memory=resources.memory,
|
|
246
|
-
disk_tier=resources.disk_tier
|
|
266
|
+
disk_tier=resources.disk_tier,
|
|
267
|
+
region=resources.region,
|
|
268
|
+
zone=resources.zone)
|
|
247
269
|
if default_instance_type is None:
|
|
248
270
|
return resources_utils.FeasibleResources([], [], None)
|
|
249
271
|
else:
|
|
@@ -252,16 +274,16 @@ class Cudo(clouds.Cloud):
|
|
|
252
274
|
|
|
253
275
|
assert len(accelerators) == 1, resources
|
|
254
276
|
acc, acc_count = list(accelerators.items())[0]
|
|
255
|
-
(instance_list,
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
277
|
+
(instance_list,
|
|
278
|
+
fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
|
|
279
|
+
acc,
|
|
280
|
+
acc_count,
|
|
281
|
+
use_spot=resources.use_spot,
|
|
282
|
+
cpus=resources.cpus,
|
|
283
|
+
memory=resources.memory,
|
|
284
|
+
region=resources.region,
|
|
285
|
+
zone=resources.zone,
|
|
286
|
+
clouds='cudo')
|
|
265
287
|
if instance_list is None:
|
|
266
288
|
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
|
267
289
|
None)
|
|
@@ -269,17 +291,13 @@ class Cudo(clouds.Cloud):
|
|
|
269
291
|
fuzzy_candidate_list, None)
|
|
270
292
|
|
|
271
293
|
@classmethod
|
|
272
|
-
def _check_compute_credentials(
|
|
294
|
+
def _check_compute_credentials(
|
|
295
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
273
296
|
"""Checks if the user has access credentials to
|
|
274
297
|
Cudo's compute service."""
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
except (ImportError, subprocess.CalledProcessError) as e:
|
|
279
|
-
return False, (
|
|
280
|
-
f'{cls._DEPENDENCY_HINT}\n'
|
|
281
|
-
f'{cls._INDENT_PREFIX}'
|
|
282
|
-
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
298
|
+
if not common.can_import_modules(['cudo_compute']):
|
|
299
|
+
return False, (f'{cls._DEPENDENCY_HINT}\n'
|
|
300
|
+
f'{cls._INDENT_PREFIX}')
|
|
283
301
|
|
|
284
302
|
try:
|
|
285
303
|
_run_output('cudoctl --version')
|
|
@@ -292,7 +310,7 @@ class Cudo(clouds.Cloud):
|
|
|
292
310
|
from cudo_compute import cudo_api
|
|
293
311
|
from cudo_compute.rest import ApiException
|
|
294
312
|
try:
|
|
295
|
-
_, error = cudo_api.
|
|
313
|
+
_, error = cudo_api.make_client()
|
|
296
314
|
except FileNotFoundError as e:
|
|
297
315
|
return False, (
|
|
298
316
|
'Cudo credentials are not set. '
|
|
@@ -334,7 +352,7 @@ class Cudo(clouds.Cloud):
|
|
|
334
352
|
return None
|
|
335
353
|
|
|
336
354
|
def instance_type_exists(self, instance_type: str) -> bool:
|
|
337
|
-
return
|
|
355
|
+
return catalog.instance_type_exists(instance_type, 'cudo')
|
|
338
356
|
|
|
339
357
|
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
|
|
340
|
-
return
|
|
358
|
+
return catalog.validate_region_zone(region, zone, clouds='cudo')
|