skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/clouds/shadeform.py
ADDED
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
""" Shadeform Cloud. """
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import typing
|
|
6
|
+
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
|
7
|
+
|
|
8
|
+
from sky import catalog
|
|
9
|
+
from sky import clouds
|
|
10
|
+
from sky.adaptors import common as adaptors_common
|
|
11
|
+
from sky.catalog import shadeform_catalog
|
|
12
|
+
from sky.utils import registry
|
|
13
|
+
from sky.utils import resources_utils
|
|
14
|
+
from sky.utils import status_lib
|
|
15
|
+
|
|
16
|
+
if typing.TYPE_CHECKING:
|
|
17
|
+
from sky import resources as resources_lib
|
|
18
|
+
from sky.utils import volume as volume_lib
|
|
19
|
+
else:
|
|
20
|
+
requests = adaptors_common.LazyImport('requests')
|
|
21
|
+
|
|
22
|
+
# Minimum set of files under ~/.shadeform that grant Shadeform access.
|
|
23
|
+
_CREDENTIAL_FILES = [
|
|
24
|
+
'api_key',
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@registry.CLOUD_REGISTRY.register
|
|
29
|
+
class Shadeform(clouds.Cloud):
|
|
30
|
+
"""Shadeform GPU Cloud
|
|
31
|
+
|
|
32
|
+
Shadeform is a unified API for deploying and managing cloud GPUs across
|
|
33
|
+
multiple cloud providers.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
# Shadeform doesn't have explicit cluster name limits, but be conservative.
|
|
37
|
+
_MAX_CLUSTER_NAME_LEN_LIMIT = 120
|
|
38
|
+
|
|
39
|
+
# Features not currently supported by Shadeform
|
|
40
|
+
# yapf: disable
|
|
41
|
+
_CLOUD_UNSUPPORTED_FEATURES = {
|
|
42
|
+
clouds.CloudImplementationFeatures.STOP:
|
|
43
|
+
'Stopping instances not supported on Shadeform.',
|
|
44
|
+
clouds.CloudImplementationFeatures.MULTI_NODE:
|
|
45
|
+
'Multi-node clusters not supported on Shadeform.',
|
|
46
|
+
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
|
|
47
|
+
'Spot instances not supported on Shadeform.',
|
|
48
|
+
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
|
49
|
+
'Custom disk tiers not supported on Shadeform.',
|
|
50
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
|
51
|
+
'Custom network tiers not supported on Shadeform.',
|
|
52
|
+
clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
|
|
53
|
+
'Object storage mounting not supported on Shadeform.',
|
|
54
|
+
clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
|
|
55
|
+
'Host controllers not supported on Shadeform.',
|
|
56
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
|
57
|
+
'High availability controllers not supported.',
|
|
58
|
+
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
|
59
|
+
'Disk cloning not supported on Shadeform.',
|
|
60
|
+
clouds.CloudImplementationFeatures.IMAGE_ID:
|
|
61
|
+
'Custom image IDs not supported on Shadeform.',
|
|
62
|
+
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
|
|
63
|
+
'Docker images not supported on Shadeform yet.',
|
|
64
|
+
clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
|
|
65
|
+
'Custom multiple network interfaces not supported.',
|
|
66
|
+
}
|
|
67
|
+
# yapf: enable
|
|
68
|
+
|
|
69
|
+
_regions: List[clouds.Region] = []
|
|
70
|
+
|
|
71
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
|
72
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
|
73
|
+
OPEN_PORTS_VERSION = clouds.OpenPortsVersion.LAUNCH_ONLY
|
|
74
|
+
|
|
75
|
+
@classmethod
def _unsupported_features_for_resources(
    cls,
    resources: 'resources_lib.Resources',
    region: Optional[str] = None,
) -> Dict[clouds.CloudImplementationFeatures, str]:
    """Return the features Shadeform cannot provide, with reasons.

    The answer is the same for every request: neither ``resources`` nor
    ``region`` is consulted.

    Args:
        resources: The requested resources (ignored).
        region: The requested region (ignored).

    Returns:
        A new dict mapping each unsupported feature to a human-readable
        explanation.
    """
    del resources, region  # unused
    # Hand out a shallow copy so that a caller adding or removing entries
    # cannot alter the class-level table shared by every later call.
    return dict(cls._CLOUD_UNSUPPORTED_FEATURES)
|
|
84
|
+
|
|
85
|
+
@classmethod
def _max_cluster_name_length(cls) -> Optional[int]:
    """Return the longest cluster name accepted for Shadeform.

    Returns:
        The value of the class attribute ``_MAX_CLUSTER_NAME_LEN_LIMIT``.
    """
    name_length_cap = cls._MAX_CLUSTER_NAME_LEN_LIMIT
    return name_length_cap
|
|
88
|
+
|
|
89
|
+
@classmethod
def regions_with_offering(
    cls,
    instance_type: str,
    accelerators: Optional[Dict[str, int]],
    use_spot: bool,
    region: Optional[str],
    zone: Optional[str],
    resources: Optional['resources_lib.Resources'] = None,
) -> List[clouds.Region]:
    """List the regions in which ``instance_type`` is offered.

    Args:
        instance_type: The exact Shadeform instance type (for example
            'massedcompute_A6000_base'), not an accelerator name.
        accelerators: Not consulted by this method.
        use_spot: Whether spot capacity is requested. Any truthy value
            yields an empty result.
        region: If given, only the region with this name is kept.
        zone: Must be None.
        resources: Not consulted by this method.

    Returns:
        The regions reported by the Shadeform catalog for the instance
        type, optionally narrowed to ``region``; empty for spot requests.
    """
    assert zone is None, 'Shadeform does not support zones.'
    if use_spot:
        # No spot support, so no region can satisfy the request.
        return []

    # Only regions where this exact instance type exists are returned.
    offered = shadeform_catalog.get_region_zones_for_instance_type(
        instance_type, use_spot)
    if region is None:
        return offered
    return [candidate for candidate in offered if candidate.name == region]
|
|
114
|
+
|
|
115
|
+
@classmethod
def zones_provision_loop(
    cls,
    *,
    region: str,
    num_nodes: int,
    instance_type: str,
    accelerators: Optional[Dict[str, int]] = None,
    use_spot: bool = False,
) -> Iterator[None]:
    """Yield one provisioning attempt per region offering the instance.

    Args:
        region: Name of the region to provision in.
        num_nodes: Not consulted by this method.
        instance_type: The instance type to look up.
        accelerators: Forwarded to ``regions_with_offering``.
        use_spot: Whether spot capacity is requested. Any truthy value
            makes the generator finish without yielding.

    Yields:
        The ``zones`` attribute of each matching region, which is
        asserted to be None.
    """
    if not use_spot:
        matching_regions = cls.regions_with_offering(
            instance_type=instance_type,
            accelerators=accelerators,
            use_spot=use_spot,
            region=region,
            zone=None,
        )
        for offered_region in matching_regions:
            assert offered_region.zones is None, offered_region
            yield offered_region.zones
|
|
135
|
+
|
|
136
|
+
@classmethod
def get_vcpus_mem_from_instance_type(
    cls,
    instance_type: str,
) -> Tuple[Optional[float], Optional[float]]:
    """Look up the vCPU count and memory of an instance type.

    Args:
        instance_type: The instance type to look up.

    Returns:
        Whatever the catalog reports for 'shadeform', annotated as a
        (vcpus, memory) pair whose members may each be None.
    """
    vcpus_and_mem = catalog.get_vcpus_mem_from_instance_type(
        instance_type, clouds='shadeform')
    return vcpus_and_mem
|
|
144
|
+
|
|
145
|
+
@classmethod
def get_accelerators_from_instance_type(
    cls,
    instance_type: str,
) -> Optional[Dict[str, Union[int, float]]]:
    """Look up the accelerators attached to an instance type.

    Args:
        instance_type: The instance type to look up.

    Returns:
        Whatever the catalog reports for 'shadeform', annotated as an
        optional mapping from accelerator name to count.
    """
    accelerator_counts = catalog.get_accelerators_from_instance_type(
        instance_type, clouds='shadeform')
    return accelerator_counts
|
|
153
|
+
|
|
154
|
+
@classmethod
def get_default_instance_type(
    cls,
    cpus: Optional[str] = None,
    memory: Optional[str] = None,
    disk_tier: Optional[resources_utils.DiskTier] = None,
    region: Optional[str] = None,
    zone: Optional[str] = None,
) -> Optional[str]:
    """Pick the catalog's default instance type for the given constraints.

    Args:
        cpus: Requested CPU constraint, forwarded to the catalog.
        memory: Requested memory constraint, forwarded to the catalog.
        disk_tier: Not supported; the catalog is always queried with
            ``disk_tier=None`` regardless of this value.
        region: Region constraint, forwarded to the catalog.
        zone: Zone constraint, forwarded to the catalog.

    Returns:
        The catalog's answer for 'shadeform', annotated as an optional
        instance type name.
    """
    default_type = catalog.get_default_instance_type(
        cpus=cpus,
        memory=memory,
        disk_tier=None,  # Caller-supplied disk_tier is deliberately dropped.
        region=region,
        zone=zone,
        clouds='shadeform',
    )
    return default_type
|
|
171
|
+
|
|
172
|
+
@classmethod
def get_zone_shell_cmd(cls) -> Optional[str]:
    """Return the shell command that reports the instance's zone.

    Returns:
        Always None; no such command is provided here.
    """
    zone_command: Optional[str] = None
    return zone_command
|
|
176
|
+
|
|
177
|
+
@classmethod
def get_user_identities(cls) -> Optional[List[List[str]]]:
    """Return the user identities known for Shadeform.

    Returns:
        Always None; user identities are not tracked for this cloud.
    """
    # No user identity support needed
    identities: Optional[List[List[str]]] = None
    return identities
|
|
182
|
+
|
|
183
|
+
def instance_type_exists(self, instance_type: str) -> bool:
    """Report whether the 'shadeform' catalog knows ``instance_type``."""
    is_known = catalog.instance_type_exists(instance_type, 'shadeform')
    return is_known
|
|
185
|
+
|
|
186
|
+
def instance_type_to_hourly_cost(self,
                                 instance_type: str,
                                 use_spot: bool,
                                 region: Optional[str] = None,
                                 zone: Optional[str] = None) -> float:
    """Return the hourly price of an instance type from the catalog.

    Args:
        instance_type: Name of the instance type to price.
        use_spot: Must be falsy; spot pricing is rejected.
        region: Optional region constraint, forwarded to the catalog.
        zone: Optional zone constraint, forwarded to the catalog.

    Returns:
        The hourly cost reported by the 'shadeform' catalog.

    Raises:
        ValueError: If ``use_spot`` is truthy.
    """
    if not use_spot:
        hourly_cost = catalog.get_hourly_cost(instance_type,
                                              use_spot=use_spot,
                                              region=region,
                                              zone=zone,
                                              clouds='shadeform')
        return hourly_cost
    raise ValueError('Spot instances are not supported on Shadeform')
|
|
199
|
+
|
|
200
|
+
def accelerators_to_hourly_cost(self,
                                accelerators: Dict[str, int],
                                use_spot: bool,
                                region: Optional[str] = None,
                                zone: Optional[str] = None) -> float:
    """Return the hourly cost attributed to accelerators alone.

    Every argument is ignored and the result is always zero.

    Returns:
        0.0
    """
    del accelerators, use_spot, region, zone  # none affect the result
    no_extra_cost = 0.0
    return no_extra_cost
|
|
207
|
+
|
|
208
|
+
def get_egress_cost(self, num_gigabytes: float) -> float:
    """Return the egress cost for ``num_gigabytes`` of traffic.

    Returns:
        Always 0.0, regardless of the amount transferred.
    """
    # No explicit egress pricing from Shadeform API
    del num_gigabytes
    egress_cost = 0.0
    return egress_cost
|
|
212
|
+
|
|
213
|
+
def __repr__(self) -> str:
    """Return the fixed display name of this cloud."""
    display = 'Shadeform'
    return display
|
|
215
|
+
|
|
216
|
+
@classmethod
def get_current_user_identity(cls) -> Optional[str]:
    """Return the identity of the currently active user.

    Returns:
        Always None; no identity is resolved here.
    """
    current_identity: Optional[str] = None
    return current_identity
|
|
220
|
+
|
|
221
|
+
def make_deploy_resources_variables(
    self,
    resources: 'resources_lib.Resources',
    cluster_name: resources_utils.ClusterName,
    region: 'clouds.Region',
    zones: Optional[List['clouds.Zone']],
    num_nodes: int,
    dryrun: bool = False,
    volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
) -> Dict[str, Any]:
    """Build the variable mapping consumed by the deployment template.

    Args:
        resources: Requested resources; read for the instance type and the
            accelerators.
        cluster_name: Not read by this method.
        region: Region whose ``name`` is written to the result.
        zones: Ignored.
        num_nodes: Ignored.
        dryrun: Ignored.
        volume_mounts: Ignored.

    Returns:
        A dict that may contain:
          * 'instance_type', 'region', 'cloud' - only when an instance type
            was resolved; 'cloud' is the text before the first '_' of the
            instance type (presumably the underlying provider - confirm).
          * 'custom_resources' - compact JSON of the requested
            accelerators, only when accelerators were requested.
    """
    del zones, num_nodes, dryrun, volume_mounts  # unused for Shadeform

    # Resolve the instance type from a copy with the accelerators removed.
    accelerator_free = resources.copy(accelerators=None)
    feasible = self._get_feasible_launchable_resources(accelerator_free)
    instance_type = feasible.resources_list[0].instance_type

    deploy_vars = {}
    if instance_type is not None:
        provider_prefix = instance_type.split('_')[0]
        deploy_vars['instance_type'] = instance_type
        deploy_vars['region'] = region.name
        deploy_vars['cloud'] = provider_prefix

    # Add accelerator resources for Ray
    requested_accelerators = resources.accelerators
    if requested_accelerators is not None:
        deploy_vars['custom_resources'] = json.dumps(
            requested_accelerators, separators=(',', ':'))

    return deploy_vars
|
|
257
|
+
|
|
258
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
    """Map each credential file to itself for mounting.

    Returns:
        A dict with one entry per name in ``_CREDENTIAL_FILES``; key and
        value are both ``~/.shadeform/<name>``.
    """
    mounts: Dict[str, str] = {}
    for filename in _CREDENTIAL_FILES:
        path = f'~/.shadeform/{filename}'
        # Same path on both sides of the mapping.
        mounts[path] = path
    return mounts
|
|
263
|
+
|
|
264
|
+
@classmethod
def get_current_user_identity_str(cls) -> Optional[str]:
    """Return the current user identity as a display string.

    Returns:
        Always None; no identity is resolved here.
    """
    identity_text: Optional[str] = None
    return identity_text
|
|
268
|
+
|
|
269
|
+
@classmethod
def check_credentials(
    cls, cloud_capability: clouds.CloudCapability
) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
    """Verify that a non-empty Shadeform API key file is present.

    The key is read from ``~/.shadeform/api_key``. Only its presence and
    non-emptiness are checked; the key is not validated against any
    service.

    Args:
        cloud_capability: Ignored.

    Returns:
        ``(True, None)`` when the file exists and holds non-whitespace
        content; otherwise ``(False, reason)`` with a message describing
        what is missing or which OS error occurred.
    """
    del cloud_capability  # unused for Shadeform
    try:
        key_file = os.path.expanduser('~/.shadeform/api_key')
        if not os.path.exists(key_file):
            return False, ('Shadeform API key not found. '
                           f'Please save your API key to {key_file}')

        # Try to read the API key
        with open(key_file, 'r', encoding='utf-8') as handle:
            key_is_blank = not handle.read().strip()
    except OSError as err:  # IOError is an alias of OSError
        return False, f'Error checking Shadeform credentials: {str(err)}'

    if key_is_blank:
        return False, f'Shadeform API key is empty in {key_file}'
    return True, None
|
|
292
|
+
|
|
293
|
+
def _get_feasible_launchable_resources(
    self, resources: 'resources_lib.Resources'
) -> 'resources_utils.FeasibleResources':
    """Expand a resource request into concrete launchable candidates.

    Resolution order:
      1. Spot requests are rejected with a hint.
      2. If an instance type is already set, the request is returned as-is.
      3. If accelerators are requested, the catalog is asked for instance
         types providing the first requested accelerator; one candidate is
         produced per instance type.
      4. Otherwise a default instance type is chosen from cpus/memory/
         region/zone.

    Args:
        resources: The (possibly partial) resource request.

    Returns:
        A ``FeasibleResources`` built from (candidate resources, candidate
        instance type names, optional hint). The candidate list is empty
        when nothing matches.
    """
    if resources.use_spot:
        return resources_utils.FeasibleResources(
            [], [], 'Spot instances are not supported on Shadeform.')

    if resources.instance_type is not None:
        # Instance type is already specified, validate it
        assert resources.is_launchable(), resources
        return resources_utils.FeasibleResources([resources],
                                                 [resources.instance_type],
                                                 None)

    def _make_resources(instance_type_list):
        """Copy the request once per instance type, pinning it to each."""
        return [
            resources.copy(
                cloud=Shadeform(),
                instance_type=instance_type,
                # Keep original accelerators
                accelerators=resources.accelerators,
                cpus=None,
                memory=None,
            ) for instance_type in instance_type_list
        ]

    accelerators = resources.accelerators
    # An empty mapping is treated like "no accelerators" and falls through
    # to the default instance type. The previous code indexed the first key
    # of an empty dict here and raised IndexError.
    if accelerators:
        # Only the first entry is consulted. The previous loop returned on
        # its first iteration in every branch, so this matches its effect.
        accelerator_name, accelerator_count = next(iter(
            accelerators.items()))
        func = shadeform_catalog.get_instance_type_for_accelerator
        instance_types, errors = func(accelerator_name,
                                      accelerator_count,
                                      use_spot=resources.use_spot)

        if instance_types:
            # One resource per instance type, so each candidate carries its
            # own specific instance type.
            return resources_utils.FeasibleResources(
                _make_resources(instance_types), list(instance_types), None)

        error_msg = (f'No instances available for accelerator '
                     f'{accelerator_name}')
        if errors:
            error_msg += f': {"; ".join(errors)}'
        return resources_utils.FeasibleResources([], [], error_msg)

    # No accelerators specified, return a default instance type
    default_instance_type = Shadeform.get_default_instance_type(
        cpus=resources.cpus,
        memory=resources.memory,
        disk_tier=resources.disk_tier,
        region=resources.region,
        zone=resources.zone)
    if default_instance_type is None:
        # TODO: Add hints to all return values in this method to help
        # users understand why the resources are not launchable.
        return resources_utils.FeasibleResources([], [], None)
    return resources_utils.FeasibleResources(
        _make_resources([default_instance_type]), [], None)
|
|
383
|
+
|
|
384
|
+
@classmethod
def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
    """Run the credential check for the compute capability.

    Returns:
        The ``(success, message)`` pair from ``check_credentials``, with a
        dict message converted to its string form so the message is always
        a string or None.
    """
    ok, detail = cls.check_credentials(clouds.CloudCapability.COMPUTE)
    # Convert return type to match expected signature
    return ok, (str(detail) if isinstance(detail, dict) else detail)
|
|
392
|
+
|
|
393
|
+
@classmethod
def query_status(cls, name: str, tag_filters: Dict[str, str],
                 region: Optional[str], zone: Optional[str],
                 **kwargs) -> List[status_lib.ClusterStatus]:
    """Return the statuses of the named cluster's instances.

    All arguments are ignored. Per the original author's note, real status
    querying is handled by the provisioner, so this always reports no
    existing clusters.

    Returns:
        A new empty list.
    """
    del name, tag_filters, region, zone, kwargs  # intentionally unused
    statuses: List[status_lib.ClusterStatus] = []
    return statuses
|
sky/clouds/ssh.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""SSH Node Pools"""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import typing
|
|
5
|
+
from typing import Dict, List, Optional, Set, Tuple, Union
|
|
6
|
+
|
|
7
|
+
from sky import sky_logging
|
|
8
|
+
from sky import skypilot_config
|
|
9
|
+
from sky.adaptors import kubernetes as kubernetes_adaptor
|
|
10
|
+
from sky.clouds import kubernetes
|
|
11
|
+
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
12
|
+
from sky.utils import annotations
|
|
13
|
+
from sky.utils import common_utils
|
|
14
|
+
from sky.utils import registry
|
|
15
|
+
from sky.utils import yaml_utils
|
|
16
|
+
|
|
17
|
+
if typing.TYPE_CHECKING:
|
|
18
|
+
# Renaming to avoid shadowing variables.
|
|
19
|
+
from sky import resources as resources_lib
|
|
20
|
+
|
|
21
|
+
logger = sky_logging.init_logger(__name__)
|
|
22
|
+
|
|
23
|
+
SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@registry.CLOUD_REGISTRY.register()
class SSH(kubernetes.Kubernetes):
    """SSH cloud implementation.

    This is used by SSH Node Pools in SkyPilot, which use Kubernetes to manage
    the SSH clusters.

    This cloud is a thin wrapper around Kubernetes that only uses contexts
    starting with 'ssh-', which are managed through `sky ssh up` command.
    """

    _REPR = 'SSH'

    # Keep track of contexts that have been logged as unreachable
    logged_unreachable_contexts: Set[str] = set()

    def __repr__(self):
        return self._REPR

    @classmethod
    def _unsupported_features_for_resources(
        cls,
        resources: 'resources_lib.Resources',
        region: Optional[str] = None,
    ) -> Dict[kubernetes.clouds.CloudImplementationFeatures, str]:
        """Return the unsupported features, unchanged from Kubernetes."""
        # Inherit all Kubernetes unsupported features
        return super()._unsupported_features_for_resources(resources, region)

    @classmethod
    def get_ssh_node_pool_contexts(cls) -> List[str]:
        """Get context names from ssh_node_pools.yaml file.

        Reads the SSH node pools configuration file and returns
        a list of context names by prepending 'ssh-' to each Node Pool name.

        Returns:
            A list of SSH Kubernetes context names derived from the Node Pools
            in the SSH node pools file. Empty when the file is missing, empty,
            or cannot be read or parsed.
        """
        contexts = []

        if os.path.exists(SSH_NODE_POOLS_PATH):
            try:
                with open(SSH_NODE_POOLS_PATH, 'r', encoding='utf-8') as f:
                    ssh_config = yaml_utils.safe_load(f)
                    if ssh_config:
                        # Get cluster names and prepend 'ssh-' to match
                        # context naming convention
                        contexts = [
                            f'ssh-{cluster_name}'
                            for cluster_name in ssh_config.keys()
                        ]
            except Exception as e:  # pylint: disable=broad-except
                # Best effort: an unreadable file yields an empty list, but
                # the cause is kept visible at debug level.
                logger.debug(
                    f'Failed to read {SSH_NODE_POOLS_PATH}: {str(e)}')

        return contexts

    def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
        """Validate a (region, zone) pair for SSH Node Pools.

        A region here is a Kubernetes context name.

        Returns:
            The ``(region, zone)`` pair unchanged when it is acceptable.

        Raises:
            ValueError: If ``region`` is not an existing allowed context, or
                if ``zone`` is set.
        """
        if region == kubernetes_adaptor.in_cluster_context_name():
            # If running incluster, we set region to IN_CLUSTER_REGION
            # since there is no context name available.
            return region, zone

        all_contexts = self.existing_allowed_contexts()

        if region is not None and region not in all_contexts:
            # Show pool names without the internal 'ssh-' prefix.
            region_name = common_utils.removeprefix(region, 'ssh-')
            available_contexts = [
                common_utils.removeprefix(c, 'ssh-') for c in all_contexts
            ]
            err_str = (f'SSH Node Pool {region_name!r} is not set up. '
                       'Run `sky check` for more details. ')
            if available_contexts:
                err_str += f'Available node pools: {available_contexts}'
            raise ValueError(err_str)
        if zone is not None:
            raise ValueError('SSH Node Pools do not support setting zone.')
        return region, zone

    @classmethod
    @annotations.lru_cache(scope='global', maxsize=1)
    def _ssh_log_skipped_contexts_once(
            cls, skipped_contexts: Tuple[str, ...]) -> None:
        """Log skipped contexts for only once.

        We don't directly cache the result of _filter_existing_allowed_contexts
        as the admin policy may update the allowed contexts.
        """
        if skipped_contexts:
            count = len(set(skipped_contexts))
            is_singular = count == 1
            logger.warning(
                f'SSH Node {("Pool" if is_singular else "Pools")} '
                f'{set(skipped_contexts)!r} specified in '
                f'{SSH_NODE_POOLS_PATH} {("has" if is_singular else "have")} '
                'not been set up. Skipping '
                f'{("that pool" if is_singular else "those pools")}. '
                'Run `sky ssh up` to set up.')

    @classmethod
    def existing_allowed_contexts(cls, silent: bool = False) -> List[str]:
        """Get existing allowed contexts that start with 'ssh-'.

        Override the Kubernetes implementation to only return contexts that
        start with 'ssh-', which are created by `sky ssh up`.

        Returns contexts based on clusters defined in ~/.sky/ssh_node_pools.yaml

        Args:
            silent: When True, do not warn about node pools that are listed
                in the node pools file but have no matching context.
        """
        # Get all contexts from the Kubernetes implementation
        all_contexts = kubernetes_utils.get_all_kube_context_names()
        if not all_contexts:
            return []

        all_contexts = set(all_contexts)

        # Workspace-level allowed_node_pools should take precedence over
        # the global allowed_node_pools.
        allowed_node_pools = skypilot_config.get_workspace_cloud('ssh').get(
            'allowed_node_pools', None)
        if allowed_node_pools is None:
            allowed_node_pools = skypilot_config.get_effective_region_config(
                cloud='ssh',
                region=None,
                keys=('allowed_node_pools',),
                default_value=None)

        # Filter for SSH contexts (those starting with 'ssh-')
        ssh_contexts = [
            context for context in all_contexts if context.startswith('ssh-')
        ]

        # Get contexts from SSH node pools file
        all_node_pool_contexts = cls.get_ssh_node_pool_contexts()

        def filter_by_allowed_node_pools(ctxs):
            """Keep only contexts whose pool name is allowed, if configured."""
            if allowed_node_pools is None:
                return ctxs
            return [
                ctx for ctx in ctxs
                if common_utils.removeprefix(ctx, 'ssh-') in allowed_node_pools
            ]

        if all_node_pool_contexts:
            # Only include allowed contexts that exist
            existing_contexts = []
            skipped_contexts = []
            for context in all_node_pool_contexts:
                if context in ssh_contexts:
                    existing_contexts.append(context)
                else:
                    skipped_contexts.append(context)
            if not silent:
                cls._ssh_log_skipped_contexts_once(tuple(skipped_contexts))
            return filter_by_allowed_node_pools(existing_contexts)

        # If no all_node_pool_contexts found, return all SSH contexts
        return filter_by_allowed_node_pools(ssh_contexts)

    @classmethod
    def _check_compute_credentials(
            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
        """Check if the user has access credentials to SSH contexts.

        Returns:
            ``(False, message)`` when dependencies are missing, contexts
            cannot be listed, or no node pool is up. Otherwise
            ``(success, {context: text})`` where ``success`` is True if at
            least one context passed its check.
        """
        # Check for port forward dependencies - reuse Kubernetes implementation
        reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
        if reasons is not None:
            formatted = '\n'.join(
                [reasons[0]] +
                [f'{cls._INDENT_PREFIX}' + r for r in reasons[1:]])
            return (False, formatted)

        # Get SSH contexts
        try:
            existing_allowed_contexts = cls.existing_allowed_contexts()
        except Exception as e:  # pylint: disable=broad-except
            return (False, f'Failed to get SSH contexts: {str(e)}')

        if not existing_allowed_contexts:
            return (False,
                    'No SSH Node Pools are up. Run `sky ssh up` to set up '
                    f'Node Pools from {SSH_NODE_POOLS_PATH}.')

        # Check credentials for each context
        ctx2text = {}
        success = False
        for context in existing_allowed_contexts:
            suc, text = super()._check_single_context(context)
            success = success or suc
            ctx2text[context] = text

        return success, ctx2text

    @classmethod
    def check_single_context(cls, context: str) -> Tuple[bool, str]:
        """Checks if the context is valid and accessible.

        Args:
            context: Node pool name, with or without the 'ssh-' prefix.

        Returns:
            ``(True, 'SSH Node Pool is set up.')`` on success, otherwise
            ``(False, reason)``.
        """
        reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
        if reasons is not None:
            formatted = '\n'.join(
                [reasons[0]] +
                [f'{cls._INDENT_PREFIX}' + r for r in reasons[1:]])
            return (False, formatted)

        # Add ssh- prefix to the context
        if not context.startswith('ssh-'):
            context = f'ssh-{context}'

        # Get SSH contexts
        try:
            existing_allowed_contexts = cls.existing_allowed_contexts()
        except Exception as e:  # pylint: disable=broad-except
            return (False, f'Failed to get SSH contexts: {str(e)}')

        if not existing_allowed_contexts:
            return (False,
                    'No SSH Node Pools are up. Run `sky ssh up` to set up '
                    f'Node Pools from {SSH_NODE_POOLS_PATH}.')

        if context not in existing_allowed_contexts:
            return (False, f'SSH Node Pool {context} is not set up. '
                    f'Run `sky ssh up --infra {context}` to set up.')

        # Check if the context is valid
        suc, text = super()._check_single_context(context)
        if not suc:
            return (False, text)

        return (True, 'SSH Node Pool is set up.')

    @classmethod
    def expand_infras(cls) -> List[str]:
        """List one '<cloud>/<node pool>' infra string per allowed context."""
        # removeprefix, not str.lstrip: lstrip('ssh-') strips any leading
        # 's', 'h' or '-' characters, which mangles pool names starting with
        # those letters (e.g. 'ssh-shared' -> 'ared').
        return [
            f'{cls.canonical_name()}/{common_utils.removeprefix(c, "ssh-")}'
            for c in cls.existing_allowed_contexts(silent=True)
        ]

    @classmethod
    def display_name(cls) -> str:
        """Return the user-facing name of this cloud."""
        return 'SSH Node Pools'
|
sky/clouds/utils/aws_utils.py
CHANGED
|
@@ -28,10 +28,16 @@ class AWSReservation:
|
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
def use_reservations() -> bool:
|
|
31
|
-
prioritize_reservations = skypilot_config.
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
('
|
|
31
|
+
prioritize_reservations = skypilot_config.get_effective_region_config(
|
|
32
|
+
cloud='aws',
|
|
33
|
+
region=None,
|
|
34
|
+
keys=('prioritize_reservations',),
|
|
35
|
+
default_value=False)
|
|
36
|
+
specific_reservations = skypilot_config.get_effective_region_config(
|
|
37
|
+
cloud='aws',
|
|
38
|
+
region=None,
|
|
39
|
+
keys=('specific_reservations',),
|
|
40
|
+
default_value=set())
|
|
35
41
|
return prioritize_reservations or specific_reservations
|
|
36
42
|
|
|
37
43
|
|