skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Shadeform provisioner."""
|
|
2
|
+
|
|
3
|
+
from sky.provision.shadeform.config import bootstrap_instances
|
|
4
|
+
from sky.provision.shadeform.instance import cleanup_ports
|
|
5
|
+
from sky.provision.shadeform.instance import get_cluster_info
|
|
6
|
+
from sky.provision.shadeform.instance import open_ports
|
|
7
|
+
from sky.provision.shadeform.instance import query_instances
|
|
8
|
+
from sky.provision.shadeform.instance import run_instances
|
|
9
|
+
from sky.provision.shadeform.instance import stop_instances
|
|
10
|
+
from sky.provision.shadeform.instance import terminate_instances
|
|
11
|
+
from sky.provision.shadeform.instance import wait_instances
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Shadeform configuration bootstrapping."""
|
|
2
|
+
|
|
3
|
+
from sky.provision import common
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Hand the provision config back to the caller untouched.

    Args:
        region: Accepted for interface compatibility; not used.
        cluster_name: Accepted for interface compatibility; not used.
        config: The provision config to pass through.

    Returns:
        The very same ``config`` object that was passed in.
    """
    # Neither argument influences the result; drop the local bindings so
    # that it is explicit they are intentionally ignored.
    del cluster_name, region
    return config
|
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
"""Shadeform instance provisioning."""
|
|
2
|
+
import time
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from sky import sky_logging
|
|
8
|
+
from sky.provision import common
|
|
9
|
+
from sky.provision.shadeform import shadeform_utils
|
|
10
|
+
from sky.utils import status_lib
|
|
11
|
+
|
|
12
|
+
# Seconds to sleep between two consecutive polls while waiting for instances.
POLL_INTERVAL = 10
# Default upper bound, in seconds, on how long to wait for instances to
# become ready.
INSTANCE_READY_TIMEOUT = 3600

# Module-level logger.
logger = sky_logging.init_logger(__name__)

# Status mapping from Shadeform to SkyPilot
# Keys are Shadeform status strings; values are SkyPilot cluster statuses.
# NOTE(review): 'deleted' is mapped to STOPPED here - confirm this is the
# intended representation for deleted instances.
SHADEFORM_STATUS_MAP = {
    'creating': status_lib.ClusterStatus.INIT,
    'pending_provider': status_lib.ClusterStatus.INIT,
    'pending': status_lib.ClusterStatus.INIT,
    'active': status_lib.ClusterStatus.UP,
    'deleted': status_lib.ClusterStatus.STOPPED,
}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _get_cluster_instances(cluster_name_on_cloud: str) -> Dict[str, Any]:
    """Collect the Shadeform instances that belong to one cluster.

    An instance belongs to the cluster when its name is exactly
    ``<cluster_name_on_cloud>-head`` or ``<cluster_name_on_cloud>-worker``.

    Args:
        cluster_name_on_cloud: Cluster name used as the instance name prefix.

    Returns:
        Mapping of instance ID to the instance record. An empty dict is
        returned (after logging a warning) when listing the instances fails
        with a ValueError, KeyError or a requests exception.
    """
    wanted_names = (f'{cluster_name_on_cloud}-head',
                    f'{cluster_name_on_cloud}-worker')
    try:
        all_instances = shadeform_utils.get_instances().get('instances', [])
        # A record without an 'id' raises KeyError, which is handled below
        # like any other listing failure.
        return {
            record['id']: record
            for record in all_instances
            if record.get('name') in wanted_names
        }
    except (ValueError, KeyError, requests.exceptions.RequestException) as e:
        logger.warning(f'Failed to get instances: {e}')
        return {}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
49
|
+
"""Get the head instance ID from a list of instances."""
|
|
50
|
+
for instance_id, instance in instances.items():
|
|
51
|
+
if instance.get('name', '').endswith('-head'):
|
|
52
|
+
return instance_id
|
|
53
|
+
return None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _wait_for_instances_ready(cluster_name_on_cloud: str,
                              expected_count: int,
                              timeout: int = INSTANCE_READY_TIMEOUT) -> bool:
    """Poll until enough of the cluster's instances are ready.

    An instance counts as ready when its status is 'active' and both its
    'ip' and 'ssh_port' entries are not None.

    Args:
        cluster_name_on_cloud: Cluster whose instances are polled.
        expected_count: Number of ready instances required.
        timeout: Maximum time to keep polling, in seconds.

    Returns:
        True as soon as at least ``expected_count`` instances are ready;
        False if the timeout elapses first.
    """

    def _is_ready(record: Dict[str, Any]) -> bool:
        # Same short-circuit order as a chained `and`: status, ip, ssh_port.
        if record.get('status') != 'active':
            return False
        if record.get('ip') is None:
            return False
        return record.get('ssh_port') is not None

    began_at = time.time()
    while time.time() - began_at < timeout:
        records = _get_cluster_instances(cluster_name_on_cloud).values()
        ready_count = sum(1 for record in records if _is_ready(record))

        logger.info(f'Waiting for instances to be ready: '
                    f'({ready_count}/{expected_count})')

        if ready_count >= expected_count:
            return True

        # Not enough instances yet; wait before asking again.
        time.sleep(POLL_INTERVAL)

    return False
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _parse_shade_instance_type(instance_type_full: Any) -> Tuple[str, str]:
    """Split a ``{cloud}_{instance_type}`` string into its two parts.

    The text before the first underscore is the cloud provider; the rest is
    the instance type, which is then normalized: hyphens become underscores,
    a single trailing ``B`` is dropped, and ``GBx`` becomes ``Gx``.

    Raises:
        AssertionError: If the value is not a string containing an
            underscore, or if either resulting part is empty.
    """
    assert (isinstance(instance_type_full, str) and
            '_' in instance_type_full), \
        f'InstanceType must be in format cloud_instance_type, got: ' \
        f'{instance_type_full}'

    # Example: "massedcompute_A6000-basex2" -> cloud="massedcompute",
    # instance_type="A6000-basex2" (before normalization).
    cloud, _, instance_type = instance_type_full.partition('_')

    # Shadeform uses underscores instead of hyphens
    instance_type = instance_type.replace('-', '_')

    if instance_type.endswith('B'):
        instance_type = instance_type[:-1]

    # Replace "GBx" with "Gx" (case sensitive)
    instance_type = instance_type.replace('GBx', 'Gx')

    assert cloud, 'Cloud provider cannot be empty'
    assert instance_type, 'Instance type cannot be empty'
    return cloud, instance_type


def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Create instances until the cluster has ``config.count`` active ones.

    Existing instances are looked up by cluster name. If enough of them are
    already active, nothing is created. Otherwise the missing number of
    instances is created (the first one becomes the head if no head exists),
    and the function then waits for ``config.count`` instances to be ready.

    Args:
        region: Region passed to the Shadeform create call.
        cluster_name: Unused; ``cluster_name_on_cloud`` is used instead.
        cluster_name_on_cloud: Cluster name used to name and find instances.
        config: Provision config; ``count``, ``node_config['InstanceType']``
            and ``authentication_config`` are read from it.

    Returns:
        A ProvisionRecord with the head instance ID and the IDs of the
        instances created by this call.

    Raises:
        RuntimeError: If enough instances are active but none is a head
            node, or if waiting for readiness times out.
        AssertionError: If ``InstanceType`` is missing or malformed.
        Exception: Any error from instance creation is re-raised after a
            best-effort deletion of the instances created by this call.
    """
    del cluster_name  # unused - we use cluster_name_on_cloud
    logger.info(f'Running instances for cluster {cluster_name_on_cloud} '
                f'in region {region}')
    logger.debug(f'DEBUG: region type={type(region)}, value={region!r}')
    logger.debug(f'DEBUG: config node_config={config.node_config}')

    # Check existing instances
    existing_instances = _get_cluster_instances(cluster_name_on_cloud)
    head_instance_id = _get_head_instance_id(existing_instances)

    # Only instances whose status is 'active' count toward the target.
    # NOTE(review): instances in any other state are not counted here, so
    # they do not reduce the number created below - confirm this is intended.
    active_instances = {
        iid: inst
        for iid, inst in existing_instances.items()
        if inst.get('status') == 'active'
    }

    current_count = len(active_instances)
    target_count = config.count

    logger.info(f'Current instances: {current_count}, target: {target_count}')

    if current_count >= target_count:
        if head_instance_id is None:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no head node')
        logger.info(f'Cluster already has {current_count} instances, '
                    f'no need to start more')
        return common.ProvisionRecord(
            provider_name='shadeform',
            cluster_name=cluster_name_on_cloud,
            region=region,
            zone=None,  # Shadeform doesn't use separate zones
            head_instance_id=head_instance_id,
            resumed_instance_ids=[],
            created_instance_ids=[])

    # Create new instances
    to_create = target_count - current_count
    created_instance_ids = []

    # The instance spec and SSH key are the same for every node, so resolve
    # them once up front instead of on every loop iteration.
    # node_config['InstanceType'] follows the format
    # {cloud_provider}_{instance_type} (e.g., "massedcompute_A6000_basex2").
    node_config = config.node_config
    assert 'InstanceType' in node_config, \
        'InstanceType must be present in node_config'
    cloud, instance_type = _parse_shade_instance_type(
        node_config['InstanceType'])

    # Get SSH key ID for authentication - this is optional and may be None
    ssh_key_id = config.authentication_config.get('ssh_key_id')

    for _ in range(to_create):
        node_type = 'head' if head_instance_id is None else 'worker'
        instance_name = f'{cluster_name_on_cloud}-{node_type}'

        create_config = {
            'cloud': cloud,
            'region': region,
            'shade_instance_type': instance_type,
            'name': instance_name,
            'ssh_key_id': ssh_key_id
        }

        try:
            logger.info(f'Creating {node_type} instance: {instance_name}')
            response = shadeform_utils.create_instance(create_config)
            instance_id = response['id']
            created_instance_ids.append(instance_id)

            if head_instance_id is None:
                head_instance_id = instance_id

            logger.info(f'Created instance {instance_id} ({node_type})')

        except Exception as e:  # pylint: disable=broad-except
            logger.error(f'Failed to create instance: {e}')
            # Best-effort rollback of the instances created by this call;
            # the original error is re-raised afterwards.
            for iid in created_instance_ids:
                try:
                    shadeform_utils.delete_instance(iid)
                except requests.exceptions.RequestException as cleanup_e:
                    logger.warning(
                        f'Failed to cleanup instance {iid}: {cleanup_e}')
            raise

    # Wait for all instances to be ready
    logger.info('Waiting for instances to become ready...')
    if not _wait_for_instances_ready(cluster_name_on_cloud, target_count):
        raise RuntimeError('Timed out waiting for instances to be ready')

    assert head_instance_id is not None, 'head_instance_id should not be None'

    # NOTE(review): the early-return path above reports zone=None while this
    # path reports zone=region - confirm which value callers expect.
    return common.ProvisionRecord(provider_name='shadeform',
                                  cluster_name=cluster_name_on_cloud,
                                  region=region,
                                  zone=region,
                                  head_instance_id=head_instance_id,
                                  resumed_instance_ids=[],
                                  created_instance_ids=created_instance_ids)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """No-op: every argument is discarded and nothing is waited on.

    Per the original author's note, Shadeform instances are ready once they
    reach 'active' status, and that wait is already done in run_instances.
    """
    # All parameters are intentionally ignored.
    del state
    del cluster_name_on_cloud
    del region
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def stop_instances(cluster_name_on_cloud: str,
                   provider_config: Optional[Dict[str, Any]] = None,
                   worker_only: bool = False) -> None:
    """Always raise: stopping instances is not supported by Shadeform.

    Raises:
        NotImplementedError: Unconditionally; all arguments are ignored.
    """
    del worker_only, provider_config, cluster_name_on_cloud  # unused
    unsupported = 'Stopping instances is not supported by Shadeform'
    raise NotImplementedError(unsupported)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def terminate_instances(cluster_name_on_cloud: str,
                        provider_config: Optional[Dict[str, Any]] = None,
                        worker_only: bool = False) -> None:
    """Delete the cluster's instances, optionally sparing the head node.

    Args:
        cluster_name_on_cloud: Cluster name used to look up the instances.
        provider_config: Unused.
        worker_only: When True, instances whose name ends with ``-head``
            are kept and only the others are deleted.

    A failed delete request is logged as a warning and does not stop the
    remaining deletions.
    """
    del provider_config  # unused
    logger.info(f'Terminating instances for cluster {cluster_name_on_cloud}')

    found = _get_cluster_instances(cluster_name_on_cloud)
    if not found:
        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
        return

    if worker_only:
        # Keep the head; everything else is a worker.
        doomed = {
            key: record
            for key, record in found.items()
            if not record.get('name', '').endswith('-head')
        }
    else:
        doomed = found

    for instance_id, record in doomed.items():
        try:
            logger.info(
                f'Terminating instance {instance_id} ({record.get("name")})')
            shadeform_utils.delete_instance(instance_id)
        except requests.exceptions.RequestException as e:
            logger.warning(f'Failed to terminate instance {instance_id}: {e}')
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Build a ClusterInfo describing the cluster's Shadeform instances.

    Args:
        region: Unused.
        cluster_name_on_cloud: Cluster name used to look up the instances.
        provider_config: Unused.

    Returns:
        A ClusterInfo with one InstanceInfo per instance. The same ``ip``
        value is used for both the internal and external address. When no
        instances are found, an empty ClusterInfo with no head is returned.
    """
    del region, provider_config  # unused
    instances = _get_cluster_instances(cluster_name_on_cloud)

    if not instances:
        return common.ClusterInfo(instances={},
                                  head_instance_id=None,
                                  provider_name='shadeform')

    head_instance_id = _get_head_instance_id(instances)

    # Convert instance format for ClusterInfo
    cluster_instances = {}
    for instance_id, instance in instances.items():
        # ``ip``/``ssh_port`` can be present but null (the readiness check in
        # this module tests them against None). ``.get(key, default)`` would
        # return None in that case, so fall back with ``or`` instead.
        ip = instance.get('ip') or ''
        instance_info = common.InstanceInfo(
            instance_id=instance_id,
            internal_ip=ip,
            external_ip=ip,
            ssh_port=instance.get('ssh_port') or 22,
            tags={},
        )
        # ClusterInfo expects Dict[InstanceId, List[InstanceInfo]]
        cluster_instances[instance_id] = [instance_info]

    ssh_user = 'shadeform'  # default
    if head_instance_id is not None:
        # Use the head's ssh_user; a missing or null value keeps the default.
        ssh_user = (instances.get(head_instance_id, {}).get('ssh_user') or
                    'shadeform')

    return common.ClusterInfo(instances=cluster_instances,
                              head_instance_id=head_instance_id,
                              provider_name='shadeform',
                              ssh_user=ssh_user)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
    retry_if_missing: bool = False,
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
    """Query the status of the cluster's instances.

    Args:
        cluster_name: Unused.
        cluster_name_on_cloud: Cluster name used to look up the instances.
        provider_config: Unused.
        non_terminated_only: When True, instances whose mapped status is
            STOPPED are left out of the result.
        retry_if_missing: Unused. Accepted so this signature matches the
            sibling providers' ``query_instances``.

    Returns:
        Mapping of instance ID to ``(status, None)``. A Shadeform status
        that is not in SHADEFORM_STATUS_MAP is reported as INIT. An empty
        dict is returned when no instances are found.
    """
    del cluster_name, provider_config, retry_if_missing  # unused
    instances = _get_cluster_instances(cluster_name_on_cloud)

    if not instances:
        return {}

    status_map: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                                Optional[str]]] = {}
    for instance_id, instance in instances.items():
        shadeform_status = instance.get('status', 'unknown')
        sky_status = SHADEFORM_STATUS_MAP.get(shadeform_status,
                                              status_lib.ClusterStatus.INIT)

        if (non_terminated_only and
                sky_status == status_lib.ClusterStatus.STOPPED):
            continue

        status_map[instance_id] = (sky_status, None)

    return status_map
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def open_ports(cluster_name_on_cloud: str,
               ports: List[str],
               provider_config: Optional[Dict[str, Any]] = None) -> None:
    """Always raise: opening ports is not supported by Shadeform.

    Raises:
        NotImplementedError: Unconditionally; all arguments are ignored.
    """
    del provider_config, ports, cluster_name_on_cloud  # unused
    raise NotImplementedError
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def cleanup_ports(cluster_name_on_cloud: str,
                  ports: List[str],
                  provider_config: Optional[Dict[str, Any]] = None) -> None:
    """Do nothing: there are no dynamically opened ports to clean up.

    All arguments are ignored.
    """
    # Dynamic port opening is not supported, so nothing needs to be undone.
    del provider_config
    del ports
    del cluster_name_on_cloud
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Shadeform API utilities."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from typing import Any, Dict
|
|
5
|
+
|
|
6
|
+
from sky.adaptors import common
|
|
7
|
+
|
|
8
|
+
# Lazy import to avoid dependency on external packages
|
|
9
|
+
requests = common.LazyImport('requests')
|
|
10
|
+
|
|
11
|
+
# Shadeform API configuration
|
|
12
|
+
SHADEFORM_API_BASE = 'https://api.shadeform.ai/v1'
|
|
13
|
+
SHADEFORM_API_KEY_PATH = '~/.shadeform/api_key'
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_api_key() -> str:
    """Read the Shadeform API key from SHADEFORM_API_KEY_PATH.

    Returns:
        The file's contents with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If the user-expanded path does not exist.
        ValueError: If the file is empty or contains only whitespace.
    """
    key_file = os.path.expanduser(SHADEFORM_API_KEY_PATH)

    if os.path.exists(key_file):
        with open(key_file, 'r', encoding='utf-8') as fh:
            key = fh.read().strip()
        if key:
            return key
        raise ValueError(f'Shadeform API key is empty in {key_file}')

    raise FileNotFoundError(
        f'Shadeform API key not found at {key_file}. '
        'Please save your API key to this file.')
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def make_request(method: str, endpoint: str, **kwargs) -> Any:
    """Make an authenticated request to the Shadeform API.

    Args:
        method: HTTP method, e.g. ``'GET'`` or ``'POST'``.
        endpoint: Path appended to SHADEFORM_API_BASE; leading slashes are
            stripped first.
        **kwargs: Passed through to ``requests.request``. If ``timeout`` is
            not given, a default of 60 seconds is applied.

    Returns:
        The decoded JSON body, or an empty dict when the body is empty or
        whitespace only.

    Raises:
        requests.exceptions.HTTPError: Via ``raise_for_status`` when the
            response status is an error.
    """
    url = f'{SHADEFORM_API_BASE}/{endpoint.lstrip("/")}'
    headers = {
        'X-API-KEY': get_api_key(),
        'Content-Type': 'application/json',
    }

    # Without a timeout, requests waits indefinitely on an unresponsive
    # server. Callers can still pass their own ``timeout`` to override this.
    default_timeout_seconds = 60
    kwargs.setdefault('timeout', default_timeout_seconds)

    response = requests.request(method, url, headers=headers, **kwargs)
    response.raise_for_status()

    # Some APIs (like delete) return empty responses with just 200 status
    if response.text.strip():
        return response.json()
    # Return empty dict for empty responses (e.g., delete operations)
    return {}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_instances() -> Dict[str, Any]:
    """Return the result of a GET request to the ``/instances`` endpoint."""
    listing = make_request('GET', '/instances')
    return listing
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_instance_info(instance_id: str) -> Dict[str, Any]:
    """Return the result of a GET to ``/instances/{instance_id}/info``."""
    endpoint = f'/instances/{instance_id}/info'
    return make_request('GET', endpoint)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def create_instance(config: Dict[str, Any]) -> Dict[str, Any]:
    """POST ``config`` as the JSON body to ``/instances/create``.

    Returns whatever ``make_request`` returns for that call.
    """
    created = make_request('POST', '/instances/create', json=config)
    return created
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def delete_instance(instance_id: str) -> Dict[str, Any]:
    """Send a POST request to ``/instances/{instance_id}/delete``.

    Note: per the original author, the Shadeform delete API returns an
    empty response with a 200 status.
    """
    endpoint = f'/instances/{instance_id}/delete'
    return make_request('POST', endpoint)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_ssh_keys() -> Dict[str, Any]:
    """Return the result of a GET request to the ``/sshkeys`` endpoint."""
    keys = make_request('GET', '/sshkeys')
    return keys
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def add_ssh_key(name: str, public_key: str) -> Dict[str, Any]:
    """POST a new SSH key to ``/sshkeys/add``.

    The JSON body carries the given ``name`` and ``public_key``.
    """
    payload = dict(name=name, public_key=public_key)
    return make_request('POST', '/sshkeys/add', json=payload)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""SSH provisioner for SkyPilot.
|
|
2
|
+
|
|
3
|
+
This module implements the provisioner interface for SSH targets.
|
|
4
|
+
It reuses most of the functionality from the Kubernetes provisioner,
|
|
5
|
+
since the SSH implementation is based on Kubernetes under the hood.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from sky.provision.kubernetes.config import bootstrap_instances
|
|
9
|
+
from sky.provision.kubernetes.instance import get_cluster_info
|
|
10
|
+
from sky.provision.kubernetes.instance import get_command_runners
|
|
11
|
+
from sky.provision.kubernetes.instance import query_instances
|
|
12
|
+
from sky.provision.kubernetes.instance import run_instances
|
|
13
|
+
from sky.provision.kubernetes.instance import stop_instances
|
|
14
|
+
from sky.provision.kubernetes.instance import terminate_instances
|
|
15
|
+
from sky.provision.kubernetes.instance import wait_instances
|
|
16
|
+
from sky.provision.kubernetes.network import cleanup_ports
|
|
17
|
+
from sky.provision.kubernetes.network import open_ports
|
|
18
|
+
from sky.provision.kubernetes.network import query_ports
|
sky/provision/vast/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Vast instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -39,14 +39,15 @@ def _filter_instances(cluster_name_on_cloud: str,
|
|
|
39
39
|
|
|
40
40
|
def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
41
41
|
for inst_id, inst in instances.items():
|
|
42
|
-
if inst['name'].endswith('-head'):
|
|
42
|
+
if inst.get('name') and inst['name'].endswith('-head'):
|
|
43
43
|
return inst_id
|
|
44
44
|
return None
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
47
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
48
48
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
49
49
|
"""Runs instances for the given cluster."""
|
|
50
|
+
del cluster_name # unused
|
|
50
51
|
pending_status = ['CREATED', 'RESTARTING']
|
|
51
52
|
|
|
52
53
|
created_instance_ids = []
|
|
@@ -97,7 +98,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
97
98
|
region=region,
|
|
98
99
|
disk_size=config.node_config['DiskSize'],
|
|
99
100
|
preemptible=config.node_config['Preemptible'],
|
|
100
|
-
image_name=config.node_config['ImageId']
|
|
101
|
+
image_name=config.node_config['ImageId'],
|
|
102
|
+
ports=config.ports_to_open_on_launch)
|
|
101
103
|
except Exception as e: # pylint: disable=broad-except
|
|
102
104
|
logger.warning(f'run_instances error: {e}')
|
|
103
105
|
raise
|
|
@@ -215,12 +217,14 @@ def open_ports(
|
|
|
215
217
|
|
|
216
218
|
|
|
217
219
|
def query_instances(
|
|
220
|
+
cluster_name: str,
|
|
218
221
|
cluster_name_on_cloud: str,
|
|
219
222
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
220
223
|
non_terminated_only: bool = True,
|
|
221
|
-
|
|
224
|
+
retry_if_missing: bool = False,
|
|
225
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
222
226
|
"""See sky/provision/__init__.py"""
|
|
223
|
-
|
|
227
|
+
del cluster_name, retry_if_missing # unused
|
|
224
228
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
225
229
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
226
230
|
# "running", "frozen", "stopped", "unknown", "loading"
|
|
@@ -230,12 +234,13 @@ def query_instances(
|
|
|
230
234
|
'STOPPED': status_lib.ClusterStatus.STOPPED,
|
|
231
235
|
'RUNNING': status_lib.ClusterStatus.UP,
|
|
232
236
|
}
|
|
233
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
237
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
238
|
+
Optional[str]]] = {}
|
|
234
239
|
for inst_id, inst in instances.items():
|
|
235
240
|
status = status_map[inst['status']]
|
|
236
241
|
if non_terminated_only and status is None:
|
|
237
242
|
continue
|
|
238
|
-
statuses[inst_id] = status
|
|
243
|
+
statuses[inst_id] = (status, None)
|
|
239
244
|
return statuses
|
|
240
245
|
|
|
241
246
|
|
sky/provision/vast/utils.py
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
# python sdk.
|
|
6
6
|
#
|
|
7
7
|
"""Vast library wrapper for SkyPilot."""
|
|
8
|
-
from typing import Any, Dict, List
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
9
|
|
|
10
10
|
from sky import sky_logging
|
|
11
11
|
from sky.adaptors import vast
|
|
@@ -34,7 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
|
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
37
|
-
image_name: str,
|
|
37
|
+
image_name: str, ports: Optional[List[int]],
|
|
38
|
+
preemptible: bool) -> str:
|
|
38
39
|
"""Launches an instance with the given parameters.
|
|
39
40
|
|
|
40
41
|
Converts the instance_type to the Vast GPU name, finds the specs for the
|
|
@@ -58,6 +59,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
|
58
59
|
The disk size {xx} GB is not exactly matched the requested
|
|
59
60
|
size {yy} GB. It is possible to charge extra cost on disk.
|
|
60
61
|
|
|
62
|
+
* `ports`: This is a feature flag to expose ports to the internet.
|
|
63
|
+
|
|
61
64
|
* `geolocation`: Geolocation on Vast can be as specific as the
|
|
62
65
|
host chooses to be. They can say, for instance, "Yutakachō,
|
|
63
66
|
Shinagawa District, Tokyo, JP." Such a specific geolocation
|
|
@@ -78,10 +81,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
|
78
81
|
amount of memory.
|
|
79
82
|
|
|
80
83
|
* Vast instance types are an invention for skypilot. Refer to
|
|
81
|
-
|
|
82
|
-
of the type.
|
|
83
|
-
|
|
84
|
-
"""
|
|
84
|
+
catalog/vast_catalog.py for the current construction
|
|
85
|
+
of the type."""
|
|
85
86
|
cpu_ram = float(instance_type.split('-')[-1]) / 1024
|
|
86
87
|
gpu_name = instance_type.split('-')[1].replace('_', ' ')
|
|
87
88
|
num_gpus = int(instance_type.split('-')[0].replace('x', ''))
|
|
@@ -104,11 +105,13 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
|
|
|
104
105
|
|
|
105
106
|
instance_touse = instance_list[0]
|
|
106
107
|
|
|
108
|
+
port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
|
|
109
|
+
|
|
107
110
|
launch_params = {
|
|
108
111
|
'id': instance_touse['id'],
|
|
109
112
|
'direct': True,
|
|
110
113
|
'ssh': True,
|
|
111
|
-
'env': '-e __SOURCE=skypilot',
|
|
114
|
+
'env': f'-e __SOURCE=skypilot {port_map}',
|
|
112
115
|
'onstart_cmd': ';'.join([
|
|
113
116
|
'touch ~/.no_auto_tmux',
|
|
114
117
|
f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
|