skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
"""Hyperbolic API utilities."""
|
|
2
|
+
import enum
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
from typing import Any, Dict, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import requests
|
|
9
|
+
|
|
10
|
+
from sky import authentication
|
|
11
|
+
from sky import sky_logging
|
|
12
|
+
from sky.utils import status_lib
|
|
13
|
+
|
|
14
|
+
# TODO: update to prod endpoint
|
|
15
|
+
BASE_URL = 'https://api.hyperbolic.xyz'
|
|
16
|
+
API_KEY_PATH = '~/.hyperbolic/api_key'
|
|
17
|
+
|
|
18
|
+
MAX_RETRIES = 3
|
|
19
|
+
RETRY_DELAY = 2 # seconds
|
|
20
|
+
TIMEOUT = 120
|
|
21
|
+
|
|
22
|
+
logger = sky_logging.init_logger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class HyperbolicError(Exception):
    """Base exception for Hyperbolic API errors.

    Raised by the helpers in this module when a request to the Hyperbolic
    API fails or returns data that cannot be interpreted.
    """
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class HyperbolicInstanceStatus(enum.Enum):
    """Statuses enum for Hyperbolic instances.

    Each member's value is the lowercase status string that
    `from_raw_status` matches against.
    """
    UNKNOWN = 'unknown'
    ONLINE = 'online'
    OFFLINE = 'offline'
    STARTING = 'starting'
    STOPPING = 'stopping'
    BUSY = 'busy'
    RESTARTING = 'restarting'
    CREATING = 'creating'
    FAILED = 'failed'
    ERROR = 'error'
    TERMINATED = 'terminated'

    @classmethod
    def cluster_status_map(
        cls
    ) -> Dict['HyperbolicInstanceStatus', Optional[status_lib.ClusterStatus]]:
        """Return the mapping from instance status to SkyPilot cluster status.

        ONLINE is the only status mapped to UP, TERMINATED maps to None, and
        every other status maps to INIT.
        """
        return {
            cls.CREATING: status_lib.ClusterStatus.INIT,
            cls.STARTING: status_lib.ClusterStatus.INIT,
            cls.ONLINE: status_lib.ClusterStatus.UP,
            cls.FAILED: status_lib.ClusterStatus.INIT,
            cls.ERROR: status_lib.ClusterStatus.INIT,
            cls.RESTARTING: status_lib.ClusterStatus.INIT,
            cls.STOPPING: status_lib.ClusterStatus.INIT,
            cls.UNKNOWN: status_lib.ClusterStatus.INIT,
            cls.BUSY: status_lib.ClusterStatus.INIT,
            cls.OFFLINE: status_lib.ClusterStatus.INIT,
            cls.TERMINATED: None,
        }

    @classmethod
    def from_raw_status(cls, status: str) -> 'HyperbolicInstanceStatus':
        """Convert raw status string to HyperbolicInstanceStatus enum.

        The lookup is case-insensitive.

        Raises:
            HyperbolicError: If `status` is not a string or does not match
                any member value.
        """
        # A missing/null status would otherwise surface as an AttributeError
        # from `.lower()`; report it with the module's own error type instead.
        if not isinstance(status, str):
            raise HyperbolicError(f'Unknown instance status: {status}')
        try:
            return cls(status.lower())
        except ValueError as exc:
            raise HyperbolicError(f'Unknown instance status: {status}') from exc

    def to_cluster_status(self) -> Optional[status_lib.ClusterStatus]:
        """Convert to SkyPilot cluster status (None for TERMINATED)."""
        return self.cluster_status_map().get(self)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class HyperbolicClient:
|
|
76
|
+
"""Client for interacting with the Hyperbolic API."""
|
|
77
|
+
|
|
78
|
+
def __init__(self):
    """Initialize the Hyperbolic client with API credentials.

    Reads the API key from API_KEY_PATH (with `~` expanded) and stores it,
    together with a default Authorization header and the API base URL, on
    the instance.

    Raises:
        RuntimeError: If the key file does not exist, or if it contains
            nothing but whitespace.
    """
    cred_path = os.path.expanduser(API_KEY_PATH)
    if not os.path.exists(cred_path):
        raise RuntimeError(f'API key not found at {cred_path}')
    with open(cred_path, 'r', encoding='utf-8') as f:
        api_key = f.read().strip()
    if not api_key:
        # Fail here with a clear message rather than building a bare
        # 'Bearer ' Authorization header from an empty key.
        raise RuntimeError(f'API key file at {cred_path} is empty')
    self.api_key = api_key
    self.headers = {'Authorization': f'Bearer {self.api_key}'}
    self.api_url = BASE_URL
|
|
87
|
+
|
|
88
|
+
def _make_request(
        self,
        method: str,
        endpoint: str,
        payload: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Make an API request to Hyperbolic.

    Args:
        method: HTTP method; only 'GET' and 'POST' are supported.
        endpoint: Path appended to BASE_URL to form the request URL.
        payload: JSON-serializable body; sent only for POST requests.

    Returns:
        The decoded JSON response body, or an empty dict when a successful
        response does not contain valid JSON.

    Raises:
        HyperbolicError: If the method is unsupported, the request fails at
            the transport level, the response status is not OK, or any
            other unexpected error occurs.
    """
    url = f'{BASE_URL}{endpoint}'
    headers = {
        'Authorization': f'Bearer {self.api_key}',
        'Content-Type': 'application/json'
    }

    # Debug logging for request
    logger.debug(f'Making {method} request to {url}')
    if payload:
        logger.debug(f'Request payload: {json.dumps(payload, indent=2)}')

    try:
        if method == 'GET':
            response = requests.get(url, headers=headers, timeout=TIMEOUT)
        elif method == 'POST':
            response = requests.post(url,
                                     headers=headers,
                                     json=payload,
                                     timeout=TIMEOUT)
        else:
            raise HyperbolicError(f'Unsupported HTTP method: {method}')

        # Debug logging for response
        logger.debug(f'Response status code: {response.status_code}')
        logger.debug(f'Response headers: {dict(response.headers)}')

        # Try to parse response as JSON
        try:
            response_data = response.json()
            logger.debug(
                f'Response body: {json.dumps(response_data, indent=2)}')
        except ValueError as exc:
            # ValueError is the common base of the stdlib and simplejson
            # decode errors, so this works whichever backend requests uses.
            # If response is not JSON, use the raw text
            response_text = response.text
            logger.debug(f'Response body (raw): {response_text}')
            if not response.ok:
                raise HyperbolicError(f'API request failed with status '
                                      f'{response.status_code}: '
                                      f'{response_text}') from exc
            # If response is OK but not JSON, return empty dict
            return {}

        if not response.ok:
            # The error body is not guaranteed to be a JSON object; fall
            # back to the raw text for lists, strings, numbers, etc.
            if isinstance(response_data, dict):
                error_msg = response_data.get(
                    'error', response_data.get('message', response.text))
            else:
                error_msg = response.text
            raise HyperbolicError(
                f'API request failed with status {response.status_code}: '
                f'{error_msg}')

        return response_data
    except HyperbolicError:
        # Already a well-formed API error raised above; do not re-wrap it
        # as an "unexpected" error.
        raise
    except requests.exceptions.RequestException as e:
        raise HyperbolicError(f'Request failed: {str(e)}') from e
    except Exception as e:
        raise HyperbolicError(
            f'Unexpected error during API request: {str(e)}') from e
|
|
149
|
+
|
|
150
|
+
def launch_instance(self, gpu_model: str, gpu_count: int,
                    name: str) -> Tuple[str, str]:
    """Create an instance and block until it is usable over SSH.

    Args:
        gpu_model: GPU model, sent to the API as ``gpuModel``.
        gpu_count: Number of GPUs; sent to the API as a string.
        name: Cluster name recorded under ``userMetadata.skypilot`` and
            used afterwards to look the instance up again.

    Returns:
        A ``(instance_id, ssh_command)`` tuple.

    Raises:
        HyperbolicError: If the API returns no instance name, the instance
            does not reach ONLINE, it cannot be found after launch, it has
            no SSH command, or any other error occurs during launch.
    """
    gpu_count_text = str(gpu_count)
    launched_at = str(int(time.time()))
    request_body = {
        'gpuModel': gpu_model,
        'gpuCount': gpu_count_text,
        'userMetadata': {
            'skypilot': {
                'cluster_name': name,
                'launch_time': launched_at
            }
        }
    }
    # Let the authentication module amend the payload before sending it.
    request_body = authentication.setup_hyperbolic_authentication(
        request_body)

    try:
        api_reply = self._make_request(
            'POST',
            '/v2/marketplace/instances/create-cheapest',
            payload=request_body)
        logger.debug(f'Launch response: {json.dumps(api_reply, indent=2)}')

        new_id = api_reply.get('instanceName')
        if not new_id:
            logger.error(f'No instance ID in response: {api_reply}')
            raise HyperbolicError('No instance ID returned from API')

        logger.info(f'Successfully launched instance {new_id}, '
                    f'waiting for it to be ready...')

        # Block until the instance is ONLINE (or the wait gives up).
        came_online = self.wait_for_instance(
            new_id, HyperbolicInstanceStatus.ONLINE.value)
        if not came_online:
            raise HyperbolicError(
                f'Instance {new_id} failed to reach ONLINE state')

        # Re-read the instance record to obtain its SSH command.
        cluster_filter = {'skypilot': {'cluster_name': name}}
        record = self.list_instances(metadata=cluster_filter).get(new_id)
        if not record:
            raise HyperbolicError(
                f'Instance {new_id} not found after launch')

        ssh_cmd = record.get('sshCommand')
        if not ssh_cmd:
            logger.error(f'No SSH command available for instance {new_id}')
            raise HyperbolicError('No SSH command available for instance')

        logger.info(f'Instance {new_id} is ready with SSH command')
        return new_id, ssh_cmd

    except Exception as err:
        # Every failure (including the HyperbolicErrors raised above) is
        # logged and re-wrapped with a common prefix.
        reason = str(err)
        logger.error(f'Failed to launch instance: {reason}')
        raise HyperbolicError(
            f'Failed to launch instance: {reason}') from err
|
|
208
|
+
|
|
209
|
+
def list_instances(
    self,
    status: Optional[str] = None,
    metadata: Optional[Dict[str, Dict[str, str]]] = None
) -> Dict[str, Dict[str, Any]]:
    """List all instances, optionally filtered by status and metadata.

    Args:
        status: When given, keep only instances whose parsed status value
            equals ``status.lower()``.
        metadata: When given (and non-empty), keep only instances whose
            ``userMetadata['skypilot']['cluster_name']`` starts with
            ``metadata['skypilot']['cluster_name']`` (prefix match).

    Returns:
        A dict keyed by instance id; each value is a flat dict with the
        keys ``id``, ``created``, ``sshCommand``, ``status``,
        ``gpu_count``, ``gpus_total``, ``owner``, ``cpus``, ``gpus``,
        ``ram``, ``storage``, ``pricing`` and ``metadata``.

    Raises:
        HyperbolicError: If the request fails or the response cannot be
            processed.
    """
    endpoint = '/v1/marketplace/instances'
    try:
        response = self._make_request('GET', endpoint)
        logger.debug(f'Raw API response: {json.dumps(response, indent=2)}')

        # The requested cluster-name prefix is the same for every instance,
        # so resolve it once. `or` is used instead of a .get() default
        # because a default does not apply when the key is present but
        # null.
        skypilot_metadata: Dict[str, str] = {}
        cluster_name = ''
        if metadata:
            skypilot_metadata = metadata.get('skypilot') or {}
            cluster_name = skypilot_metadata.get('cluster_name') or ''

        instances = {}
        for instance in response.get('instances') or []:
            instance_id = instance.get('id')
            instance_info = instance.get('instance') or {}
            current_status = instance_info.get('status')
            logger.debug(f'Instance {instance_id} status: {current_status}')

            # Convert raw status to enum; skip instances whose status
            # cannot be parsed rather than failing the whole listing.
            try:
                instance_status = HyperbolicInstanceStatus.from_raw_status(
                    current_status)
            except HyperbolicError as e:
                logger.warning(f'Failed to parse status for instance '
                               f'{instance_id}: {e}')
                continue

            if status and instance_status.value != status.lower():
                continue

            # Null-safe: a record without SkyPilot metadata must not make
            # the whole listing fail.
            user_metadata = instance.get('userMetadata') or {}
            if metadata:
                instance_skypilot = user_metadata.get('skypilot') or {}
                instance_cluster = (instance_skypilot.get('cluster_name') or
                                    '')
                if not instance_cluster.startswith(cluster_name):
                    logger.debug(
                        f'Skipping instance {instance_id} - '
                        f'skypilot metadata {instance_skypilot} '
                        f'does not match {skypilot_metadata}')
                    continue
                logger.debug(f'Including instance {instance_id} '
                             f'- skypilot metadata matches')

            hardware = instance_info.get('hardware') or {}
            instances[instance_id] = {
                'id': instance_id,
                'created': instance.get('created'),
                'sshCommand': instance.get('sshCommand'),
                'status': instance_status.value,
                'gpu_count': instance_info.get('gpu_count'),
                'gpus_total': instance_info.get('gpus_total'),
                'owner': instance_info.get('owner'),
                'cpus': hardware.get('cpus'),
                'gpus': hardware.get('gpus'),
                'ram': hardware.get('ram'),
                'storage': hardware.get('storage'),
                'pricing': instance_info.get('pricing'),
                'metadata': user_metadata
            }
        return instances
    except Exception as e:
        raise HyperbolicError(f'Failed to list instances: {str(e)}') from e
|
|
273
|
+
|
|
274
|
+
def terminate_instance(self, instance_id: str) -> None:
    """Ask the API to terminate the instance with the given id.

    Raises:
        HyperbolicError: If the terminate request fails for any reason.
    """
    try:
        self._make_request('POST',
                           '/v1/marketplace/instances/terminate',
                           payload={'id': instance_id})
    except Exception as err:
        reason = str(err)
        raise HyperbolicError(
            f'Failed to terminate instance {instance_id}: {reason}'
        ) from err
|
|
283
|
+
|
|
284
|
+
def wait_for_instance(self,
                      instance_id: str,
                      target_status: str,
                      timeout: int = TIMEOUT) -> bool:
    """Poll until the instance has the target status and an SSH command.

    Args:
        instance_id: Id of the instance to watch.
        target_status: Raw status string; parsed with
            ``HyperbolicInstanceStatus.from_raw_status``.
        timeout: Give up once this many seconds have elapsed.

    Returns:
        True once the instance reports the target status and has an SSH
        command; False on timeout or when the instance reports a
        failed/error/terminated status.
    """
    began = time.time()
    wanted = HyperbolicInstanceStatus.from_raw_status(target_status).value
    logger.info(f'Waiting for instance {instance_id} '
                f'to reach status {wanted} and have SSH command')

    dead_states = ('failed', 'error', 'terminated')
    while True:
        waited = time.time() - began
        if waited >= timeout:
            logger.error(f'Timeout after {int(waited)}s '
                         f'waiting for instance {instance_id}')
            return False

        try:
            record = self.list_instances().get(instance_id)
            if record:
                state = record.get('status', '').lower()
                ssh_cmd = record.get('sshCommand')
                logger.debug(f'Current status: {state}, '
                             f'Target status: {wanted}, '
                             f'SSH command: {ssh_cmd}')

                # Success needs both the status and a usable SSH command.
                if state == wanted and ssh_cmd:
                    logger.info(f'Instance {instance_id} reached '
                                f'target status {wanted} '
                                f'and has SSH command after {int(waited)}s')
                    return True

                if state in dead_states:
                    logger.error(f'Instance {instance_id} reached '
                                 f'terminal status: {state} '
                                 f'after {int(waited)}s')
                    return False
            else:
                logger.warning(f'Instance {instance_id} not found')

            time.sleep(5)
        except Exception as err:  # pylint: disable=broad-except
            # Errors while polling are logged and retried until timeout.
            logger.warning(
                f'Error while waiting for instance {instance_id}: {str(err)}')
            time.sleep(5)
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
# Lazily created client shared by the module-level wrapper functions.
_client = None


def get_client() -> HyperbolicClient:
    """Return the shared HyperbolicClient, creating it on first use."""
    global _client
    if _client is not None:
        return _client
    _client = HyperbolicClient()
    return _client
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
# Backward-compatible wrapper functions
|
|
350
|
+
def launch_instance(gpu_model: str, gpu_count: int,
                    name: str) -> Tuple[str, str]:
    """Module-level shortcut for ``get_client().launch_instance(...)``."""
    client = get_client()
    return client.launch_instance(gpu_model, gpu_count, name)
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def list_instances(
    status: Optional[str] = None,
    metadata: Optional[Dict[str, Dict[str, str]]] = None
) -> Dict[str, Dict[str, Any]]:
    """Module-level shortcut for ``get_client().list_instances(...)``."""
    client = get_client()
    return client.list_instances(status=status, metadata=metadata)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def terminate_instance(instance_id: str) -> None:
    """Module-level shortcut for ``get_client().terminate_instance(...)``."""
    client = get_client()
    return client.terminate_instance(instance_id)
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def wait_for_instance(instance_id: str,
                      target_status: str,
                      timeout: int = TIMEOUT) -> bool:
    """Module-level shortcut for ``get_client().wait_for_instance(...)``."""
    client = get_client()
    return client.wait_for_instance(instance_id, target_status, timeout)
|
sky/provision/instance_setup.py
CHANGED
|
@@ -8,7 +8,9 @@ import time
|
|
|
8
8
|
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
9
9
|
|
|
10
10
|
from sky import exceptions
|
|
11
|
+
from sky import logs
|
|
11
12
|
from sky import provision
|
|
13
|
+
from sky import resources as resources_lib
|
|
12
14
|
from sky import sky_logging
|
|
13
15
|
from sky.provision import common
|
|
14
16
|
from sky.provision import docker_utils
|
|
@@ -21,6 +23,7 @@ from sky.utils import accelerator_registry
|
|
|
21
23
|
from sky.utils import command_runner
|
|
22
24
|
from sky.utils import common_utils
|
|
23
25
|
from sky.utils import env_options
|
|
26
|
+
from sky.utils import resources_utils
|
|
24
27
|
from sky.utils import subprocess_utils
|
|
25
28
|
from sky.utils import timeline
|
|
26
29
|
from sky.utils import ux_utils
|
|
@@ -82,7 +85,7 @@ def _set_usage_run_id_cmd() -> str:
|
|
|
82
85
|
latest one when the function is called.
|
|
83
86
|
"""
|
|
84
87
|
return (
|
|
85
|
-
f'cat {usage_constants.USAGE_RUN_ID_FILE} || '
|
|
88
|
+
f'cat {usage_constants.USAGE_RUN_ID_FILE} 2> /dev/null || '
|
|
86
89
|
# The run id is retrieved locally for the current run, so that the
|
|
87
90
|
# remote cluster will be set with the same run id as the initial
|
|
88
91
|
# launch operation.
|
|
@@ -90,12 +93,6 @@ def _set_usage_run_id_cmd() -> str:
|
|
|
90
93
|
f'{usage_constants.USAGE_RUN_ID_FILE}')
|
|
91
94
|
|
|
92
95
|
|
|
93
|
-
def _set_skypilot_env_var_cmd() -> str:
|
|
94
|
-
"""Sets the skypilot environment variables on the remote machine."""
|
|
95
|
-
env_vars = env_options.Options.all_options()
|
|
96
|
-
return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
|
|
97
|
-
|
|
98
|
-
|
|
99
96
|
def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
|
|
100
97
|
"""Decorator that retries the function if it fails.
|
|
101
98
|
|
|
@@ -134,6 +131,20 @@ def _hint_worker_log_path(cluster_name: str, cluster_info: common.ClusterInfo,
|
|
|
134
131
|
logger.info(f'Logs of worker nodes can be found at: {worker_log_path}')
|
|
135
132
|
|
|
136
133
|
|
|
134
|
+
class SSHThreadPoolExecutor(futures.ThreadPoolExecutor):
    """ThreadPoolExecutor that kills children processes on exit."""

    def __exit__(self, exc_type, exc_val, exc_tb):
        # The ssh command runner ends up in log_lib.run_with_log, which
        # spawns subprocesses. Kill those children when leaving the
        # context so they are not leaked, then shut the pool down.
        subprocess_utils.kill_children_processes()
        self.shutdown(wait=True)
        # Never suppress an exception raised inside the `with` body.
        return False
|
|
146
|
+
|
|
147
|
+
|
|
137
148
|
def _parallel_ssh_with_cache(func,
|
|
138
149
|
cluster_name: str,
|
|
139
150
|
stage_name: str,
|
|
@@ -146,7 +157,7 @@ def _parallel_ssh_with_cache(func,
|
|
|
146
157
|
# as 32 is too large for some machines.
|
|
147
158
|
max_workers = subprocess_utils.get_parallel_threads(
|
|
148
159
|
cluster_info.provider_name)
|
|
149
|
-
with
|
|
160
|
+
with SSHThreadPoolExecutor(max_workers=max_workers) as pool:
|
|
150
161
|
results = []
|
|
151
162
|
runners = provision.get_command_runners(cluster_info.provider_name,
|
|
152
163
|
cluster_info, **ssh_credentials)
|
|
@@ -423,8 +434,16 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
|
|
|
423
434
|
# use the external IP of the head node.
|
|
424
435
|
use_external_ip = cluster_info.custom_ray_options.pop(
|
|
425
436
|
'use_external_ip', False)
|
|
426
|
-
|
|
427
|
-
|
|
437
|
+
|
|
438
|
+
if use_external_ip:
|
|
439
|
+
head_ip = head_instance.external_ip
|
|
440
|
+
else:
|
|
441
|
+
# For Kubernetes, use the internal service address of the head node.
|
|
442
|
+
# Keep this consistent with the logic in kubernetes-ray.yml.j2
|
|
443
|
+
if head_instance.internal_svc:
|
|
444
|
+
head_ip = head_instance.internal_svc
|
|
445
|
+
else:
|
|
446
|
+
head_ip = head_instance.internal_ip
|
|
428
447
|
|
|
429
448
|
ray_cmd = ray_worker_start_command(custom_resource,
|
|
430
449
|
cluster_info.custom_ray_options,
|
|
@@ -466,11 +485,38 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
|
|
|
466
485
|
@common.log_function_start_end
|
|
467
486
|
@_auto_retry()
|
|
468
487
|
@timeline.event
|
|
469
|
-
def start_skylet_on_head_node(
|
|
470
|
-
|
|
471
|
-
|
|
488
|
+
def start_skylet_on_head_node(
|
|
489
|
+
cluster_name: resources_utils.ClusterName,
|
|
490
|
+
cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any],
|
|
491
|
+
launched_resources: resources_lib.Resources) -> None:
|
|
472
492
|
"""Start skylet on the head node."""
|
|
473
|
-
|
|
493
|
+
# Avoid circular import.
|
|
494
|
+
# pylint: disable=import-outside-toplevel
|
|
495
|
+
from sky.utils import controller_utils
|
|
496
|
+
|
|
497
|
+
def _set_skypilot_env_var_cmd() -> str:
    """Build the `export ...` command for SkyPilot env vars on the remote.

    Returns:
        A single shell string of `export K=V` statements joined by '; '.
    """
    exports = {}
    for opt_name, opt_value in env_options.Options.all_options().items():
        exports[opt_name] = str(opt_value)

    controller = controller_utils.Controllers.from_name(
        cluster_name.display_name)
    is_controller = controller is not None
    is_kubernetes = cluster_info.provider_name == 'kubernetes'
    if is_controller and is_kubernetes:
        # For jobs/serve controller, we pass in the CPU and memory limits
        # when starting the skylet to handle cases where these env vars
        # are not set on the cluster's pod spec. The skylet will read
        # these env vars when starting (ManagedJobEvent.start()) and write
        # it to disk.
        launchable = launched_resources.assert_launchable()
        cpu_limit, mem_limit = (
            launchable.cloud.get_vcpus_mem_from_instance_type(
                launchable.instance_type))
        if cpu_limit is not None:
            exports['SKYPILOT_POD_CPU_CORE_LIMIT'] = str(cpu_limit)
        if mem_limit is not None:
            exports['SKYPILOT_POD_MEMORY_GB_LIMIT'] = str(mem_limit)

    statements = [f'export {key}={val}' for key, val in exports.items()]
    return '; '.join(statements)
|
|
519
|
+
|
|
474
520
|
runners = provision.get_command_runners(cluster_info.provider_name,
|
|
475
521
|
cluster_info, **ssh_credentials)
|
|
476
522
|
head_runner = runners[0]
|
|
@@ -557,3 +603,36 @@ def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str],
|
|
|
557
603
|
ssh_credentials=ssh_credentials,
|
|
558
604
|
max_workers=subprocess_utils.get_max_workers_for_file_mounts(
|
|
559
605
|
common_file_mounts, cluster_info.provider_name))
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
@common.log_function_start_end
@timeline.event
def setup_logging_on_cluster(logging_agent: logs.LoggingAgent,
                             cluster_name: resources_utils.ClusterName,
                             cluster_info: common.ClusterInfo,
                             ssh_credentials: Dict[str, Any]) -> None:
    """Run the logging agent's setup command on every node of the cluster.

    Args:
        logging_agent: Agent that supplies the setup command.
        cluster_name: Cluster name; ``name_on_cloud`` is used for the log
            hint and for the parallel SSH stage.
        cluster_info: Provisioned cluster information.
        ssh_credentials: Credentials forwarded to the parallel SSH helper.

    Raises:
        RuntimeError: From the per-node step when the setup command exits
            with a non-zero code.
    """
    stage = 'logging_setup'
    name_on_cloud = cluster_name.name_on_cloud
    _hint_worker_log_path(name_on_cloud, cluster_info, stage)

    @_auto_retry()
    def _setup_node(runner: command_runner.CommandRunner, log_path: str):
        setup_cmd = logging_agent.get_setup_command(cluster_name)
        logger.info(f'Running command on node: {setup_cmd}')
        exit_code, out, err = runner.run(setup_cmd,
                                         stream_logs=False,
                                         require_outputs=True,
                                         log_path=log_path,
                                         source_bashrc=True)
        if not exit_code:
            return
        raise RuntimeError(f'Failed to setup logging agent\n{setup_cmd}\n'
                           f'(exit code {exit_code}). Error: '
                           f'===== stdout ===== \n{out}\n'
                           f'===== stderr ====={err}')

    _parallel_ssh_with_cache(_setup_node,
                             name_on_cloud,
                             stage_name=stage,
                             digest=None,
                             cluster_info=cluster_info,
                             ssh_credentials=ssh_credentials)
|
|
@@ -11,3 +11,8 @@ from sky.provision.kubernetes.instance import wait_instances
|
|
|
11
11
|
from sky.provision.kubernetes.network import cleanup_ports
|
|
12
12
|
from sky.provision.kubernetes.network import open_ports
|
|
13
13
|
from sky.provision.kubernetes.network import query_ports
|
|
14
|
+
from sky.provision.kubernetes.volume import apply_volume
|
|
15
|
+
from sky.provision.kubernetes.volume import delete_volume
|
|
16
|
+
from sky.provision.kubernetes.volume import get_all_volumes_usedby
|
|
17
|
+
from sky.provision.kubernetes.volume import get_volume_usedby
|
|
18
|
+
from sky.provision.kubernetes.volume import map_all_volumes_usedby
|
|
@@ -3,20 +3,12 @@ import copy
|
|
|
3
3
|
import logging
|
|
4
4
|
import math
|
|
5
5
|
import os
|
|
6
|
-
import
|
|
7
|
-
from typing import Any, Dict, Optional, Union
|
|
6
|
+
from typing import Any, Dict, List, Optional, Union
|
|
8
7
|
|
|
9
|
-
from sky.adaptors import common as adaptors_common
|
|
10
8
|
from sky.adaptors import kubernetes
|
|
11
9
|
from sky.provision import common
|
|
12
|
-
from sky.provision.kubernetes import network_utils
|
|
13
10
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
14
|
-
from sky.utils import
|
|
15
|
-
|
|
16
|
-
if typing.TYPE_CHECKING:
|
|
17
|
-
import yaml
|
|
18
|
-
else:
|
|
19
|
-
yaml = adaptors_common.LazyImport('yaml')
|
|
11
|
+
from sky.utils import yaml_utils
|
|
20
12
|
|
|
21
13
|
logger = logging.getLogger(__name__)
|
|
22
14
|
|
|
@@ -34,11 +26,6 @@ def bootstrap_instances(
|
|
|
34
26
|
|
|
35
27
|
_configure_services(namespace, context, config.provider_config)
|
|
36
28
|
|
|
37
|
-
networking_mode = network_utils.get_networking_mode(
|
|
38
|
-
config.provider_config.get('networking_mode'))
|
|
39
|
-
if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
|
|
40
|
-
config = _configure_ssh_jump(namespace, context, config)
|
|
41
|
-
|
|
42
29
|
requested_service_account = config.node_config['spec']['serviceAccountName']
|
|
43
30
|
if (requested_service_account ==
|
|
44
31
|
kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
|
|
@@ -487,41 +474,6 @@ def _configure_autoscaler_cluster_role_binding(
|
|
|
487
474
|
f'{created_msg(binding_field, name)}')
|
|
488
475
|
|
|
489
476
|
|
|
490
|
-
def _configure_ssh_jump(namespace, context, config: common.ProvisionConfig):
|
|
491
|
-
"""Creates a SSH jump pod to connect to the cluster.
|
|
492
|
-
|
|
493
|
-
Also updates config['auth']['ssh_proxy_command'] to use the newly created
|
|
494
|
-
jump pod.
|
|
495
|
-
"""
|
|
496
|
-
provider_config = config.provider_config
|
|
497
|
-
pod_cfg = config.node_config
|
|
498
|
-
|
|
499
|
-
ssh_jump_name = pod_cfg['metadata']['labels']['skypilot-ssh-jump']
|
|
500
|
-
ssh_jump_image = provider_config['ssh_jump_image']
|
|
501
|
-
|
|
502
|
-
volumes = pod_cfg['spec']['volumes']
|
|
503
|
-
# find 'secret-volume' and get the secret name
|
|
504
|
-
secret_volume = next(filter(lambda x: x['name'] == 'secret-volume',
|
|
505
|
-
volumes))
|
|
506
|
-
ssh_key_secret_name = secret_volume['secret']['secretName']
|
|
507
|
-
|
|
508
|
-
# TODO(romilb): We currently split SSH jump pod and svc creation. Service
|
|
509
|
-
# is first created in authentication.py::setup_kubernetes_authentication
|
|
510
|
-
# and then SSH jump pod creation happens here. This is because we need to
|
|
511
|
-
# set the ssh_proxy_command in the ray YAML before we pass it to the
|
|
512
|
-
# autoscaler. If in the future if we can write the ssh_proxy_command to the
|
|
513
|
-
# cluster yaml through this method, then we should move the service
|
|
514
|
-
# creation here.
|
|
515
|
-
|
|
516
|
-
# TODO(romilb): We should add a check here to make sure the service is up
|
|
517
|
-
# and available before we create the SSH jump pod. If for any reason the
|
|
518
|
-
# service is missing, we should raise an error.
|
|
519
|
-
|
|
520
|
-
kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image,
|
|
521
|
-
ssh_key_secret_name, namespace, context)
|
|
522
|
-
return config
|
|
523
|
-
|
|
524
|
-
|
|
525
477
|
def _configure_skypilot_system_namespace(
|
|
526
478
|
provider_config: Dict[str, Any]) -> None:
|
|
527
479
|
"""Creates the namespace for skypilot-system mounting if it does not exist.
|
|
@@ -592,7 +544,7 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
|
|
|
592
544
|
daemonset_path = os.path.join(
|
|
593
545
|
root_dir, 'kubernetes/manifests/fusermount-server-daemonset.yaml')
|
|
594
546
|
with open(daemonset_path, 'r', encoding='utf-8') as file:
|
|
595
|
-
daemonset =
|
|
547
|
+
daemonset = yaml_utils.safe_load(file)
|
|
596
548
|
kubernetes_utils.merge_custom_metadata(daemonset['metadata'])
|
|
597
549
|
try:
|
|
598
550
|
kubernetes.apps_api(context).create_namespaced_daemon_set(
|
|
@@ -672,4 +624,9 @@ def _configure_services(namespace: str, context: Optional[str],
|
|
|
672
624
|
|
|
673
625
|
|
|
674
626
|
class KubernetesError(Exception):
    """Kubernetes error that can carry a list of insufficient resources."""

    def __init__(self,
                 *args,
                 insufficent_resources: Optional[List[str]] = None):
        # Positional args go to Exception unchanged; the keyword-only list
        # is kept on the instance (None when not supplied).
        super().__init__(*args)
        self.insufficent_resources = insufficent_resources
|
|
@@ -6,3 +6,20 @@ NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure '
|
|
|
6
6
|
'(e.g., skypilot.co/accelerator) are setup correctly. ')
|
|
7
7
|
|
|
8
8
|
KUBERNETES_IN_CLUSTER_NAMESPACE_ENV_VAR = 'SKYPILOT_IN_CLUSTER_NAMESPACE'
|
|
9
|
+
|
|
10
|
+
# Name of kubernetes exec auth wrapper script
SKY_K8S_EXEC_AUTH_WRAPPER = 'sky-kube-exec-wrapper'

# PATH envvar for kubectl exec auth execve
SKY_K8S_EXEC_AUTH_PATH = ('$HOME/skypilot-runtime/bin:'
                          '$HOME/google-cloud-sdk/bin:$PATH')

# cache directory for kubeconfig with modified exec auth
SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE = '~/.sky/generated/kubeconfigs'

# Labels for the Pods created by SkyPilot
TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
TAG_POD_INITIALIZED = 'skypilot-initialized'
TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'

# Pod phases that are not holding PVCs
PVC_NOT_HOLD_POD_PHASES = ['Succeeded', 'Failed']
|