skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/server/rest.py
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
"""REST API client of SkyPilot API server"""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import contextlib
|
|
5
|
+
import contextvars
|
|
6
|
+
import functools
|
|
7
|
+
import time
|
|
8
|
+
import typing
|
|
9
|
+
from typing import Any, Callable, cast, Optional, TypeVar
|
|
10
|
+
|
|
11
|
+
import colorama
|
|
12
|
+
import urllib3.exceptions
|
|
13
|
+
|
|
14
|
+
from sky import exceptions
|
|
15
|
+
from sky import sky_logging
|
|
16
|
+
from sky.adaptors import common as adaptors_common
|
|
17
|
+
from sky.server import constants
|
|
18
|
+
from sky.server import versions
|
|
19
|
+
from sky.utils import common_utils
|
|
20
|
+
from sky.utils import rich_utils
|
|
21
|
+
from sky.utils import ux_utils
|
|
22
|
+
|
|
23
|
+
logger = sky_logging.init_logger(__name__)
|
|
24
|
+
|
|
25
|
+
if typing.TYPE_CHECKING:
|
|
26
|
+
import aiohttp
|
|
27
|
+
import requests
|
|
28
|
+
|
|
29
|
+
else:
|
|
30
|
+
aiohttp = adaptors_common.LazyImport('aiohttp')
|
|
31
|
+
requests = adaptors_common.LazyImport('requests')
|
|
32
|
+
|
|
33
|
+
# Type variable used by the decorators in this module so that a decorated
# callable keeps its own type.
F = TypeVar('F', bound=Callable[..., Any])

# Context-local slot for the RetryContext of the retry loop currently in
# progress. Defaults to None, i.e. no retry loop is active.
_RETRY_CONTEXT = contextvars.ContextVar('retry_context', default=None)

# Module-level requests session, created once at import time.
_session = requests.Session()
# Tune connection pool size, otherwise the default max is just 10.
adapter = requests.adapters.HTTPAdapter(
    pool_connections=50,
    pool_maxsize=200,
    # We handle retries by ourselves in SDK.
    max_retries=0,
)
# Use the tuned adapter for both plain and TLS URLs.
_session.mount('http://', adapter)
_session.mount('https://', adapter)

# Default headers on the session: the API version and the readable local
# version.
_session.headers[constants.API_VERSION_HEADER] = str(constants.API_VERSION)
_session.headers[constants.VERSION_HEADER] = (
    versions.get_local_readable_version())

# Enumerate error types that might be transient and can be addressed by
# retrying.
_transient_errors = [
    requests.exceptions.RequestException,
    ConnectionError,
    urllib3.exceptions.HTTPError,
]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class RetryContext:
    """Mutable progress record shared between a retry loop and its callee.

    The retry decorator in this module compares ``line_processed`` before and
    after a failed attempt to decide whether the attempt made progress.
    """

    def __init__(self) -> None:
        # Progress counter; starts at zero for every new retry loop.
        self.line_processed: int = 0
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@contextlib.contextmanager
def _retry_in_context():
    """Bind a fresh RetryContext to ``_RETRY_CONTEXT`` for a ``with`` block.

    Yields:
        The newly created RetryContext.

    The previous value of ``_RETRY_CONTEXT`` is restored when the block
    exits, whether it finishes normally or raises.
    """
    retry_ctx = RetryContext()
    reset_token = _RETRY_CONTEXT.set(retry_ctx)
    try:
        yield retry_ctx
    finally:
        _RETRY_CONTEXT.reset(reset_token)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_retry_context() -> Optional[RetryContext]:
    """Return the RetryContext currently stored in ``_RETRY_CONTEXT``.

    Returns:
        The active RetryContext, or the context variable's default when
        nothing has been set.
    """
    current = _RETRY_CONTEXT.get()
    return current
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def retry_transient_errors(max_retries: int = 3,
                           initial_backoff=1,
                           max_backoff_factor=5):
    """Decorator that retries a function when a transient error is caught.

    This decorator is mainly used to decorate idempotent SDK functions to make
    it more robust to transient errors.

    The wrapped function is always attempted at least once. A failed attempt
    counts against ``max_retries`` only if the RetryContext shows no progress
    (``line_processed`` did not grow) since the previous failure; an attempt
    that made progress resets both the failure count and the backoff.

    Args:
        max_retries: Maximum number of consecutive failed attempts without
            progress before the last error is re-raised.
        initial_backoff: Initial backoff time in seconds
        max_backoff_factor: Maximum backoff factor for exponential backoff

    Raises:
        Whatever the wrapped function raised, once the error is found to be
        permanent or the retries are exhausted. This includes
        ``exceptions.RequestInterruptedError``.
    """

    def is_transient_error(e: Exception) -> bool:
        if isinstance(e, requests.exceptions.HTTPError):
            # Only server error is considered as transient. An HTTPError can
            # be raised without a response attached; with no status code to
            # inspect it is treated as permanent so that the original error
            # is re-raised instead of an AttributeError from this check.
            response = e.response
            return response is not None and response.status_code >= 500
        return isinstance(e, tuple(_transient_errors))

    def decorator(func: F) -> F:

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            backoff = common_utils.Backoff(initial_backoff, max_backoff_factor)
            consecutive_failed_count = 0

            with _retry_in_context() as context:
                previous_line_processed = context.line_processed  # should be 0

                def _handle_exception():
                    # If the function made progress on a retry,
                    # clears the backoff and resets the failed retry count.
                    # Otherwise, increments the failed retry count.
                    nonlocal backoff
                    nonlocal consecutive_failed_count
                    nonlocal previous_line_processed
                    if context.line_processed > previous_line_processed:
                        backoff = common_utils.Backoff(initial_backoff,
                                                       max_backoff_factor)
                        previous_line_processed = context.line_processed
                        consecutive_failed_count = 0
                    else:
                        consecutive_failed_count += 1

                # Loop until func() returns or an error is re-raised. Every
                # exit from the loop is explicit, so an exhausted retry
                # budget can never fall through and return None.
                while True:
                    try:
                        return func(*args, **kwargs)
                    # Occurs when the server proactively interrupts the request
                    # during rolling update, we can retry immediately on the
                    # new replica.
                    except exceptions.RequestInterruptedError:
                        _handle_exception()
                        if consecutive_failed_count >= max_retries:
                            # Retries exhausted: surface the interruption
                            # to the caller rather than swallowing it.
                            raise
                        logger.debug('Request interrupted. Retry immediately.')
                        continue
                    except Exception as e:  # pylint: disable=broad-except
                        _handle_exception()
                        if consecutive_failed_count >= max_retries:
                            # Retries exhausted.
                            raise
                        if not is_transient_error(e):
                            # Permanent error, no need to retry.
                            raise
                        logger.debug(
                            f'Retry {func.__name__} due to {e}, '
                            f'attempt {consecutive_failed_count}/{max_retries}')
                        # Skip the sleep when the failed attempt made
                        # progress (the count was just reset to 0): in that
                        # case we try again immediately to reduce the
                        # waiting time.
                        if consecutive_failed_count > 0:
                            time.sleep(backoff.current_backoff())

        return cast(F, wrapper)

    return decorator
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _retry_on_server_unavailable(max_wait_seconds: int = 600,
                                 initial_backoff: float = 5.0,
                                 max_backoff_factor: int = 5):
    """Decorator that keeps retrying while the API server is unavailable.

    Intended for functions that issue a RESTful API call: whenever the call
    raises ServerTemporarilyUnavailableError, the wrapper sleeps with
    exponential backoff and calls the function again, so the caller waits
    for the server to recover.

    Args:
        max_wait_seconds: Maximum number of seconds to wait for the server to
            be healthy
        initial_backoff: Initial backoff time in seconds
        max_backoff_factor: Maximum backoff factor for exponential backoff

    Raises:
        exceptions.ServerTemporarilyUnavailableError: if the server is still
            unavailable after more than ``max_wait_seconds`` have elapsed
            since the first attempt (chained from the last error).
    """

    def _format_unavailable_notice(message: str) -> str:
        # Yellow notice shown in the client status while waiting to retry.
        return (f'{colorama.Fore.YELLOW}API server is temporarily '
                f'unavailable: {message}.\nRetrying...'
                f'{colorama.Style.RESET_ALL}')

    def decorator(func: F) -> F:

        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            retry_backoff = common_utils.Backoff(
                initial_backoff=initial_backoff,
                max_backoff_factor=max_backoff_factor)
            began_at = time.time()
            attempt = 0

            with _retry_in_context():
                while True:
                    attempt += 1
                    try:
                        return func(*args, **kwargs)
                    except exceptions.ServerTemporarilyUnavailableError as e:
                        notice = _format_unavailable_notice(e.message)
                        # The status spinner is opened and closed on every
                        # retry iteration on purpose: it must be stopped
                        # before func() runs again, otherwise it can get
                        # stuck when func() runs for a long time without
                        # updating the status (e.g. sky logs).
                        with rich_utils.client_status(notice):
                            waited = time.time() - began_at
                            if waited > max_wait_seconds:
                                timeout_error = (
                                    exceptions.ServerTemporarilyUnavailableError(
                                        'Timeout waiting for the API server to be '
                                        f'available after {max_wait_seconds}s.'))
                                raise timeout_error from e

                            sleep_time = retry_backoff.current_backoff()
                            time.sleep(sleep_time)
                            logger.debug('The API server is unavailable. '
                                         f'Retrying {func.__name__} '
                                         f'(attempt {attempt}, '
                                         f'backoff {sleep_time}s).')

        return cast(F, wrapper)

    return decorator
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def handle_server_unavailable(response: 'requests.Response') -> None:
    """Turn a 503 (Service Unavailable) response into an exception.

    The client gets a 503 error in the following cases:
    1. The reverse proxy cannot find any ready backend endpoint to serve the
       request, e.g. during a rolling update.
    2. The SkyPilot API server has a temporary resource issue, e.g. when the
       concurrency of the handling process is exhausted.

    The caller (CLI or SDK) is expected to retry in these cases and show a
    clear wait message, so that the user can decide whether to keep waiting
    or abort the request.

    Args:
        response: The HTTP response to inspect. Any status other than 503 is
            ignored.

    Raises:
        exceptions.ServerTemporarilyUnavailableError: If the status code is
            503. The message is the 'detail' field of the JSON body when
            present, otherwise the raw response text, otherwise empty.
    """
    if response.status_code != 503:
        return

    message = ''
    try:
        payload = response.json()
        if 'detail' in payload:
            message = payload['detail']
    except Exception:  # pylint: disable=broad-except
        # The body is not usable JSON; fall back to the raw text, if any.
        message = response.text or message

    with ux_utils.print_exception_no_traceback():
        raise exceptions.ServerTemporarilyUnavailableError(message)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
async def handle_server_unavailable_async(
        response: 'aiohttp.ClientResponse') -> None:
    """Async variant: turn a 503 (Service Unavailable) into an exception.

    The client gets a 503 error in the following cases:
    1. The reverse proxy cannot find any ready backend endpoint to serve the
       request, e.g. during a rolling update.
    2. The SkyPilot API server has a temporary resource issue, e.g. when the
       concurrency of the handling process is exhausted.

    The caller (CLI or SDK) is expected to retry in these cases and show a
    clear wait message, so that the user can decide whether to keep waiting
    or abort the request.

    Args:
        response: The HTTP response to inspect. Any status other than 503 is
            ignored.

    Raises:
        exceptions.ServerTemporarilyUnavailableError: If the status is 503.
            The message is the 'detail' field of the JSON body when present,
            otherwise the response text, otherwise empty.
    """
    if response.status != 503:
        return

    message = ''
    try:
        payload = await response.json()
        if 'detail' in payload:
            message = payload['detail']
    except Exception:  # pylint: disable=broad-except
        # The body is not usable JSON; best-effort fall back to plain text.
        try:
            message = (await response.text()) or message
        except Exception:  # pylint: disable=broad-except
            # Reading the body failed as well; keep the empty message.
            pass

    with ux_utils.print_exception_no_traceback():
        raise exceptions.ServerTemporarilyUnavailableError(message)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
@_retry_on_server_unavailable()
def request(method, url, **kwargs) -> 'requests.Response':
    """Send a request to the API server, waiting out temporary outages.

    Delegates to request_without_retry(); the decorator retries the call
    while it raises ServerTemporarilyUnavailableError.

    Args:
        method: HTTP method, forwarded as-is.
        url: Target URL, forwarded as-is.
        **kwargs: Extra keyword arguments, forwarded as-is.

    Returns:
        The response returned by request_without_retry().
    """
    response = request_without_retry(method, url, **kwargs)
    return response
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def request_without_retry(method, url, **kwargs) -> 'requests.Response':
    """Send a single request to the API server, without any retry.

    Args:
        method: HTTP method, forwarded to the module-level session.
        url: Target URL, forwarded to the module-level session.
        **kwargs: Extra keyword arguments, forwarded to the session.

    Returns:
        The response object.

    Raises:
        exceptions.ServerTemporarilyUnavailableError: If the server answers
            with 503 (raised by handle_server_unavailable()).
    """
    resp = _session.request(method, url, **kwargs)
    handle_server_unavailable(resp)

    # Record the server's versions, when the response advertises them.
    api_version_header = resp.headers.get(constants.API_VERSION_HEADER)
    version_header = resp.headers.get(constants.VERSION_HEADER)
    if api_version_header is not None:
        versions.set_remote_api_version(int(api_version_header))
    if version_header is not None:
        versions.set_remote_version(version_header)
    return resp
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
# Async versions of the above functions
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
async def request_async(session: 'aiohttp.ClientSession', method: str, url: str,
                        **kwargs) -> 'aiohttp.ClientResponse':
    """Send an async request to the API server, retrying transient failures.

    Up to three attempts are made:
      * RequestInterruptedError is retried immediately.
      * Any other exception is retried after a backoff sleep, provided
        _is_transient_error_async() classifies it as transient.

    Args:
        session: The aiohttp session used to send the request.
        method: HTTP method.
        url: Target URL.
        **kwargs: Extra keyword arguments forwarded to
            request_without_retry_async().

    Returns:
        The response of the first successful attempt.

    Raises:
        exceptions.RequestInterruptedError: If the final attempt is
            interrupted as well.
        Exception: The error of the failing attempt, when it is not
            transient or the retries are exhausted.
    """
    max_retries = 3
    initial_backoff = 1.0
    max_backoff_factor = 5

    backoff = common_utils.Backoff(initial_backoff, max_backoff_factor)

    attempt = 0
    while True:
        attempt += 1
        retries_exhausted = attempt >= max_retries
        try:
            return await request_without_retry_async(session, method, url,
                                                     **kwargs)
        except exceptions.RequestInterruptedError:
            # Surface the real error once we are out of attempts. Falling
            # out of the loop here used to raise a meaningless placeholder
            # exception (or a stale error from an earlier attempt).
            if retries_exhausted:
                raise
            logger.debug('Request interrupted. Retry immediately.')
        except Exception as e:  # pylint: disable=broad-except
            # Retries exhausted, or a permanent error: no point in retrying.
            if retries_exhausted or not _is_transient_error_async(e):
                raise

            logger.debug(f'Retry async request due to {e}, '
                         f'attempt {attempt}/{max_retries}')
            await asyncio.sleep(backoff.current_backoff())
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
async def request_without_retry_async(session: 'aiohttp.ClientSession',
                                      method: str, url: str,
                                      **kwargs) -> 'aiohttp.ClientResponse':
    """Send a single async request to the API server, without any retry.

    The local API version and readable version are added to the request
    headers (the 'headers' mapping in kwargs is updated in place), and the
    versions advertised in the response headers are recorded.

    Args:
        session: The aiohttp session used to send the request.
        method: HTTP method.
        url: Target URL.
        **kwargs: Extra keyword arguments forwarded to session.request().

    Returns:
        The response object.

    Raises:
        exceptions.RequestInterruptedError: If the connection cannot be
            established or the request times out.
        exceptions.ServerTemporarilyUnavailableError: If the server answers
            with 503.
        aiohttp.ClientError: Any other client error, unchanged.
    """
    # Add API version headers for compatibility. A missing or None 'headers'
    # argument is replaced by a fresh dict.
    if kwargs.get('headers') is None:
        kwargs['headers'] = {}
    kwargs['headers'][constants.API_VERSION_HEADER] = str(constants.API_VERSION)
    kwargs['headers'][constants.VERSION_HEADER] = (
        versions.get_local_readable_version())

    try:
        response = await session.request(method, url, **kwargs)
    except aiohttp.ClientConnectorError as e:
        raise exceptions.RequestInterruptedError(
            f'Connection failed: {e}') from e
    except asyncio.TimeoutError as e:
        # aiohttp signals timeouts with asyncio.TimeoutError (its
        # ServerTimeoutError derives from it). aiohttp.ClientTimeout is the
        # timeout *configuration* class and is never raised, so checking
        # for it could not match anything.
        raise exceptions.RequestInterruptedError(
            f'Request timeout: {e}') from e

    # Handle server unavailability (503 status).
    await handle_server_unavailable_async(response)

    # Record the server's versions, when the response advertises them.
    remote_api_version = response.headers.get(constants.API_VERSION_HEADER)
    remote_version = response.headers.get(constants.VERSION_HEADER)
    if remote_api_version is not None:
        versions.set_remote_api_version(int(remote_api_version))
    if remote_version is not None:
        versions.set_remote_version(remote_version)

    return response
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _is_transient_error_async(e: Exception) -> bool:
    """Tell whether an exception from an async request is worth retrying.

    Args:
        e: The exception raised by the request attempt.

    Returns:
        False only for an aiohttp.ClientResponseError whose status is below
        500; True for everything else.
    """
    # For response errors, only a server error (5xx) is considered transient.
    if isinstance(e, aiohttp.ClientResponseError):
        return e.status >= 500

    # Connection errors, timeouts and ServerTemporarilyUnavailableError are
    # transient. It is hard to enumerate all other transient errors (broken
    # pipe, connection refused, etc.), so it is safer to assume every other
    # error might be transient, given that the caller only retries a few
    # times. The previous explicit check against aiohttp.ClientTimeout was
    # dropped: that is a timeout configuration class, not an exception, so
    # it could never match.
    return True
|