skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/common.py
CHANGED
|
@@ -3,42 +3,54 @@
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import enum
|
|
5
5
|
import functools
|
|
6
|
+
from http.cookiejar import CookieJar
|
|
6
7
|
from http.cookiejar import MozillaCookieJar
|
|
7
|
-
import json
|
|
8
8
|
import os
|
|
9
9
|
import pathlib
|
|
10
10
|
import re
|
|
11
|
+
import shutil
|
|
11
12
|
import subprocess
|
|
12
13
|
import sys
|
|
14
|
+
import tempfile
|
|
15
|
+
import threading
|
|
13
16
|
import time
|
|
14
17
|
import typing
|
|
15
|
-
from typing import Any, Dict, Optional
|
|
16
|
-
|
|
18
|
+
from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
|
|
19
|
+
Tuple, TypeVar, Union)
|
|
17
20
|
import uuid
|
|
18
21
|
|
|
22
|
+
import cachetools
|
|
19
23
|
import colorama
|
|
20
24
|
import filelock
|
|
25
|
+
from passlib import context as passlib_context
|
|
26
|
+
from typing_extensions import ParamSpec
|
|
21
27
|
|
|
22
|
-
import sky
|
|
23
28
|
from sky import exceptions
|
|
24
29
|
from sky import sky_logging
|
|
25
30
|
from sky import skypilot_config
|
|
26
31
|
from sky.adaptors import common as adaptors_common
|
|
32
|
+
from sky.client import service_account_auth
|
|
27
33
|
from sky.data import data_utils
|
|
28
34
|
from sky.server import constants as server_constants
|
|
35
|
+
from sky.server import rest
|
|
36
|
+
from sky.server import versions
|
|
29
37
|
from sky.skylet import constants
|
|
30
38
|
from sky.usage import usage_lib
|
|
31
39
|
from sky.utils import annotations
|
|
32
40
|
from sky.utils import common_utils
|
|
33
41
|
from sky.utils import rich_utils
|
|
34
42
|
from sky.utils import ux_utils
|
|
43
|
+
from sky.utils import yaml_utils
|
|
35
44
|
|
|
36
45
|
if typing.TYPE_CHECKING:
|
|
46
|
+
import aiohttp
|
|
37
47
|
import pydantic
|
|
38
48
|
import requests
|
|
39
49
|
|
|
40
50
|
from sky import dag as dag_lib
|
|
51
|
+
from sky import models
|
|
41
52
|
else:
|
|
53
|
+
aiohttp = adaptors_common.LazyImport('aiohttp')
|
|
42
54
|
pydantic = adaptors_common.LazyImport('pydantic')
|
|
43
55
|
requests = adaptors_common.LazyImport('requests')
|
|
44
56
|
|
|
@@ -50,7 +62,7 @@ AVAILABLE_LOCAL_API_SERVER_URLS = [
|
|
|
50
62
|
|
|
51
63
|
API_SERVER_CMD = '-m sky.server.server'
|
|
52
64
|
# The client dir on the API server for storing user-specific data, such as file
|
|
53
|
-
# mounts, logs, etc. This dir is
|
|
65
|
+
# mounts, logs, etc. This dir is ephemeral and will be cleaned up when the API
|
|
54
66
|
# server is restarted.
|
|
55
67
|
API_SERVER_CLIENT_DIR = pathlib.Path('~/.sky/api_server/clients')
|
|
56
68
|
RETRY_COUNT_ON_TIMEOUT = 3
|
|
@@ -60,34 +72,11 @@ RETRY_COUNT_ON_TIMEOUT = 3
|
|
|
60
72
|
# (e.g. in high contention env) and we will exit eagerly if server exit.
|
|
61
73
|
WAIT_APISERVER_START_TIMEOUT_SEC = 60
|
|
62
74
|
|
|
63
|
-
_VERSION_INFO = (
|
|
64
|
-
f'{colorama.Style.RESET_ALL}'
|
|
65
|
-
f'{colorama.Style.DIM}'
|
|
66
|
-
'client version: v{client_version} (API version: v{client_api_version})\n'
|
|
67
|
-
'server version: v{server_version} (API version: v{server_api_version})'
|
|
68
|
-
f'{colorama.Style.RESET_ALL}')
|
|
69
75
|
_LOCAL_API_SERVER_RESTART_HINT = (
|
|
70
|
-
f'{colorama.Fore.YELLOW}
|
|
76
|
+
f'{colorama.Fore.YELLOW}The local SkyPilot API server is not compatible '
|
|
77
|
+
'with the client. Please restart the API server with:\n'
|
|
71
78
|
f'{colorama.Style.BRIGHT}sky api stop; sky api start'
|
|
72
79
|
f'{colorama.Style.RESET_ALL}')
|
|
73
|
-
_LOCAL_SERVER_VERSION_MISMATCH_WARNING = (
|
|
74
|
-
f'{colorama.Fore.YELLOW}Client and local API server version mismatch:\n'
|
|
75
|
-
'{version_info}\n'
|
|
76
|
-
f'{_LOCAL_API_SERVER_RESTART_HINT}'
|
|
77
|
-
f'{colorama.Style.RESET_ALL}')
|
|
78
|
-
_CLIENT_TOO_OLD_WARNING = (
|
|
79
|
-
f'{colorama.Fore.YELLOW}Your SkyPilot client is too old:\n'
|
|
80
|
-
'{version_info}\n'
|
|
81
|
-
f'{colorama.Fore.YELLOW}Upgrade your client with:\n'
|
|
82
|
-
'{command}'
|
|
83
|
-
f'{colorama.Style.RESET_ALL}')
|
|
84
|
-
_REMOTE_SERVER_TOO_OLD_WARNING = (
|
|
85
|
-
f'{colorama.Fore.YELLOW}SkyPilot API server is too old:\n'
|
|
86
|
-
'{version_info}\n'
|
|
87
|
-
f'{colorama.Fore.YELLOW}Contact your administrator to upgrade the '
|
|
88
|
-
'remote API server or downgrade your local client with:\n'
|
|
89
|
-
'{command}\n'
|
|
90
|
-
f'{colorama.Style.RESET_ALL}')
|
|
91
80
|
_SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
|
|
92
81
|
f'{colorama.Fore.YELLOW}SkyPilot API server version does not match the '
|
|
93
82
|
'installation on disk:\n'
|
|
@@ -99,23 +88,32 @@ _SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
|
|
|
99
88
|
f'{colorama.Fore.YELLOW}This can happen if you upgraded SkyPilot without '
|
|
100
89
|
'restarting the API server.'
|
|
101
90
|
f'{colorama.Style.RESET_ALL}')
|
|
102
|
-
# Parse local API version eargly to catch version format errors.
|
|
103
|
-
_LOCAL_API_VERSION: int = int(server_constants.API_VERSION)
|
|
104
|
-
# SkyPilot dev version.
|
|
105
|
-
_DEV_VERSION = '1.0.0-dev0'
|
|
106
91
|
|
|
107
|
-
|
|
92
|
+
T = TypeVar('T')
|
|
93
|
+
P = ParamSpec('P')
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class RequestId(str, Generic[T]):
|
|
97
|
+
pass
|
|
98
|
+
|
|
99
|
+
|
|
108
100
|
ApiVersion = Optional[str]
|
|
109
101
|
|
|
110
102
|
logger = sky_logging.init_logger(__name__)
|
|
111
103
|
|
|
112
104
|
hinted_for_server_install_version_mismatch = False
|
|
113
105
|
|
|
106
|
+
crypt_ctx = passlib_context.CryptContext([
|
|
107
|
+
'bcrypt', 'sha256_crypt', 'sha512_crypt', 'des_crypt', 'apr_md5_crypt',
|
|
108
|
+
'ldap_sha1'
|
|
109
|
+
])
|
|
110
|
+
|
|
114
111
|
|
|
115
112
|
class ApiServerStatus(enum.Enum):
|
|
116
113
|
HEALTHY = 'healthy'
|
|
117
114
|
UNHEALTHY = 'unhealthy'
|
|
118
115
|
VERSION_MISMATCH = 'version_mismatch'
|
|
116
|
+
NEEDS_AUTH = 'needs_auth'
|
|
119
117
|
|
|
120
118
|
|
|
121
119
|
@dataclasses.dataclass
|
|
@@ -125,20 +123,209 @@ class ApiServerInfo:
|
|
|
125
123
|
version: Optional[str] = None
|
|
126
124
|
version_on_disk: Optional[str] = None
|
|
127
125
|
commit: Optional[str] = None
|
|
126
|
+
user: Optional[Dict[str, Any]] = None
|
|
127
|
+
basic_auth_enabled: bool = False
|
|
128
|
+
error: Optional[str] = None
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def get_api_cookie_jar_path() -> pathlib.Path:
|
|
132
|
+
"""Returns the Path to the API cookie jar file."""
|
|
133
|
+
return pathlib.Path(
|
|
134
|
+
os.environ.get(server_constants.API_COOKIE_FILE_ENV_VAR,
|
|
135
|
+
server_constants.API_COOKIE_FILE_DEFAULT_LOCATION)
|
|
136
|
+
).expanduser().resolve()
|
|
128
137
|
|
|
129
138
|
|
|
130
139
|
def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
|
|
131
140
|
"""Returns the cookie jar used by the client to access the API server."""
|
|
132
|
-
cookie_file = os.environ.get(server_constants.API_COOKIE_FILE_ENV_VAR)
|
|
133
141
|
cookie_jar = requests.cookies.RequestsCookieJar()
|
|
134
|
-
|
|
135
|
-
|
|
142
|
+
cookie_path = get_api_cookie_jar_path()
|
|
143
|
+
if cookie_path.exists():
|
|
136
144
|
file_cookie_jar = MozillaCookieJar(cookie_path)
|
|
137
145
|
file_cookie_jar.load()
|
|
138
146
|
cookie_jar.update(file_cookie_jar)
|
|
139
147
|
return cookie_jar
|
|
140
148
|
|
|
141
149
|
|
|
150
|
+
def set_api_cookie_jar(cookie_jar: CookieJar,
|
|
151
|
+
create_if_not_exists: bool = True) -> None:
|
|
152
|
+
"""Updates the file cookie jar with the given cookie jar."""
|
|
153
|
+
if len(cookie_jar) == 0:
|
|
154
|
+
return
|
|
155
|
+
cookie_path = get_api_cookie_jar_path()
|
|
156
|
+
if not cookie_path.exists() and not create_if_not_exists:
|
|
157
|
+
# if the file doesn't exist and we don't want to create it, do nothing
|
|
158
|
+
return
|
|
159
|
+
if not cookie_path.parent.exists():
|
|
160
|
+
cookie_path.parent.mkdir(parents=True, exist_ok=True)
|
|
161
|
+
|
|
162
|
+
# Writing directly to the cookie jar path can race with other processes that
|
|
163
|
+
# are reading the cookie jar, making it look malformed. Instead, write to a
|
|
164
|
+
# temporary file and then move it to the final location.
|
|
165
|
+
# Avoid hardcoding the tmp file path, since it could cause a race with other
|
|
166
|
+
# processes that are also writing to the tmp file.
|
|
167
|
+
with tempfile.NamedTemporaryFile(dir=cookie_path.parent,
|
|
168
|
+
delete=False) as tmp_file:
|
|
169
|
+
tmp_cookie_path = tmp_file.name
|
|
170
|
+
file_cookie_jar = MozillaCookieJar(tmp_cookie_path)
|
|
171
|
+
if cookie_path.exists():
|
|
172
|
+
file_cookie_jar.load(str(cookie_path))
|
|
173
|
+
|
|
174
|
+
for cookie in cookie_jar:
|
|
175
|
+
file_cookie_jar.set_cookie(cookie)
|
|
176
|
+
file_cookie_jar.save()
|
|
177
|
+
|
|
178
|
+
# Move the temporary file to the final location.
|
|
179
|
+
os.replace(tmp_cookie_path, cookie_path)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def get_cookies_from_response(
|
|
183
|
+
response: 'requests.Response') -> requests.cookies.RequestsCookieJar:
|
|
184
|
+
"""Returns the cookies from the API server response."""
|
|
185
|
+
server_url = get_server_url()
|
|
186
|
+
cookies = response.cookies
|
|
187
|
+
for prev_resp in response.history:
|
|
188
|
+
for cookie in prev_resp.cookies:
|
|
189
|
+
if cookie.domain in server_url:
|
|
190
|
+
cookies.set_cookie(cookie)
|
|
191
|
+
return cookies
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _prepare_authenticated_request_params(
|
|
195
|
+
path: str,
|
|
196
|
+
server_url: Optional[str] = None,
|
|
197
|
+
**kwargs) -> Tuple[str, Dict[str, Any]]:
|
|
198
|
+
"""Prepare common parameters for authenticated requests (sync or async).
|
|
199
|
+
|
|
200
|
+
Returns:
|
|
201
|
+
Tuple of (url, updated_kwargs)
|
|
202
|
+
"""
|
|
203
|
+
if server_url is None:
|
|
204
|
+
server_url = get_server_url()
|
|
205
|
+
|
|
206
|
+
# Prepare headers and URL for service account authentication
|
|
207
|
+
headers = service_account_auth.get_service_account_headers()
|
|
208
|
+
|
|
209
|
+
# Merge with existing headers
|
|
210
|
+
if 'headers' in kwargs:
|
|
211
|
+
headers.update(kwargs['headers'])
|
|
212
|
+
kwargs['headers'] = headers
|
|
213
|
+
|
|
214
|
+
# Always use the same URL regardless of authentication type
|
|
215
|
+
# OAuth2 proxy will handle authentication based on headers
|
|
216
|
+
url = f'{server_url}/{path}' if not path.startswith(
|
|
217
|
+
'/') else f'{server_url}{path}'
|
|
218
|
+
|
|
219
|
+
# Use cookie authentication if no Bearer token present
|
|
220
|
+
if not headers.get('Authorization') and 'cookies' not in kwargs:
|
|
221
|
+
kwargs['cookies'] = get_api_cookie_jar()
|
|
222
|
+
|
|
223
|
+
return url, kwargs
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _convert_requests_cookies_to_aiohttp(
|
|
227
|
+
cookie_jar: requests.cookies.RequestsCookieJar) -> Dict[str, str]:
|
|
228
|
+
"""Convert requests cookie jar to aiohttp-compatible dict format."""
|
|
229
|
+
cookies = {}
|
|
230
|
+
for cookie in cookie_jar:
|
|
231
|
+
cookies[cookie.name] = cookie.value
|
|
232
|
+
return cookies # type: ignore
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def make_authenticated_request(method: str,
|
|
236
|
+
path: str,
|
|
237
|
+
server_url: Optional[str] = None,
|
|
238
|
+
retry: bool = True,
|
|
239
|
+
**kwargs) -> 'requests.Response':
|
|
240
|
+
"""Make an authenticated HTTP request to the API server.
|
|
241
|
+
|
|
242
|
+
Automatically handles service account token authentication or cookie-based
|
|
243
|
+
authentication based on what's available.
|
|
244
|
+
|
|
245
|
+
Args:
|
|
246
|
+
method: HTTP method (GET, POST, etc.)
|
|
247
|
+
path: API path (e.g., '/api/v1/status')
|
|
248
|
+
server_url: Server URL, defaults to configured server
|
|
249
|
+
retry: Whether to retry on transient errors
|
|
250
|
+
**kwargs: Additional arguments to pass to requests
|
|
251
|
+
|
|
252
|
+
Returns:
|
|
253
|
+
requests.Response object
|
|
254
|
+
"""
|
|
255
|
+
url, kwargs = _prepare_authenticated_request_params(path, server_url,
|
|
256
|
+
**kwargs)
|
|
257
|
+
|
|
258
|
+
# Make the request
|
|
259
|
+
if retry:
|
|
260
|
+
return rest.request(method, url, **kwargs)
|
|
261
|
+
else:
|
|
262
|
+
assert method == 'GET', 'Only GET requests can be done without retry'
|
|
263
|
+
return rest.request_without_retry(method, url, **kwargs)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
async def make_authenticated_request_async(
|
|
267
|
+
session: 'aiohttp.ClientSession',
|
|
268
|
+
method: str,
|
|
269
|
+
path: str,
|
|
270
|
+
server_url: Optional[str] = None,
|
|
271
|
+
retry: bool = True,
|
|
272
|
+
**kwargs) -> 'aiohttp.ClientResponse':
|
|
273
|
+
"""Make an authenticated async HTTP request to the API server using aiohttp.
|
|
274
|
+
|
|
275
|
+
Automatically handles service account token authentication or cookie-based
|
|
276
|
+
authentication based on what's available.
|
|
277
|
+
|
|
278
|
+
Example usage:
|
|
279
|
+
async with aiohttp.ClientSession() as session:
|
|
280
|
+
response = await make_authenticated_request_async(
|
|
281
|
+
session, 'GET', '/api/v1/status')
|
|
282
|
+
data = await response.json()
|
|
283
|
+
|
|
284
|
+
Args:
|
|
285
|
+
session: aiohttp ClientSession to use for the request
|
|
286
|
+
method: HTTP method (GET, POST, etc.)
|
|
287
|
+
path: API path (e.g., '/api/v1/status')
|
|
288
|
+
server_url: Server URL, defaults to configured server
|
|
289
|
+
retry: Whether to retry on transient errors
|
|
290
|
+
**kwargs: Additional arguments to pass to aiohttp
|
|
291
|
+
|
|
292
|
+
Returns:
|
|
293
|
+
aiohttp.ClientResponse object
|
|
294
|
+
|
|
295
|
+
Raises:
|
|
296
|
+
aiohttp.ClientError: For HTTP-related errors
|
|
297
|
+
exceptions.ServerTemporarilyUnavailableError: When server returns 503
|
|
298
|
+
exceptions.RequestInterruptedError: When request is interrupted
|
|
299
|
+
"""
|
|
300
|
+
url, kwargs = _prepare_authenticated_request_params(path, server_url,
|
|
301
|
+
**kwargs)
|
|
302
|
+
|
|
303
|
+
# Convert cookies to aiohttp format if needed
|
|
304
|
+
if 'cookies' in kwargs and isinstance(kwargs['cookies'],
|
|
305
|
+
requests.cookies.RequestsCookieJar):
|
|
306
|
+
kwargs['cookies'] = _convert_requests_cookies_to_aiohttp(
|
|
307
|
+
kwargs['cookies'])
|
|
308
|
+
|
|
309
|
+
# Convert params to strings for aiohttp compatibility
|
|
310
|
+
if 'params' in kwargs and kwargs['params'] is not None:
|
|
311
|
+
normalized_params = {}
|
|
312
|
+
for key, value in kwargs['params'].items():
|
|
313
|
+
if isinstance(value, bool):
|
|
314
|
+
normalized_params[key] = str(value).lower()
|
|
315
|
+
elif value is not None:
|
|
316
|
+
normalized_params[key] = str(value)
|
|
317
|
+
# Skip None values
|
|
318
|
+
kwargs['params'] = normalized_params
|
|
319
|
+
|
|
320
|
+
# Make the request
|
|
321
|
+
if retry:
|
|
322
|
+
return await rest.request_async(session, method, url, **kwargs)
|
|
323
|
+
else:
|
|
324
|
+
assert method == 'GET', 'Only GET requests can be done without retry'
|
|
325
|
+
return await rest.request_without_retry_async(session, method, url,
|
|
326
|
+
**kwargs)
|
|
327
|
+
|
|
328
|
+
|
|
142
329
|
@annotations.lru_cache(scope='global')
|
|
143
330
|
def get_server_url(host: Optional[str] = None) -> str:
|
|
144
331
|
endpoint = DEFAULT_SERVER_URL
|
|
@@ -152,27 +339,42 @@ def get_server_url(host: Optional[str] = None) -> str:
|
|
|
152
339
|
|
|
153
340
|
|
|
154
341
|
@annotations.lru_cache(scope='global')
|
|
155
|
-
def get_dashboard_url(server_url: str
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
dashboard_url = f'{parsed.scheme}://{parsed.hostname}'
|
|
163
|
-
if parsed.port:
|
|
164
|
-
dashboard_url = f'{dashboard_url}:{parsed.port}'
|
|
165
|
-
if parsed.path:
|
|
166
|
-
dashboard_url = f'{dashboard_url}{parsed.path}'
|
|
167
|
-
dashboard_url = dashboard_url.rstrip('/')
|
|
168
|
-
return f'{dashboard_url}/dashboard'
|
|
342
|
+
def get_dashboard_url(server_url: str,
|
|
343
|
+
starting_page: Optional[str] = None) -> str:
|
|
344
|
+
dashboard_url = server_url.rstrip('/')
|
|
345
|
+
dashboard_url = f'{dashboard_url}/dashboard'
|
|
346
|
+
if starting_page:
|
|
347
|
+
dashboard_url = f'{dashboard_url}/{starting_page}'
|
|
348
|
+
return dashboard_url
|
|
169
349
|
|
|
170
350
|
|
|
171
351
|
@annotations.lru_cache(scope='global')
|
|
172
|
-
def is_api_server_local():
|
|
173
|
-
|
|
352
|
+
def is_api_server_local(endpoint: Optional[str] = None):
|
|
353
|
+
server_url = endpoint if endpoint is not None else get_server_url()
|
|
354
|
+
return server_url in AVAILABLE_LOCAL_API_SERVER_URLS
|
|
174
355
|
|
|
175
356
|
|
|
357
|
+
def _handle_non_200_server_status(
|
|
358
|
+
response: 'requests.Response') -> ApiServerInfo:
|
|
359
|
+
if response.status_code == 401:
|
|
360
|
+
return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
|
|
361
|
+
if response.status_code == 400:
|
|
362
|
+
# Check if a version mismatch error is returned.
|
|
363
|
+
try:
|
|
364
|
+
body = response.json()
|
|
365
|
+
if (body.get('error',
|
|
366
|
+
'') == ApiServerStatus.VERSION_MISMATCH.value):
|
|
367
|
+
return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
|
|
368
|
+
error=body.get('message', ''))
|
|
369
|
+
except requests.JSONDecodeError:
|
|
370
|
+
pass
|
|
371
|
+
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
@cachetools.cached(cache=cachetools.TTLCache(maxsize=10,
|
|
375
|
+
ttl=5.0,
|
|
376
|
+
timer=time.time),
|
|
377
|
+
lock=threading.RLock())
|
|
176
378
|
def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
177
379
|
"""Retrieve the status of the API server.
|
|
178
380
|
|
|
@@ -193,35 +395,10 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
|
193
395
|
server_url = endpoint if endpoint is not None else get_server_url()
|
|
194
396
|
while time_out_try_count <= RETRY_COUNT_ON_TIMEOUT:
|
|
195
397
|
try:
|
|
196
|
-
response =
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
try:
|
|
201
|
-
result = response.json()
|
|
202
|
-
api_version = result.get('api_version')
|
|
203
|
-
version = result.get('version')
|
|
204
|
-
version_on_disk = result.get('version_on_disk')
|
|
205
|
-
commit = result.get('commit')
|
|
206
|
-
server_info = ApiServerInfo(status=ApiServerStatus.HEALTHY,
|
|
207
|
-
api_version=api_version,
|
|
208
|
-
version=version,
|
|
209
|
-
version_on_disk=version_on_disk,
|
|
210
|
-
commit=commit)
|
|
211
|
-
if api_version is None or version is None or commit is None:
|
|
212
|
-
logger.warning(f'API server response missing '
|
|
213
|
-
f'version info. {server_url} may '
|
|
214
|
-
f'not be running SkyPilot API server.')
|
|
215
|
-
server_info.status = ApiServerStatus.UNHEALTHY
|
|
216
|
-
elif api_version != server_constants.API_VERSION:
|
|
217
|
-
server_info.status = ApiServerStatus.VERSION_MISMATCH
|
|
218
|
-
return server_info
|
|
219
|
-
except (json.JSONDecodeError, AttributeError) as e:
|
|
220
|
-
logger.warning('Failed to parse API server response: '
|
|
221
|
-
f'{str(e)}')
|
|
222
|
-
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
|
223
|
-
else:
|
|
224
|
-
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
|
398
|
+
response = make_authenticated_request('GET',
|
|
399
|
+
'/api/health',
|
|
400
|
+
server_url=server_url,
|
|
401
|
+
timeout=2.5)
|
|
225
402
|
except requests.exceptions.Timeout:
|
|
226
403
|
if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
|
|
227
404
|
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
|
@@ -230,38 +407,127 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
|
230
407
|
except requests.exceptions.ConnectionError:
|
|
231
408
|
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
|
232
409
|
|
|
410
|
+
logger.debug(f'Health check status: {response.status_code}')
|
|
411
|
+
|
|
412
|
+
if response.status_code != 200:
|
|
413
|
+
return _handle_non_200_server_status(response)
|
|
414
|
+
|
|
415
|
+
# The response is 200, so we can parse the response.
|
|
416
|
+
try:
|
|
417
|
+
result = response.json()
|
|
418
|
+
server_status = result.get('status')
|
|
419
|
+
api_version = result.get('api_version')
|
|
420
|
+
version = result.get('version')
|
|
421
|
+
version_on_disk = result.get('version_on_disk')
|
|
422
|
+
commit = result.get('commit')
|
|
423
|
+
user = result.get('user')
|
|
424
|
+
basic_auth_enabled = result.get('basic_auth_enabled')
|
|
425
|
+
server_info = ApiServerInfo(status=ApiServerStatus(server_status),
|
|
426
|
+
api_version=api_version,
|
|
427
|
+
version=version,
|
|
428
|
+
version_on_disk=version_on_disk,
|
|
429
|
+
commit=commit,
|
|
430
|
+
user=user,
|
|
431
|
+
basic_auth_enabled=basic_auth_enabled)
|
|
432
|
+
if api_version is None or version is None or commit is None:
|
|
433
|
+
logger.warning(f'API server response missing '
|
|
434
|
+
f'version info. {server_url} may '
|
|
435
|
+
f'not be running SkyPilot API server.')
|
|
436
|
+
server_info.status = ApiServerStatus.UNHEALTHY
|
|
437
|
+
version_info = versions.check_compatibility_at_client(
|
|
438
|
+
response.headers)
|
|
439
|
+
if version_info is None:
|
|
440
|
+
# Backward compatibility for server prior to v0.11.0 which
|
|
441
|
+
# does not check compatibility at server side.
|
|
442
|
+
# TODO(aylei): remove this after v0.13.0 is released.
|
|
443
|
+
return ApiServerInfo(
|
|
444
|
+
status=ApiServerStatus.VERSION_MISMATCH,
|
|
445
|
+
error=versions.SERVER_TOO_OLD_ERROR.format(
|
|
446
|
+
remote_version=version,
|
|
447
|
+
local_version=versions.get_local_readable_version(),
|
|
448
|
+
min_version=server_constants.MIN_COMPATIBLE_VERSION,
|
|
449
|
+
command=versions.install_version_command(
|
|
450
|
+
version, commit)))
|
|
451
|
+
if version_info.error is not None:
|
|
452
|
+
return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
|
|
453
|
+
error=version_info.error)
|
|
454
|
+
|
|
455
|
+
cookies = get_cookies_from_response(response)
|
|
456
|
+
# Save or refresh the cookie jar in case of session affinity and
|
|
457
|
+
# OAuth.
|
|
458
|
+
set_api_cookie_jar(cookies, create_if_not_exists=True)
|
|
459
|
+
return server_info
|
|
460
|
+
except (requests.JSONDecodeError, AttributeError) as e:
|
|
461
|
+
# Try to check if we got redirected to a login page.
|
|
462
|
+
for prev_response in response.history:
|
|
463
|
+
logger.debug(f'Previous response: {prev_response.url}')
|
|
464
|
+
# Heuristic: check if the url looks like a login page or
|
|
465
|
+
# oauth flow.
|
|
466
|
+
if any(key in prev_response.url for key in ['login', 'oauth2']):
|
|
467
|
+
logger.debug(f'URL {prev_response.url} looks like '
|
|
468
|
+
'a login page or oauth flow, so try to '
|
|
469
|
+
'get the cookie.')
|
|
470
|
+
return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
|
|
471
|
+
logger.warning('Failed to parse API server response: '
|
|
472
|
+
f'{str(e)}')
|
|
473
|
+
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
|
474
|
+
|
|
233
475
|
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
|
234
476
|
|
|
235
477
|
|
|
236
478
|
def handle_request_error(response: 'requests.Response') -> None:
|
|
479
|
+
# Keep the original HTTPError if the response code >= 400
|
|
480
|
+
response.raise_for_status()
|
|
481
|
+
|
|
482
|
+
# Other status codes are not expected neither, e.g. we do not expect to
|
|
483
|
+
# handle redirection here.
|
|
237
484
|
if response.status_code != 200:
|
|
238
485
|
with ux_utils.print_exception_no_traceback():
|
|
239
486
|
raise RuntimeError(
|
|
240
487
|
'Failed to process response from SkyPilot API server at '
|
|
241
|
-
f'{
|
|
488
|
+
f'{response.url}. '
|
|
242
489
|
f'Response: {response.status_code} '
|
|
243
490
|
f'{response.text}')
|
|
244
491
|
|
|
245
492
|
|
|
246
|
-
def get_request_id(response: 'requests.Response') -> RequestId:
|
|
493
|
+
def get_request_id(response: 'requests.Response') -> RequestId[T]:
|
|
247
494
|
handle_request_error(response)
|
|
248
|
-
request_id = response.headers.get('X-Request-ID')
|
|
495
|
+
request_id = response.headers.get('X-Skypilot-Request-ID')
|
|
496
|
+
if request_id is None:
|
|
497
|
+
request_id = response.headers.get('X-Request-ID')
|
|
249
498
|
if request_id is None:
|
|
250
499
|
with ux_utils.print_exception_no_traceback():
|
|
251
500
|
raise RuntimeError(
|
|
252
501
|
'Failed to get request ID from SkyPilot API server at '
|
|
253
502
|
f'{get_server_url()}. Response: {response.status_code} '
|
|
254
503
|
f'{response.text}')
|
|
255
|
-
return request_id
|
|
504
|
+
return RequestId[T](request_id)
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def get_stream_request_id(
|
|
508
|
+
response: 'requests.Response') -> Optional[RequestId[T]]:
|
|
509
|
+
"""This is same as the above function, but just for `sdk.stream_and_get.
|
|
510
|
+
We do this because `/api/stream` may choose the latest request id, and
|
|
511
|
+
we need to keep track of that information. Request id in this case can
|
|
512
|
+
be None."""
|
|
513
|
+
handle_request_error(response)
|
|
514
|
+
request_id = response.headers.get(server_constants.STREAM_REQUEST_HEADER)
|
|
515
|
+
if request_id is not None:
|
|
516
|
+
return RequestId[T](request_id)
|
|
517
|
+
return None
|
|
256
518
|
|
|
257
519
|
|
|
258
520
|
def _start_api_server(deploy: bool = False,
|
|
259
521
|
host: str = '127.0.0.1',
|
|
260
|
-
foreground: bool = False
|
|
522
|
+
foreground: bool = False,
|
|
523
|
+
metrics: bool = False,
|
|
524
|
+
metrics_port: Optional[int] = None,
|
|
525
|
+
enable_basic_auth: bool = False):
|
|
261
526
|
"""Starts a SkyPilot API server locally."""
|
|
262
527
|
server_url = get_server_url(host)
|
|
263
528
|
assert server_url in AVAILABLE_LOCAL_API_SERVER_URLS, (
|
|
264
529
|
f'server url {server_url} is not a local url')
|
|
530
|
+
|
|
265
531
|
with rich_utils.client_status('Starting SkyPilot API server, '
|
|
266
532
|
f'view logs at {constants.API_SERVER_LOGS}'):
|
|
267
533
|
logger.info(f'{colorama.Style.DIM}Failed to connect to '
|
|
@@ -273,40 +539,71 @@ def _start_api_server(deploy: bool = False,
|
|
|
273
539
|
'is not a local URL')
|
|
274
540
|
|
|
275
541
|
# Check available memory before starting the server.
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
542
|
+
# Skip this warning if postgres is used, as:
|
|
543
|
+
# 1) that's almost certainly a remote API server;
|
|
544
|
+
# 2) the actual consolidation mode config is stashed in the database,
|
|
545
|
+
# and the value of `job_utils.is_consolidation_mode` will not be
|
|
546
|
+
# the actual value in the db, but only None as in this case, the
|
|
547
|
+
# whole YAML config is really just `db: <URI>`.
|
|
548
|
+
if skypilot_config.get_nested(('db',), None) is None:
|
|
549
|
+
avail_mem_size_gb: float = common_utils.get_mem_size_gb()
|
|
550
|
+
# pylint: disable=import-outside-toplevel
|
|
551
|
+
import sky.jobs.utils as job_utils
|
|
552
|
+
max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
|
|
553
|
+
if job_utils.is_consolidation_mode(
|
|
554
|
+
on_api_restart=True) else
|
|
555
|
+
server_constants.MIN_AVAIL_MEM_GB)
|
|
556
|
+
if avail_mem_size_gb <= max_memory:
|
|
557
|
+
logger.warning(
|
|
558
|
+
f'{colorama.Fore.YELLOW}Your SkyPilot API server machine '
|
|
559
|
+
f'only has {avail_mem_size_gb:.1f}GB memory available. '
|
|
560
|
+
f'At least {max_memory}GB is recommended to support higher '
|
|
561
|
+
'load with better performance.'
|
|
562
|
+
f'{colorama.Style.RESET_ALL}')
|
|
284
563
|
|
|
285
564
|
args = [sys.executable, *API_SERVER_CMD.split()]
|
|
286
565
|
if deploy:
|
|
287
566
|
args += ['--deploy']
|
|
288
567
|
if host is not None:
|
|
289
568
|
args += [f'--host={host}']
|
|
569
|
+
if metrics_port is not None:
|
|
570
|
+
args += [f'--metrics-port={metrics_port}']
|
|
290
571
|
|
|
291
572
|
if foreground:
|
|
292
573
|
# Replaces the current process with the API server
|
|
293
574
|
os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
|
575
|
+
_set_metrics_env_var(os.environ, metrics, deploy)
|
|
576
|
+
if enable_basic_auth:
|
|
577
|
+
os.environ[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
|
|
294
578
|
os.execvp(args[0], args)
|
|
295
579
|
|
|
296
580
|
log_path = os.path.expanduser(constants.API_SERVER_LOGS)
|
|
297
581
|
os.makedirs(os.path.dirname(log_path), exist_ok=True)
|
|
298
|
-
cmd = f'{" ".join(args)} > {log_path} 2>&1 < /dev/null'
|
|
299
582
|
|
|
583
|
+
# For spawn mode, copy the environ to avoid polluting the SDK process.
|
|
584
|
+
server_env = os.environ.copy()
|
|
585
|
+
server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
|
300
586
|
# Start the API server process in the background and don't wait for it.
|
|
301
587
|
# If this is called from a CLI invocation, we need
|
|
302
588
|
# start_new_session=True so that SIGINT on the CLI will not also kill
|
|
303
589
|
# the API server.
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
590
|
+
if enable_basic_auth:
|
|
591
|
+
server_env[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
|
|
592
|
+
_set_metrics_env_var(server_env, metrics, deploy)
|
|
593
|
+
with open(log_path, 'w', encoding='utf-8') as log_file:
|
|
594
|
+
# Because the log file is opened using a with statement, it may seem
|
|
595
|
+
# that the file will be closed when the with statement is exited
|
|
596
|
+
# causing the child process to be unable to write to the log file.
|
|
597
|
+
# However, Popen makes the file descriptor inheritable which means
|
|
598
|
+
# the child process will inherit its own copy of the fd,
|
|
599
|
+
# independent of the parent's fd table which enables to child
|
|
600
|
+
# process to continue writing to the log file.
|
|
601
|
+
proc = subprocess.Popen(args,
|
|
602
|
+
stdout=log_file,
|
|
603
|
+
stderr=subprocess.STDOUT,
|
|
604
|
+
stdin=subprocess.DEVNULL,
|
|
605
|
+
start_new_session=True,
|
|
606
|
+
env=server_env)
|
|
310
607
|
|
|
311
608
|
start_time = time.time()
|
|
312
609
|
while True:
|
|
@@ -317,6 +614,8 @@ def _start_api_server(deploy: bool = False,
|
|
|
317
614
|
'SkyPilot API server process exited unexpectedly.\n'
|
|
318
615
|
f'View logs at: {constants.API_SERVER_LOGS}')
|
|
319
616
|
try:
|
|
617
|
+
# Clear the cache to ensure fresh checks during startup
|
|
618
|
+
get_api_server_status.cache_clear() # type: ignore
|
|
320
619
|
check_server_healthy()
|
|
321
620
|
except exceptions.APIVersionMismatchError:
|
|
322
621
|
raise
|
|
@@ -335,7 +634,7 @@ def _start_api_server(deploy: bool = False,
|
|
|
335
634
|
server_url = get_server_url(host)
|
|
336
635
|
dashboard_msg = ''
|
|
337
636
|
api_server_info = get_api_server_status(server_url)
|
|
338
|
-
if api_server_info.version ==
|
|
637
|
+
if api_server_info.version == versions.DEV_VERSION:
|
|
339
638
|
dashboard_msg += (
|
|
340
639
|
f'\n{colorama.Style.RESET_ALL}{ux_utils.INDENT_SYMBOL}'
|
|
341
640
|
f'{colorama.Fore.YELLOW}')
|
|
@@ -348,17 +647,40 @@ def _start_api_server(deploy: bool = False,
|
|
|
348
647
|
dashboard_msg += (
|
|
349
648
|
'Dashboard may be stale when installed from source, '
|
|
350
649
|
'to rebuild: npm --prefix sky/dashboard install '
|
|
351
|
-
'&& npm --prefix sky/dashboard run build
|
|
352
|
-
dashboard_msg += (
|
|
353
|
-
f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
|
|
354
|
-
f'Dashboard: {get_dashboard_url(server_url)}')
|
|
355
|
-
dashboard_msg += f'{colorama.Style.RESET_ALL}'
|
|
650
|
+
'&& npm --prefix sky/dashboard run build')
|
|
356
651
|
logger.info(
|
|
357
652
|
ux_utils.finishing_message(
|
|
358
653
|
f'SkyPilot API server started. {dashboard_msg}'))
|
|
359
654
|
|
|
360
655
|
|
|
361
|
-
def
|
|
656
|
+
def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
|
|
657
|
+
deploy: bool):
|
|
658
|
+
"""Sets the metrics environment variables.
|
|
659
|
+
|
|
660
|
+
Args:
|
|
661
|
+
env: The environment variables to set.
|
|
662
|
+
metrics: Whether to enable metrics.
|
|
663
|
+
deploy: Whether the server is running in deploy mode, which means
|
|
664
|
+
multiple processes might be running.
|
|
665
|
+
"""
|
|
666
|
+
del deploy
|
|
667
|
+
if metrics or os.getenv(constants.ENV_VAR_SERVER_METRICS_ENABLED) == 'true':
|
|
668
|
+
env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
|
|
669
|
+
# Always set the metrics dir since we need to collect metrics from
|
|
670
|
+
# subprocesses like the executor.
|
|
671
|
+
metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
|
|
672
|
+
shutil.rmtree(metrics_dir, ignore_errors=True)
|
|
673
|
+
os.makedirs(metrics_dir, exist_ok=True)
|
|
674
|
+
# Refer to https://prometheus.github.io/client_python/multiprocess/
|
|
675
|
+
env['PROMETHEUS_MULTIPROC_DIR'] = metrics_dir
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
def check_server_healthy(
|
|
679
|
+
endpoint: Optional[str] = None
|
|
680
|
+
) -> Tuple[Literal[
|
|
681
|
+
# Use an incomplete list of Literals here to enforce raising for other
|
|
682
|
+
# enum values.
|
|
683
|
+
ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH], ApiServerInfo]:
|
|
362
684
|
"""Check if the API server is healthy.
|
|
363
685
|
|
|
364
686
|
Args:
|
|
@@ -368,38 +690,21 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
|
|
|
368
690
|
Raises:
|
|
369
691
|
RuntimeError: If the server is not healthy or the client version does
|
|
370
692
|
not match the server version.
|
|
693
|
+
|
|
694
|
+
Returns:
|
|
695
|
+
ApiServerStatus: The status of the API server, unless the server is
|
|
696
|
+
unhealthy or the client version does not match the server version,
|
|
697
|
+
in which case an exception is raised.
|
|
371
698
|
"""
|
|
372
699
|
endpoint = endpoint if endpoint is not None else get_server_url()
|
|
373
700
|
api_server_info = get_api_server_status(endpoint)
|
|
374
701
|
api_server_status = api_server_info.status
|
|
375
702
|
if api_server_status == ApiServerStatus.VERSION_MISMATCH:
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
try:
|
|
379
|
-
server_is_older = int(sv) < _LOCAL_API_VERSION
|
|
380
|
-
except ValueError:
|
|
381
|
-
# Raised when the server version using an unknown scheme.
|
|
382
|
-
# Version compatibility checking is expected to handle all legacy
|
|
383
|
-
# cases so we safely assume the server is newer when the version
|
|
384
|
-
# scheme is unknown.
|
|
385
|
-
logger.debug('API server version using unknown scheme: %s', sv)
|
|
386
|
-
server_is_older = False
|
|
387
|
-
version_info = _get_version_info_hint(api_server_info)
|
|
388
|
-
if is_api_server_local():
|
|
703
|
+
msg = api_server_info.error
|
|
704
|
+
if is_api_server_local(endpoint):
|
|
389
705
|
# For local server, just hint user to restart the server to get
|
|
390
706
|
# a consistent version.
|
|
391
|
-
msg =
|
|
392
|
-
version_info=version_info)
|
|
393
|
-
else:
|
|
394
|
-
assert api_server_info.version is not None, 'Server version is None'
|
|
395
|
-
if server_is_older:
|
|
396
|
-
msg = _REMOTE_SERVER_TOO_OLD_WARNING.format(
|
|
397
|
-
version_info=version_info,
|
|
398
|
-
command=_install_server_version_command(api_server_info))
|
|
399
|
-
else:
|
|
400
|
-
msg = _CLIENT_TOO_OLD_WARNING.format(
|
|
401
|
-
version_info=version_info,
|
|
402
|
-
command=_install_server_version_command(api_server_info))
|
|
707
|
+
msg = _LOCAL_API_SERVER_RESTART_HINT
|
|
403
708
|
with ux_utils.print_exception_no_traceback():
|
|
404
709
|
raise exceptions.APIVersionMismatchError(msg)
|
|
405
710
|
elif api_server_status == ApiServerStatus.UNHEALTHY:
|
|
@@ -430,36 +735,7 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
|
|
|
430
735
|
|
|
431
736
|
hinted_for_server_install_version_mismatch = True
|
|
432
737
|
|
|
433
|
-
|
|
434
|
-
def _get_version_info_hint(server_info: ApiServerInfo) -> str:
|
|
435
|
-
assert server_info.version is not None, 'Server version is None'
|
|
436
|
-
# version_on_disk may be None if the server is older
|
|
437
|
-
assert server_info.commit is not None, 'Server commit is None'
|
|
438
|
-
sv = server_info.version
|
|
439
|
-
cv = sky.__version__
|
|
440
|
-
if server_info.version == _DEV_VERSION:
|
|
441
|
-
sv = f'{sv} with commit {server_info.commit}'
|
|
442
|
-
if cv == _DEV_VERSION:
|
|
443
|
-
cv = f'{cv} with commit {sky.__commit__}'
|
|
444
|
-
return _VERSION_INFO.format(client_version=cv,
|
|
445
|
-
server_version=sv,
|
|
446
|
-
client_api_version=server_constants.API_VERSION,
|
|
447
|
-
server_api_version=server_info.api_version)
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
def _install_server_version_command(server_info: ApiServerInfo) -> str:
|
|
451
|
-
assert server_info.version is not None, 'Server version is None'
|
|
452
|
-
assert server_info.commit is not None, 'Server commit is None'
|
|
453
|
-
if server_info.version == _DEV_VERSION:
|
|
454
|
-
# Dev build without valid version.
|
|
455
|
-
return ('pip install git+https://github.com/skypilot-org/skypilot@'
|
|
456
|
-
f'{server_info.commit}')
|
|
457
|
-
elif 'dev' in server_info.version:
|
|
458
|
-
# Nightly version.
|
|
459
|
-
return f'pip install -U "skypilot-nightly=={server_info.version}"'
|
|
460
|
-
else:
|
|
461
|
-
# Stable version.
|
|
462
|
-
return f'pip install -U "skypilot=={server_info.version}"'
|
|
738
|
+
return api_server_status, api_server_info
|
|
463
739
|
|
|
464
740
|
|
|
465
741
|
# Keep in sync with sky/setup_files/setup.py find_version()
|
|
@@ -479,9 +755,17 @@ def get_skypilot_version_on_disk() -> str:
|
|
|
479
755
|
|
|
480
756
|
def check_server_healthy_or_start_fn(deploy: bool = False,
|
|
481
757
|
host: str = '127.0.0.1',
|
|
482
|
-
foreground: bool = False
|
|
758
|
+
foreground: bool = False,
|
|
759
|
+
metrics: bool = False,
|
|
760
|
+
metrics_port: Optional[int] = None,
|
|
761
|
+
enable_basic_auth: bool = False):
|
|
762
|
+
api_server_status = None
|
|
483
763
|
try:
|
|
484
|
-
check_server_healthy()
|
|
764
|
+
api_server_status, _ = check_server_healthy()
|
|
765
|
+
if api_server_status == ApiServerStatus.NEEDS_AUTH:
|
|
766
|
+
endpoint = get_server_url()
|
|
767
|
+
with ux_utils.print_exception_no_traceback():
|
|
768
|
+
raise exceptions.ApiServerAuthenticationError(endpoint)
|
|
485
769
|
except exceptions.ApiServerConnectionError as exc:
|
|
486
770
|
endpoint = get_server_url()
|
|
487
771
|
if not is_api_server_local():
|
|
@@ -493,19 +777,21 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
|
|
|
493
777
|
os.path.expanduser(constants.API_SERVER_CREATION_LOCK_PATH)):
|
|
494
778
|
# Check again if server is already running. Other processes may
|
|
495
779
|
# have started the server while we were waiting for the lock.
|
|
780
|
+
get_api_server_status.cache_clear() # type: ignore[attr-defined]
|
|
496
781
|
api_server_info = get_api_server_status(endpoint)
|
|
497
782
|
if api_server_info.status == ApiServerStatus.UNHEALTHY:
|
|
498
|
-
_start_api_server(deploy, host, foreground
|
|
783
|
+
_start_api_server(deploy, host, foreground, metrics,
|
|
784
|
+
metrics_port, enable_basic_auth)
|
|
499
785
|
|
|
500
786
|
|
|
501
|
-
def check_server_healthy_or_start(func):
|
|
787
|
+
def check_server_healthy_or_start(func: Callable[P, T]) -> Callable[P, T]:
|
|
502
788
|
|
|
503
789
|
@functools.wraps(func)
|
|
504
790
|
def wrapper(*args, deploy: bool = False, host: str = '127.0.0.1', **kwargs):
|
|
505
791
|
check_server_healthy_or_start_fn(deploy, host)
|
|
506
792
|
return func(*args, **kwargs)
|
|
507
793
|
|
|
508
|
-
return wrapper
|
|
794
|
+
return cast(Callable[P, T], wrapper)
|
|
509
795
|
|
|
510
796
|
|
|
511
797
|
def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
|
|
@@ -549,20 +835,21 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
|
|
|
549
835
|
return str(client_file_mounts_dir /
|
|
550
836
|
file_mounts_mapping[original_path].lstrip('/'))
|
|
551
837
|
|
|
552
|
-
task_configs =
|
|
838
|
+
task_configs = yaml_utils.read_yaml_all(str(client_task_path))
|
|
553
839
|
for task_config in task_configs:
|
|
554
840
|
if task_config is None:
|
|
555
841
|
continue
|
|
556
|
-
file_mounts_mapping = task_config.
|
|
842
|
+
file_mounts_mapping = task_config.pop('file_mounts_mapping', {})
|
|
557
843
|
if not file_mounts_mapping:
|
|
558
844
|
# We did not mount any files to new paths on the remote server
|
|
559
845
|
# so no need to resolve filepaths.
|
|
560
846
|
continue
|
|
561
847
|
if 'workdir' in task_config:
|
|
562
848
|
workdir = task_config['workdir']
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
849
|
+
if isinstance(workdir, str):
|
|
850
|
+
task_config['workdir'] = str(
|
|
851
|
+
client_file_mounts_dir /
|
|
852
|
+
file_mounts_mapping[workdir].lstrip('/'))
|
|
566
853
|
if workdir_only:
|
|
567
854
|
continue
|
|
568
855
|
if 'file_mounts' in task_config:
|
|
@@ -601,7 +888,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
|
|
|
601
888
|
# We can switch to using string, but this is to make it easier to debug, by
|
|
602
889
|
# persisting the translated task yaml file.
|
|
603
890
|
translated_client_task_path = client_dir / f'{task_id}_translated.yaml'
|
|
604
|
-
|
|
891
|
+
yaml_utils.dump_yaml(str(translated_client_task_path), task_configs)
|
|
605
892
|
|
|
606
893
|
dag = dag_utils.load_chain_dag_from_yaml(str(translated_client_task_path))
|
|
607
894
|
return dag
|
|
@@ -622,25 +909,33 @@ def request_body_to_params(body: 'pydantic.BaseModel') -> Dict[str, Any]:
|
|
|
622
909
|
|
|
623
910
|
def reload_for_new_request(client_entrypoint: Optional[str],
|
|
624
911
|
client_command: Optional[str],
|
|
625
|
-
using_remote_api_server: bool
|
|
626
|
-
|
|
912
|
+
using_remote_api_server: bool, user: 'models.User',
|
|
913
|
+
request_id: str) -> None:
|
|
914
|
+
"""Reload modules, global variables, and usage message for a new request.
|
|
915
|
+
|
|
916
|
+
Must be called within the request's context.
|
|
917
|
+
"""
|
|
627
918
|
# This should be called first to make sure the logger is up-to-date.
|
|
628
919
|
sky_logging.reload_logger()
|
|
629
920
|
|
|
630
921
|
# Reload the skypilot config to make sure the latest config is used.
|
|
631
|
-
|
|
922
|
+
# We don't need to grab the lock here because this function is only
|
|
923
|
+
# run once we are inside the request's context, so there shouldn't
|
|
924
|
+
# be any race conditions when reloading the config.
|
|
925
|
+
skypilot_config.reload_config()
|
|
632
926
|
|
|
633
927
|
# Reset the client entrypoint and command for the usage message.
|
|
634
|
-
common_utils.
|
|
928
|
+
common_utils.set_request_context(
|
|
635
929
|
client_entrypoint=client_entrypoint,
|
|
636
930
|
client_command=client_command,
|
|
637
931
|
using_remote_api_server=using_remote_api_server,
|
|
932
|
+
user=user,
|
|
933
|
+
request_id=request_id,
|
|
638
934
|
)
|
|
639
935
|
|
|
640
936
|
# Clear cache should be called before reload_logger and usage reset,
|
|
641
937
|
# otherwise, the latest env var will not be used.
|
|
642
|
-
|
|
643
|
-
func.cache_clear()
|
|
938
|
+
annotations.clear_request_level_cache()
|
|
644
939
|
|
|
645
940
|
# We need to reset usage message, so that the message is up-to-date with the
|
|
646
941
|
# latest information in the context, e.g. client entrypoint and run id.
|
|
@@ -658,6 +953,7 @@ def clear_local_api_server_database() -> None:
|
|
|
658
953
|
db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
|
|
659
954
|
for extension in ['', '-shm', '-wal']:
|
|
660
955
|
try:
|
|
956
|
+
logger.debug(f'Removing database file {db_path}{extension}')
|
|
661
957
|
os.remove(f'{db_path}{extension}')
|
|
662
958
|
except FileNotFoundError:
|
|
663
959
|
logger.debug(f'Database file {db_path}{extension} not found.')
|