skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/config.py
CHANGED
|
@@ -2,9 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import enum
|
|
5
|
+
from typing import Optional
|
|
5
6
|
|
|
6
7
|
from sky import sky_logging
|
|
7
8
|
from sky.server import constants as server_constants
|
|
9
|
+
from sky.server import daemons
|
|
8
10
|
from sky.utils import common_utils
|
|
9
11
|
|
|
10
12
|
# Constants based on profiling the peak memory usage while serving various
|
|
@@ -18,8 +20,9 @@ from sky.utils import common_utils
|
|
|
18
20
|
# TODO(aylei): maintaining these constants is error-prone, we may need to
|
|
19
21
|
# automatically tune parallelism at runtime according to system usage stats
|
|
20
22
|
# in the future.
|
|
21
|
-
|
|
22
|
-
|
|
23
|
+
# TODO(luca): The future is now! ^^^
|
|
24
|
+
LONG_WORKER_MEM_GB = 0.4
|
|
25
|
+
SHORT_WORKER_MEM_GB = 0.3
|
|
23
26
|
# To control the number of long workers.
|
|
24
27
|
_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
|
|
25
28
|
# Limit the number of long workers of local API server, since local server is
|
|
@@ -34,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
|
|
|
34
37
|
_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
|
|
35
38
|
# Minimal number of long workers to ensure responsiveness.
|
|
36
39
|
_MIN_LONG_WORKERS = 1
|
|
37
|
-
# Minimal number of short workers
|
|
38
|
-
|
|
39
|
-
_MIN_SHORT_WORKERS = 2
|
|
40
|
+
# Minimal number of idle short workers to ensure responsiveness.
|
|
41
|
+
_MIN_IDLE_SHORT_WORKERS = 1
|
|
40
42
|
|
|
41
43
|
# Default number of burstable workers for local API server. A heuristic number
|
|
42
44
|
# that is large enough for most local cases.
|
|
@@ -61,6 +63,7 @@ class QueueBackend(enum.Enum):
|
|
|
61
63
|
class WorkerConfig:
|
|
62
64
|
garanteed_parallelism: int
|
|
63
65
|
burstable_parallelism: int
|
|
66
|
+
num_db_connections_per_worker: int
|
|
64
67
|
|
|
65
68
|
|
|
66
69
|
@dataclasses.dataclass
|
|
@@ -68,10 +71,15 @@ class ServerConfig:
|
|
|
68
71
|
num_server_workers: int
|
|
69
72
|
long_worker_config: WorkerConfig
|
|
70
73
|
short_worker_config: WorkerConfig
|
|
74
|
+
num_db_connections_per_worker: int
|
|
71
75
|
queue_backend: QueueBackend
|
|
72
76
|
|
|
73
77
|
|
|
74
|
-
def compute_server_config(
|
|
78
|
+
def compute_server_config(
|
|
79
|
+
deploy: bool,
|
|
80
|
+
max_db_connections: Optional[int] = None,
|
|
81
|
+
quiet: bool = False,
|
|
82
|
+
reserved_memory_mb: Optional[float] = None) -> ServerConfig:
|
|
75
83
|
"""Compute the server config based on environment.
|
|
76
84
|
|
|
77
85
|
We have different assumptions for the resources in different deployment
|
|
@@ -105,7 +113,11 @@ def compute_server_config(deploy: bool) -> ServerConfig:
|
|
|
105
113
|
process after API server was introduced.
|
|
106
114
|
"""
|
|
107
115
|
cpu_count = common_utils.get_cpu_count()
|
|
116
|
+
logger.debug(f'CPU count: {cpu_count}')
|
|
108
117
|
mem_size_gb = common_utils.get_mem_size_gb()
|
|
118
|
+
if reserved_memory_mb is not None:
|
|
119
|
+
mem_size_gb -= (reserved_memory_mb / 1024)
|
|
120
|
+
logger.debug(f'Memory size: {mem_size_gb}GB')
|
|
109
121
|
max_parallel_for_long = _max_long_worker_parallism(cpu_count,
|
|
110
122
|
mem_size_gb,
|
|
111
123
|
local=not deploy)
|
|
@@ -114,7 +126,17 @@ def compute_server_config(deploy: bool) -> ServerConfig:
|
|
|
114
126
|
queue_backend = QueueBackend.MULTIPROCESSING
|
|
115
127
|
burstable_parallel_for_long = 0
|
|
116
128
|
burstable_parallel_for_short = 0
|
|
129
|
+
# if num_db_connections_per_worker is 0, server will use NullPool
|
|
130
|
+
# to conserve the number of concurrent db connections.
|
|
131
|
+
# This could lead to performance degradation.
|
|
132
|
+
num_db_connections_per_worker = 0
|
|
117
133
|
num_server_workers = cpu_count
|
|
134
|
+
|
|
135
|
+
# +1 for the event loop running the main process
|
|
136
|
+
# and gc daemons in the '__main__' body of sky/server/server.py
|
|
137
|
+
max_parallel_all_workers = (max_parallel_for_long + max_parallel_for_short +
|
|
138
|
+
num_server_workers + 1)
|
|
139
|
+
|
|
118
140
|
if not deploy:
|
|
119
141
|
# For local mode, use local queue backend since we only run 1 uvicorn
|
|
120
142
|
# worker in local mode and no multiprocessing is needed.
|
|
@@ -125,7 +147,12 @@ def compute_server_config(deploy: bool) -> ServerConfig:
|
|
|
125
147
|
burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
|
|
126
148
|
# Runs in low resource mode if the available memory is less than
|
|
127
149
|
# server_constants.MIN_AVAIL_MEM_GB.
|
|
128
|
-
|
|
150
|
+
# pylint: disable=import-outside-toplevel
|
|
151
|
+
import sky.jobs.utils as job_utils
|
|
152
|
+
max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
|
|
153
|
+
if job_utils.is_consolidation_mode() else
|
|
154
|
+
server_constants.MIN_AVAIL_MEM_GB)
|
|
155
|
+
if not deploy and mem_size_gb < max_memory:
|
|
129
156
|
# Permanent worker process may have significant memory consumption
|
|
130
157
|
# (~350MB per worker) after running commands like `sky check`, so we
|
|
131
158
|
# don't start any permanent workers in low resource local mode. This
|
|
@@ -136,24 +163,41 @@ def compute_server_config(deploy: bool) -> ServerConfig:
|
|
|
136
163
|
# permanently because it never exits.
|
|
137
164
|
max_parallel_for_long = 0
|
|
138
165
|
max_parallel_for_short = 0
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
166
|
+
if not quiet:
|
|
167
|
+
logger.warning(
|
|
168
|
+
'SkyPilot API server will run in low resource mode because '
|
|
169
|
+
'the available memory is less than '
|
|
170
|
+
f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
|
|
171
|
+
elif max_db_connections is not None:
|
|
172
|
+
if max_parallel_all_workers > max_db_connections:
|
|
173
|
+
if not quiet:
|
|
174
|
+
logger.warning(
|
|
175
|
+
f'Max parallel all workers ({max_parallel_all_workers}) '
|
|
176
|
+
'is greater than max db connections '
|
|
177
|
+
f'({max_db_connections}). Increase the number of max db '
|
|
178
|
+
f'connections to at least {max_parallel_all_workers} for '
|
|
179
|
+
'optimal performance.')
|
|
180
|
+
else:
|
|
181
|
+
num_db_connections_per_worker = 1
|
|
182
|
+
|
|
183
|
+
if not quiet:
|
|
184
|
+
logger.info(
|
|
185
|
+
f'SkyPilot API server will start {num_server_workers} server '
|
|
186
|
+
f'processes with {max_parallel_for_long} background workers for '
|
|
187
|
+
f'long requests and will allow at max {max_parallel_for_short} '
|
|
188
|
+
'short requests in parallel.')
|
|
148
189
|
return ServerConfig(
|
|
149
190
|
num_server_workers=num_server_workers,
|
|
150
191
|
queue_backend=queue_backend,
|
|
151
192
|
long_worker_config=WorkerConfig(
|
|
152
193
|
garanteed_parallelism=max_parallel_for_long,
|
|
153
|
-
burstable_parallelism=burstable_parallel_for_long
|
|
194
|
+
burstable_parallelism=burstable_parallel_for_long,
|
|
195
|
+
num_db_connections_per_worker=num_db_connections_per_worker),
|
|
154
196
|
short_worker_config=WorkerConfig(
|
|
155
197
|
garanteed_parallelism=max_parallel_for_short,
|
|
156
|
-
burstable_parallelism=burstable_parallel_for_short
|
|
198
|
+
burstable_parallelism=burstable_parallel_for_short,
|
|
199
|
+
num_db_connections_per_worker=num_db_connections_per_worker),
|
|
200
|
+
num_db_connections_per_worker=num_db_connections_per_worker,
|
|
157
201
|
)
|
|
158
202
|
|
|
159
203
|
|
|
@@ -162,10 +206,15 @@ def _max_long_worker_parallism(cpu_count: int,
|
|
|
162
206
|
local=False) -> int:
|
|
163
207
|
"""Max parallelism for long workers."""
|
|
164
208
|
# Reserve min available memory to avoid OOM.
|
|
165
|
-
|
|
209
|
+
# pylint: disable=import-outside-toplevel
|
|
210
|
+
import sky.jobs.utils as job_utils
|
|
211
|
+
max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
|
|
212
|
+
if job_utils.is_consolidation_mode() else
|
|
213
|
+
server_constants.MIN_AVAIL_MEM_GB)
|
|
214
|
+
available_mem = max(0, mem_size_gb - max_memory)
|
|
166
215
|
cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
|
|
167
216
|
mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
|
|
168
|
-
|
|
217
|
+
LONG_WORKER_MEM_GB)
|
|
169
218
|
n = max(_MIN_LONG_WORKERS,
|
|
170
219
|
min(cpu_based_max_parallel, mem_based_max_parallel))
|
|
171
220
|
if local:
|
|
@@ -173,12 +222,25 @@ def _max_long_worker_parallism(cpu_count: int,
|
|
|
173
222
|
return n
|
|
174
223
|
|
|
175
224
|
|
|
225
|
+
def _get_min_short_workers() -> int:
    """Return the lower bound on the number of short workers.

    The bound is ``_MIN_IDLE_SHORT_WORKERS`` plus one for every internal
    request daemon whose ``should_skip()`` returns False.
    """
    active_daemon_count = sum(1 for daemon in daemons.INTERNAL_REQUEST_DAEMONS
                              if not daemon.should_skip())
    return _MIN_IDLE_SHORT_WORKERS + active_daemon_count
|
|
232
|
+
|
|
233
|
+
|
|
176
234
|
def _max_short_worker_parallism(mem_size_gb: float,
                                long_worker_parallism: int) -> int:
    """Compute the max parallelism for short workers.

    Args:
        mem_size_gb: Memory size in GB that the worker budget is derived from.
        long_worker_parallism: Number of long workers; each one reserves
            ``LONG_WORKER_MEM_GB`` of memory.

    Returns:
        The number of short workers that fit in the memory left after the
        reservations, but never less than ``_get_min_short_workers()``.
    """
    # pylint: disable=import-outside-toplevel
    import sky.jobs.utils as job_utils

    # Memory that must stay free to avoid OOM; the threshold differs when
    # consolidation mode is enabled.
    if job_utils.is_consolidation_mode():
        min_free_gb = server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
    else:
        min_free_gb = server_constants.MIN_AVAIL_MEM_GB
    # Reserve memory for long workers on top of the min available memory.
    reserved_gb = min_free_gb + (long_worker_parallism * LONG_WORKER_MEM_GB)
    usable_gb = max(0, mem_size_gb - reserved_gb)
    min_workers = _get_min_short_workers()
    mem_based_workers = int(usable_gb / SHORT_WORKER_MEM_GB)
    return max(min_workers, mem_based_workers)
|
sky/server/constants.py
CHANGED
|
@@ -4,17 +4,37 @@ import os
|
|
|
4
4
|
|
|
5
5
|
from sky.skylet import constants
|
|
6
6
|
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
# the
|
|
10
|
-
|
|
7
|
+
# pylint: disable=line-too-long
|
|
8
|
+
# The SkyPilot API version that the code currently uses.
|
|
9
|
+
# Bump this version when the API is changed and special compatibility handling
|
|
10
|
+
# based on version info is needed.
|
|
11
|
+
# For more details and code guidelines, refer to:
|
|
12
|
+
# https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
|
|
13
|
+
API_VERSION = 24
|
|
14
|
+
|
|
15
|
+
# The minimum peer API version that the code should still work with.
|
|
16
|
+
# Notes (dev):
|
|
17
|
+
# - This value is maintained by the CI pipeline, DO NOT EDIT this manually.
|
|
18
|
+
# - Compatibility code for versions lower than this can be safely removed.
|
|
19
|
+
# Refer to API_VERSION for more details.
|
|
20
|
+
MIN_COMPATIBLE_API_VERSION = 11
|
|
21
|
+
|
|
22
|
+
# The semantic version of the minimum compatible API version.
|
|
23
|
+
# Refer to MIN_COMPATIBLE_API_VERSION for more details.
|
|
24
|
+
# Note (dev): DO NOT EDIT this constant manually.
|
|
25
|
+
MIN_COMPATIBLE_VERSION = '0.10.0'
|
|
26
|
+
|
|
27
|
+
# The HTTP header name for the API version of the sender.
|
|
28
|
+
API_VERSION_HEADER = 'X-SkyPilot-API-Version'
|
|
29
|
+
|
|
30
|
+
# The HTTP header name for the SkyPilot version of the sender.
|
|
31
|
+
VERSION_HEADER = 'X-SkyPilot-Version'
|
|
11
32
|
|
|
12
33
|
# Prefix for API request names.
|
|
13
34
|
REQUEST_NAME_PREFIX = 'sky.'
|
|
14
|
-
# The user ID of the SkyPilot system.
|
|
15
|
-
SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
|
|
16
35
|
# The memory (GB) that SkyPilot tries to not use to prevent OOM.
|
|
17
36
|
MIN_AVAIL_MEM_GB = 2
|
|
37
|
+
MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE = 4
|
|
18
38
|
# Default encoder/decoder handler name.
|
|
19
39
|
DEFAULT_HANDLER_NAME = 'default'
|
|
20
40
|
# The path to the API request database.
|
|
@@ -24,9 +44,27 @@ API_SERVER_REQUEST_DB_PATH = '~/.sky/api_server/requests.db'
|
|
|
24
44
|
# background.
|
|
25
45
|
CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
|
|
26
46
|
|
|
47
|
+
# The interval (seconds) for the volume status to be refreshed in the
|
|
48
|
+
# background.
|
|
49
|
+
VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS = 60
|
|
50
|
+
|
|
27
51
|
# Environment variable for a file path to the API cookie file.
|
|
52
|
+
# Keep in sync with websocket_proxy.py
|
|
28
53
|
API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
|
|
54
|
+
# Default file if unset.
|
|
55
|
+
# Keep in sync with websocket_proxy.py
|
|
56
|
+
API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
|
|
29
57
|
|
|
30
58
|
# The path to the dashboard build output
|
|
31
59
|
DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
|
|
32
60
|
'out')
|
|
61
|
+
|
|
62
|
+
# The interval (seconds) for the event to be restarted in the background.
|
|
63
|
+
DAEMON_RESTART_INTERVAL_SECONDS = 20
|
|
64
|
+
|
|
65
|
+
# Cookie header for stream request id.
|
|
66
|
+
STREAM_REQUEST_HEADER = 'X-SkyPilot-Stream-Request-ID'
|
|
67
|
+
|
|
68
|
+
# Valid empty values for pickled fields (base64-encoded pickled None)
|
|
69
|
+
# base64.b64encode(pickle.dumps(None)).decode('utf-8')
|
|
70
|
+
EMPTY_PICKLED_VALUE = 'gAROLg=='
|
sky/server/daemons.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
"""Internal server daemons that run in the background."""
|
|
2
|
+
import atexit
|
|
3
|
+
import dataclasses
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
import typing
|
|
7
|
+
from typing import Callable
|
|
8
|
+
|
|
9
|
+
from sky import sky_logging
|
|
10
|
+
from sky import skypilot_config
|
|
11
|
+
from sky.adaptors import common as adaptors_common
|
|
12
|
+
from sky.server import constants as server_constants
|
|
13
|
+
from sky.server.requests import request_names
|
|
14
|
+
from sky.skylet import constants
|
|
15
|
+
from sky.utils import annotations
|
|
16
|
+
from sky.utils import common_utils
|
|
17
|
+
from sky.utils import env_options
|
|
18
|
+
from sky.utils import locks
|
|
19
|
+
from sky.utils import subprocess_utils
|
|
20
|
+
from sky.utils import timeline
|
|
21
|
+
from sky.utils import ux_utils
|
|
22
|
+
|
|
23
|
+
if typing.TYPE_CHECKING:
|
|
24
|
+
import pathlib
|
|
25
|
+
else:
|
|
26
|
+
pathlib = adaptors_common.LazyImport('pathlib')
|
|
27
|
+
|
|
28
|
+
logger = sky_logging.init_logger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _default_should_skip():
    """Default `should_skip` callback: never skip the daemon."""
    return False
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclasses.dataclass
class InternalRequestDaemon:
    """Internal daemon that runs an event in the background."""

    # Identifier of the daemon; also the key under `daemons` in the SkyPilot
    # config where the per-daemon `log_level` is looked up.
    id: str
    # Request name of the daemon; used in the error log of `run_event`.
    name: request_names.RequestName
    # The event to run repeatedly. It is called in a tight loop by
    # `run_event`, so pacing between runs is up to the event itself.
    event_fn: Callable[[], None]
    # Log level name used when the config does not specify one.
    default_log_level: str = 'INFO'
    # Callback telling whether this daemon should be skipped.
    should_skip: Callable[[], bool] = _default_should_skip

    def refresh_log_level(self) -> int:
        """Reload the config and return the log level for this daemon.

        Returns:
            The `logging` level configured at `daemons.<id>.log_level`,
            falling back to `default_log_level` when unset. `logging.DEBUG`
            is returned if the level name is invalid or any error occurs.
        """
        # pylint: disable=import-outside-toplevel
        import logging

        # Bind the name before the try block so that the AttributeError
        # handler below can always format it, even when the error is raised
        # by the config reload/lookup rather than by the level resolution.
        level_str = self.default_log_level
        try:
            # Refresh config within the while loop.
            # Since this is a long running daemon,
            # reload_for_new_request()
            # is not called in between the event runs.
            # We don't need to grab the lock here because each of the daemons
            # run in their own process and thus have their own request context.
            skypilot_config.reload_config()
            # Get the configured log level for the daemon inside the event loop
            # in case the log level changes after the API server is started.
            level_str = skypilot_config.get_nested(
                ('daemons', self.id, 'log_level'), self.default_log_level)
            return getattr(logging, level_str.upper())
        except AttributeError:
            # Bad level should be rejected by
            # schema validation, just in case.
            logger.warning(f'Invalid log level: {level_str}, using DEBUG')
            return logging.DEBUG
        except Exception as e:  # pylint: disable=broad-except
            logger.exception(f'Error refreshing log level for {self.id}: {e}')
            return logging.DEBUG

    def run_event(self):
        """Run the event forever, restarting it after failures.

        Each iteration runs `event_fn` with the current log level applied.
        If the event raises, the error is logged and the next run is delayed
        by `DAEMON_RESTART_INTERVAL_SECONDS`. This method never returns.
        """

        # Disable logging for periodic refresh to avoid the usage message being
        # sent multiple times.
        os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'

        level = self.refresh_log_level()
        while True:
            try:
                with ux_utils.enable_traceback(), \
                    sky_logging.set_sky_logging_levels(level):
                    sky_logging.reload_logger()
                    # The refreshed level takes effect from the next
                    # iteration, when the context manager is re-entered.
                    level = self.refresh_log_level()
                    self.event_fn()
            except Exception:  # pylint: disable=broad-except
                # It is OK to fail to run the event, as the event is not
                # critical, but we should log the error.
                logger.exception(
                    f'Error running {self.name} event. '
                    f'Restarting in '
                    f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
                    'seconds...')
                time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
            finally:
                # Clear request level cache after each run to avoid
                # using too much memory.
                annotations.clear_request_level_cache()
                timeline.save_timeline()
                # Kill all children processes related to this request.
                # Each executor handles a single request, so we can safely
                # kill all children processes related to this request.
                subprocess_utils.kill_children_processes()
                common_utils.release_memory()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def refresh_cluster_status_event():
    """Refresh the cluster status once, then sleep until the next refresh.

    Calls `backend_utils.refresh_cluster_records()` and then sleeps for
    `CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS`.
    """
    # pylint: disable=import-outside-toplevel
    from sky.backends import backend_utils

    sleep_seconds = server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS

    logger.info('=== Refreshing cluster status ===')
    # This periodic refresh will hold the lock for the cluster being
    # refreshed, but it is OK because other operations will just wait for
    # the lock and get the just refreshed status without refreshing again.
    backend_utils.refresh_cluster_records()
    logger.info(f'Status refreshed. Sleeping {sleep_seconds}'
                ' seconds for the next refresh...\n')
    time.sleep(sleep_seconds)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def refresh_volume_status_event():
    """Refresh the volume status once, then sleep until the next refresh.

    Calls `core.volume_refresh()` and then sleeps for
    `VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS`.
    """
    # pylint: disable=import-outside-toplevel
    from sky.volumes.server import core

    # Disable logging for periodic refresh to avoid the usage message being
    # sent multiple times.
    disable_logging_key = env_options.Options.DISABLE_LOGGING.env_key
    os.environ[disable_logging_key] = '1'

    sleep_seconds = server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS

    logger.info('=== Refreshing volume status ===')
    core.volume_refresh()
    logger.info(f'Volume status refreshed. Sleeping {sleep_seconds}'
                ' seconds for the next refresh...\n')
    time.sleep(sleep_seconds)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
_managed_job_consolidation_mode_lock = None
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# Attempt to gracefully release the lock when the process exits.
|
|
144
|
+
# If this fails, it's okay, the lock will be released when the process dies.
|
|
145
|
+
def _release_managed_job_consolidation_mode_lock() -> None:
    """Release the module-level consolidation mode lock, if one is set.

    Does nothing when no lock object has been created. After a successful
    release the module-level reference is reset to None.
    """
    global _managed_job_consolidation_mode_lock
    held_lock = _managed_job_consolidation_mode_lock
    if held_lock is None:
        return
    held_lock.release()
    _managed_job_consolidation_mode_lock = None
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
atexit.register(_release_managed_job_consolidation_mode_lock)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def managed_job_status_refresh_event():
    """Refresh the managed job status for controller consolidation mode.

    One run does the following, in order:
      1. Touch the restarting signal file.
      2. Acquire the consolidation mode lock if this process does not hold it
         yet (blocking until it is available).
      3. Run the HA recovery logic.
      4. Remove the signal file.
      5. Run `events.ManagedJobEvent` once and sleep for
         `events.EVENT_CHECKING_INTERVAL_SECONDS`.
    """
    # pylint: disable=import-outside-toplevel
    from sky.jobs import constants as managed_job_constants
    from sky.jobs import utils as managed_job_utils

    global _managed_job_consolidation_mode_lock
    if _managed_job_consolidation_mode_lock is None:
        _managed_job_consolidation_mode_lock = locks.get_lock(
            managed_job_constants.CONSOLIDATION_MODE_LOCK_ID)

    # Touch the signal file here to avoid conflict with
    # update_managed_jobs_statuses. Although we run
    # ha_recovery_for_consolidation_mode before checking the job statuses
    # (events.ManagedJobEvent), update_managed_jobs_statuses is also called in
    # cancel_jobs_by_id.
    # We also need to make sure that new controllers are not started until we
    # acquire the consolidation mode lock, since if we have controllers on both
    # the new and old API server during a rolling update, calling
    # update_managed_jobs_statuses on the old API server could lead to
    # FAILED_CONTROLLER.
    signal_file = pathlib.Path(
        constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser()
    try:
        signal_file.touch()

        # Make sure the lock is acquired for this process before proceeding to
        # do recovery. This will block if another API server is still running,
        # but should proceed once it is terminated and releases the lock.
        if not _managed_job_consolidation_mode_lock.is_locked():
            logger.info('Acquiring the consolidation mode lock: '
                        f'{_managed_job_consolidation_mode_lock}')
            _managed_job_consolidation_mode_lock.acquire()
            logger.info('Lock acquired!')
        # We don't explicitly release the lock until the process exits.
        # Even if _release_managed_job_consolidation_mode_lock is not called,
        # the lock should be released when the process dies (either due to the
        # advisory file lock being released or the postgres session dying).

        # We run the recovery logic before checking the job statuses as those
        # two are conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for
        # details.
        managed_job_utils.ha_recovery_for_consolidation_mode()
    finally:
        # Now, we should be sure that this is the only API server, we have
        # started the new controllers and unclaimed all the jobs, and we are
        # ready to update the job statuses.
        try:
            signal_file.unlink()
        except FileNotFoundError:
            # The signal file is already gone (e.g. touch() failed before
            # creating it, or it was removed in the meantime). There is
            # nothing left to clean up, and raising here would mask the
            # original error from the try block.
            pass

    # After recovery, we start the event loop.
    from sky.skylet import events
    refresh_event = events.ManagedJobEvent()
    logger.info('=== Running managed job event ===')
    refresh_event.run()
    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def should_skip_managed_job_status_refresh():
    """Return True when the managed job refresh event should be skipped.

    The event is skipped whenever managed jobs are not in consolidation mode.
    """
    # pylint: disable=import-outside-toplevel
    from sky.jobs import utils as managed_job_utils

    consolidation_enabled = managed_job_utils.is_consolidation_mode()
    return not consolidation_enabled
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _serve_status_refresh_event(pool: bool):
    """Run one serve/pool status refresh for controller consolidation mode.

    Args:
        pool: Passed through to the recovery logic and to
            `events.ServiceUpdateEvent`; also selects the noun ('pool' or
            'serve') used in the log message.
    """
    # pylint: disable=import-outside-toplevel
    from sky.serve import serve_utils

    # Recovery must run before the event loop starts because the two are
    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
    serve_utils.ha_recovery_for_consolidation_mode(pool=pool)

    # Recovery is done; now run the update event.
    from sky.skylet import events
    update_event = events.ServiceUpdateEvent(pool=pool)
    if pool:
        target = 'pool'
    else:
        target = 'serve'
    logger.info(f'=== Running {target} status refresh event ===')
    update_event.run()
    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _should_skip_serve_status_refresh_event(pool: bool):
    """Return True when the serve/pool refresh event should be skipped.

    The event is skipped whenever `serve_utils.is_consolidation_mode(pool=pool)`
    is False.
    """
    # pylint: disable=import-outside-toplevel
    from sky.serve import serve_utils

    consolidation_enabled = serve_utils.is_consolidation_mode(pool=pool)
    return not consolidation_enabled
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def sky_serve_status_refresh_event():
    """Run the serve status refresh event (the `pool=False` variant)."""
    _serve_status_refresh_event(pool=False)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def should_skip_sky_serve_status_refresh():
    """Check if the serve status refresh event (`pool=False`) is skipped."""
    return _should_skip_serve_status_refresh_event(pool=False)
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def pool_status_refresh_event():
    """Run the pool status refresh event (the `pool=True` variant)."""
    _serve_status_refresh_event(pool=True)
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def should_skip_pool_status_refresh():
    """Check if the pool status refresh event (`pool=True`) is skipped."""
    return _should_skip_serve_status_refresh_event(pool=True)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# Register the events to run in the background.
INTERNAL_REQUEST_DAEMONS = [
    # This status refresh daemon can cause autostopped/autodowned clusters to
    # be set to their updated status automatically, without showing users the
    # hint of the cluster being stopped or down when `sky status -r` is called.
    InternalRequestDaemon(
        id='skypilot-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
        event_fn=refresh_cluster_status_event,
        default_log_level='DEBUG'),
    # Volume status refresh daemon to update the volume status periodically.
    InternalRequestDaemon(
        id='skypilot-volume-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_VOLUME_REFRESH,
        event_fn=refresh_volume_status_event),
    # Managed job status refresh daemon; skipped unless managed jobs are in
    # consolidation mode (see should_skip_managed_job_status_refresh).
    InternalRequestDaemon(id='managed-job-status-refresh-daemon',
                          name=request_names.RequestName.
                          REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH,
                          event_fn=managed_job_status_refresh_event,
                          should_skip=should_skip_managed_job_status_refresh),
    # Sky serve status refresh daemon; skipped unless serve is in
    # consolidation mode (see should_skip_sky_serve_status_refresh).
    InternalRequestDaemon(
        id='sky-serve-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH,
        event_fn=sky_serve_status_refresh_event,
        should_skip=should_skip_sky_serve_status_refresh),
    # Pool status refresh daemon; skipped unless pools are in consolidation
    # mode (see should_skip_pool_status_refresh).
    InternalRequestDaemon(
        id='pool-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_POOL_STATUS_REFRESH,
        event_fn=pool_status_refresh_event,
        should_skip=should_skip_pool_status_refresh),
]
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def is_daemon_request_id(request_id: str) -> bool:
    """Tell whether `request_id` is the id of a registered internal daemon.

    Args:
        request_id: The request id to look up.

    Returns:
        True if any entry of `INTERNAL_REQUEST_DAEMONS` has this id.
    """
    return any(daemon.id == request_id for daemon in INTERNAL_REQUEST_DAEMONS)
|