skypilot-nightly 1.0.0.dev20250502-py3-none-any.whl → 1.0.0.dev20251203-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/requests/executor.py
CHANGED
@@ -18,6 +18,8 @@ The number of the workers is determined by the system resources.
 
 See the [README.md](../README.md) for detailed architecture of the executor.
 """
+import asyncio
+import concurrent.futures
 import contextlib
 import multiprocessing
 import os
@@ -29,26 +31,38 @@ import time
 import typing
 from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
 
+import psutil
 import setproctitle
 
+from sky import exceptions
 from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
+from sky.metrics import utils as metrics_utils
 from sky.server import common as server_common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
+from sky.server import metrics as metrics_lib
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
 from sky.server.requests import process
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
+from sky.server.requests import threads
 from sky.server.requests.queues import local_queue
 from sky.server.requests.queues import mp_queue
 from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import context
+from sky.utils import context_utils
 from sky.utils import subprocess_utils
+from sky.utils import tempstore
 from sky.utils import timeline
+from sky.utils import yaml_utils
+from sky.utils.db import db_utils
+from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
     import types
@@ -60,7 +74,6 @@ else:
     from typing_extensions import ParamSpec
 
 P = ParamSpec('P')
-
 logger = sky_logging.init_logger(__name__)
 
 # On macOS, the default start method for multiprocessing is 'fork', which
@@ -70,6 +83,31 @@ logger = sky_logging.init_logger(__name__)
 # platforms, including macOS.
 multiprocessing.set_start_method('spawn', force=True)
 
+# An upper limit of max threads for request execution per server process that
+# unlikely to be reached to allow higher concurrency while still prevent the
+# server process become overloaded.
+_REQUEST_THREADS_LIMIT = 128
+
+_REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
+# A dedicated thread pool executor for synced requests execution in coroutine to
+# avoid:
+# 1. blocking the event loop;
+# 2. exhausting the default thread pool executor of event loop;
+_REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
+
+
+def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
+    """Lazy init and return the request thread executor for current process."""
+    global _REQUEST_THREAD_EXECUTOR
+    if _REQUEST_THREAD_EXECUTOR is not None:
+        return _REQUEST_THREAD_EXECUTOR
+    with _REQUEST_THREAD_EXECUTOR_LOCK:
+        if _REQUEST_THREAD_EXECUTOR is None:
+            _REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
+                name='request_thread_executor',
+                max_workers=_REQUEST_THREADS_LIMIT)
+    return _REQUEST_THREAD_EXECUTOR
+
 
 class RequestQueue:
     """The queue for the requests, either redis or multiprocessing.
@@ -89,21 +127,21 @@ class RequestQueue:
         else:
             raise RuntimeError(f'Invalid queue backend: {backend}')
 
-    def put(self, request: Tuple[str, bool]) -> None:
+    def put(self, request: Tuple[str, bool, bool]) -> None:
         """Put and request to the queue.
 
         Args:
-            request: A tuple of request_id and
+            request: A tuple of request_id, ignore_return_value, and retryable.
         """
         self.queue.put(request)  # type: ignore
 
-    def get(self) -> Optional[Tuple[str, bool]]:
+    def get(self) -> Optional[Tuple[str, bool, bool]]:
         """Get a request from the queue.
 
         It is non-blocking if the queue is empty, and returns None.
 
         Returns:
-            A tuple of request_id and
+            A tuple of request_id, ignore_return_value, and retryable.
         """
         try:
             return self.queue.get(block=False)
@@ -121,6 +159,10 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
 def executor_initializer(proc_group: str):
     setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
                               f'{multiprocessing.current_process().pid}')
+    # Executor never stops, unless the whole process is killed.
+    threading.Thread(target=metrics_lib.process_monitor,
+                     args=(f'worker:{proc_group}', threading.Event()),
+                     daemon=True).start()
 
 
 class RequestWorker:
@@ -144,10 +186,27 @@ class RequestWorker:
         self.schedule_type = schedule_type
         self.garanteed_parallelism = config.garanteed_parallelism
         self.burstable_parallelism = config.burstable_parallelism
+        self.num_db_connections_per_worker = (
+            config.num_db_connections_per_worker)
+        self._thread: Optional[threading.Thread] = None
+        self._cancel_event = threading.Event()
 
     def __str__(self) -> str:
         return f'Worker(schedule_type={self.schedule_type.value})'
 
+    def run_in_background(self) -> None:
+        # Thread dispatcher is sufficient for current scale, refer to
+        # tests/load_tests/test_queue_dispatcher.py for more details.
+        # Use daemon thread for automatic cleanup.
+        thread = threading.Thread(target=self.run, daemon=True)
+        thread.start()
+        self._thread = thread
+
+    def cancel(self) -> None:
+        if self._thread is not None:
+            self._cancel_event.set()
+            self._thread.join()
+
     def process_request(self, executor: process.BurstableExecutor,
                         queue: RequestQueue) -> None:
         try:
@@ -155,11 +214,12 @@
             if request_element is None:
                 time.sleep(0.1)
                 return
-            request_id, ignore_return_value = request_element
-            request = api_requests.get_request(request_id)
+            request_id, ignore_return_value, _ = request_element
+            request = api_requests.get_request(request_id, fields=['status'])
             assert request is not None, f'Request with ID {request_id} is None'
             if request.status == api_requests.RequestStatus.CANCELLED:
                 return
+            del request
             logger.info(f'[{self}] Submitting request: {request_id}')
             # Start additional process to run the request, so that it can be
             # cancelled when requested by a user.
@@ -167,8 +227,19 @@
             # multiple requests can share the same process pid, which may cause
             # issues with SkyPilot core functions if they rely on the exit of
             # the process, such as subprocess_daemon.py.
-            executor.submit_until_success(
-
+            fut = executor.submit_until_success(
+                _request_execution_wrapper, request_id, ignore_return_value,
+                self.num_db_connections_per_worker)
+            # Decrement the free executor count when a request starts
+            if metrics_utils.METRICS_ENABLED:
+                if self.schedule_type == api_requests.ScheduleType.LONG:
+                    metrics_utils.SKY_APISERVER_LONG_EXECUTORS.dec()
+                elif self.schedule_type == api_requests.ScheduleType.SHORT:
+                    metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.dec()
+            # Monitor the result of the request execution.
+            threading.Thread(target=self.handle_task_result,
+                             args=(fut, request_element),
+                             daemon=True).start()
 
             logger.info(f'[{self}] Submitted request: {request_id}')
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
@@ -178,6 +249,45 @@
                 f'{request_id if "request_id" in locals() else ""} '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
 
+    def handle_task_result(self, fut: concurrent.futures.Future,
+                           request_element: Tuple[str, bool, bool]) -> None:
+        try:
+            fut.result()
+        except concurrent.futures.process.BrokenProcessPool as e:
+            # Happens when the worker process dies unexpectedly, e.g. OOM
+            # killed.
+            request_id, _, retryable = request_element
+            # Ensure the request status.
+            api_requests.set_request_failed(request_id, e)
+            logger.error(
+                f'Request {request_id} failed to get processed '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+            if retryable:
+                # If the request is retryable and disrupted by broken
+                # process pool, reschedule it immediately to get it
+                # retried in the new process pool.
+                queue = _get_queue(self.schedule_type)
+                queue.put(request_element)
+        except exceptions.ExecutionRetryableError as e:
+            time.sleep(e.retry_wait_seconds)
+            # Reset the request status to PENDING so it can be picked up again.
+            # Assume retryable since the error is ExecutionRetryableError.
+            request_id, _, _ = request_element
+            with api_requests.update_request(request_id) as request_task:
+                assert request_task is not None, request_id
+                request_task.status = api_requests.RequestStatus.PENDING
+            # Reschedule the request.
+            queue = _get_queue(self.schedule_type)
+            queue.put(request_element)
+            logger.info(f'Rescheduled request {request_id} for retry')
+        finally:
+            # Increment the free executor count when a request finishes
+            if metrics_utils.METRICS_ENABLED:
+                if self.schedule_type == api_requests.ScheduleType.LONG:
+                    metrics_utils.SKY_APISERVER_LONG_EXECUTORS.inc()
+                elif self.schedule_type == api_requests.ScheduleType.SHORT:
+                    metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.inc()
+
     def run(self) -> None:
         # Handle the SIGTERM signal to abort the executor process gracefully.
         proc_group = f'{self.schedule_type.value}'
@@ -198,7 +308,17 @@
                 burst_workers=self.burstable_parallelism,
                 initializer=executor_initializer,
                 initargs=(proc_group,))
-
+            # Initialize the appropriate gauge for the number of free executors
+            total_executors = (self.garanteed_parallelism +
+                               self.burstable_parallelism)
+            if metrics_utils.METRICS_ENABLED:
+                if self.schedule_type == api_requests.ScheduleType.LONG:
+                    metrics_utils.SKY_APISERVER_LONG_EXECUTORS.set(
+                        total_executors)
+                elif self.schedule_type == api_requests.ScheduleType.SHORT:
+                    metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.set(
+                        total_executors)
+            while not self._cancel_event.is_set():
                 self.process_request(executor, queue)
         # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
         except KeyboardInterrupt:
@@ -221,22 +341,56 @@ def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
|
|
|
221
341
|
|
|
222
342
|
@contextlib.contextmanager
|
|
223
343
|
def override_request_env_and_config(
|
|
224
|
-
request_body: payloads.RequestBody
|
|
344
|
+
request_body: payloads.RequestBody, request_id: str,
|
|
345
|
+
request_name: str) -> Generator[None, None, None]:
|
|
225
346
|
"""Override the environment and SkyPilot config for a request."""
|
|
226
347
|
original_env = os.environ.copy()
|
|
227
|
-
os.environ.update(request_body.env_vars)
|
|
228
|
-
user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
|
|
229
|
-
name=request_body.env_vars[constants.USER_ENV_VAR])
|
|
230
|
-
global_user_state.add_or_update_user(user)
|
|
231
|
-
# Force color to be enabled.
|
|
232
|
-
os.environ['CLICOLOR_FORCE'] = '1'
|
|
233
|
-
server_common.reload_for_new_request(
|
|
234
|
-
client_entrypoint=request_body.entrypoint,
|
|
235
|
-
client_command=request_body.entrypoint_command,
|
|
236
|
-
using_remote_api_server=request_body.using_remote_api_server)
|
|
237
348
|
try:
|
|
349
|
+
# Unset SKYPILOT_DEBUG by default, to avoid the value set on the API
|
|
350
|
+
# server affecting client requests. If set on the client side, it will
|
|
351
|
+
# be overridden by the request body.
|
|
352
|
+
os.environ.pop('SKYPILOT_DEBUG', None)
|
|
353
|
+
# Remove the db connection uri from client supplied env vars, as the
|
|
354
|
+
# client should not set the db string on server side.
|
|
355
|
+
request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
|
|
356
|
+
os.environ.update(request_body.env_vars)
|
|
357
|
+
# Note: may be overridden by AuthProxyMiddleware.
|
|
358
|
+
# TODO(zhwu): we need to make the entire request a context available to
|
|
359
|
+
# the entire request execution, so that we can access info like user
|
|
360
|
+
# through the execution.
|
|
361
|
+
user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
|
|
362
|
+
name=request_body.env_vars[constants.USER_ENV_VAR])
|
|
363
|
+
_, user = global_user_state.add_or_update_user(user, return_user=True)
|
|
364
|
+
|
|
365
|
+
# Force color to be enabled.
|
|
366
|
+
os.environ['CLICOLOR_FORCE'] = '1'
|
|
367
|
+
server_common.reload_for_new_request(
|
|
368
|
+
client_entrypoint=request_body.entrypoint,
|
|
369
|
+
client_command=request_body.entrypoint_command,
|
|
370
|
+
using_remote_api_server=request_body.using_remote_api_server,
|
|
371
|
+
user=user,
|
|
372
|
+
request_id=request_id)
|
|
373
|
+
logger.debug(
|
|
374
|
+
f'override path: {request_body.override_skypilot_config_path}')
|
|
238
375
|
with skypilot_config.override_skypilot_config(
|
|
239
|
-
request_body.override_skypilot_config
|
|
376
|
+
request_body.override_skypilot_config,
|
|
377
|
+
request_body.override_skypilot_config_path):
|
|
378
|
+
# Skip permission check for sky.workspaces.get request
|
|
379
|
+
# as it is used to determine which workspaces the user
|
|
380
|
+
# has access to.
|
|
381
|
+
if request_name != 'sky.workspaces.get':
|
|
382
|
+
try:
|
|
383
|
+
# Reject requests that the user does not have permission
|
|
384
|
+
# to access.
|
|
385
|
+
workspaces_core.reject_request_for_unauthorized_workspace(
|
|
386
|
+
user)
|
|
387
|
+
except exceptions.PermissionDeniedError as e:
|
|
388
|
+
logger.debug(
|
|
389
|
+
f'{request_id} permission denied to workspace: '
|
|
390
|
+
f'{skypilot_config.get_active_workspace()}: {e}')
|
|
391
|
+
raise e
|
|
392
|
+
logger.debug(
|
|
393
|
+
f'{request_id} permission granted to {request_name} request')
|
|
240
394
|
yield
|
|
241
395
|
finally:
|
|
242
396
|
# We need to call the save_timeline() since atexit will not be
|
|
@@ -250,35 +404,13 @@ def override_request_env_and_config(
|
|
|
250
404
|
os.environ.update(original_env)
|
|
251
405
|
|
|
252
406
|
|
|
253
|
-
def _redirect_output(file: TextIO) -> Tuple[int, int]:
|
|
254
|
-
"""Redirect stdout and stderr to the log file."""
|
|
255
|
-
fd = file.fileno() # Get the file descriptor from the file object
|
|
256
|
-
# Store copies of the original stdout and stderr file descriptors
|
|
257
|
-
original_stdout = os.dup(sys.stdout.fileno())
|
|
258
|
-
original_stderr = os.dup(sys.stderr.fileno())
|
|
259
|
-
|
|
260
|
-
# Copy this fd to stdout and stderr
|
|
261
|
-
os.dup2(fd, sys.stdout.fileno())
|
|
262
|
-
os.dup2(fd, sys.stderr.fileno())
|
|
263
|
-
return original_stdout, original_stderr
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
def _restore_output(original_stdout: int, original_stderr: int) -> None:
|
|
267
|
-
"""Restore stdout and stderr to their original file descriptors."""
|
|
268
|
-
os.dup2(original_stdout, sys.stdout.fileno())
|
|
269
|
-
os.dup2(original_stderr, sys.stderr.fileno())
|
|
270
|
-
|
|
271
|
-
# Close the duplicate file descriptors
|
|
272
|
-
os.close(original_stdout)
|
|
273
|
-
os.close(original_stderr)
|
|
274
|
-
|
|
275
|
-
|
|
276
407
|
def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
|
|
277
408
|
raise KeyboardInterrupt
|
|
278
409
|
|
|
279
410
|
|
|
280
411
|
def _request_execution_wrapper(request_id: str,
|
|
281
|
-
ignore_return_value: bool
|
|
412
|
+
ignore_return_value: bool,
|
|
413
|
+
num_db_connections_per_worker: int = 0) -> None:
|
|
282
414
|
"""Wrapper for a request execution.
|
|
283
415
|
|
|
284
416
|
It wraps the execution of a request to:
|
|
@@ -287,71 +419,353 @@ def _request_execution_wrapper(request_id: str,
|
|
|
287
419
|
2. Update the request status based on the execution result;
|
|
288
420
|
3. Redirect the stdout and stderr of the execution to log file;
|
|
289
421
|
4. Handle the SIGTERM signal to abort the request gracefully.
|
|
422
|
+
5. Maintain the lifecycle of the temp dir used by the request.
|
|
290
423
|
"""
|
|
424
|
+
pid = multiprocessing.current_process().pid
|
|
425
|
+
proc = psutil.Process(pid)
|
|
426
|
+
rss_begin = proc.memory_info().rss
|
|
427
|
+
db_utils.set_max_connections(num_db_connections_per_worker)
|
|
291
428
|
# Handle the SIGTERM signal to abort the request processing gracefully.
|
|
292
|
-
signal.signal(
|
|
429
|
+
# Only set up signal handlers in the main thread, as signal.signal() raises
|
|
430
|
+
# ValueError if called from a non-main thread (e.g., in tests).
|
|
431
|
+
if threading.current_thread() is threading.main_thread():
|
|
432
|
+
signal.signal(signal.SIGTERM, _sigterm_handler)
|
|
293
433
|
|
|
294
|
-
pid = multiprocessing.current_process().pid
|
|
295
434
|
logger.info(f'Running request {request_id} with pid {pid}')
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
435
|
+
|
|
436
|
+
original_stdout = original_stderr = None
|
|
437
|
+
|
|
438
|
+
def _save_current_output() -> None:
|
|
439
|
+
"""Save the current stdout and stderr file descriptors."""
|
|
440
|
+
nonlocal original_stdout, original_stderr
|
|
441
|
+
original_stdout = os.dup(sys.stdout.fileno())
|
|
442
|
+
original_stderr = os.dup(sys.stderr.fileno())
|
|
443
|
+
|
|
444
|
+
def _redirect_output(file: TextIO) -> None:
|
|
445
|
+
"""Redirect stdout and stderr to the log file."""
|
|
446
|
+
# Get the file descriptor from the file object
|
|
447
|
+
fd = file.fileno()
|
|
448
|
+
# Copy this fd to stdout and stderr
|
|
449
|
+
os.dup2(fd, sys.stdout.fileno())
|
|
450
|
+
os.dup2(fd, sys.stderr.fileno())
|
|
451
|
+
|
|
452
|
+
def _restore_output() -> None:
|
|
453
|
+
"""Restore stdout and stderr to their original file descriptors."""
|
|
454
|
+
nonlocal original_stdout, original_stderr
|
|
455
|
+
if original_stdout is not None:
|
|
456
|
+
os.dup2(original_stdout, sys.stdout.fileno())
|
|
457
|
+
os.close(original_stdout)
|
|
458
|
+
original_stdout = None
|
|
459
|
+
|
|
460
|
+
if original_stderr is not None:
|
|
461
|
+
os.dup2(original_stderr, sys.stderr.fileno())
|
|
462
|
+
os.close(original_stderr)
|
|
463
|
+
original_stderr = None
|
|
464
|
+
|
|
465
|
+
request_name = None
|
|
466
|
+
try:
|
|
467
|
+
# As soon as the request is updated with the executor PID, we can
|
|
468
|
+
# receive SIGTERM from cancellation. So, we update the request inside
|
|
469
|
+
# the try block to ensure we have the KeyboardInterrupt handling.
|
|
470
|
+
with api_requests.update_request(request_id) as request_task:
|
|
471
|
+
assert request_task is not None, request_id
|
|
472
|
+
if request_task.status != api_requests.RequestStatus.PENDING:
|
|
473
|
+
logger.debug(f'Request is already {request_task.status.value}, '
|
|
474
|
+
f'skipping execution')
|
|
475
|
+
return
|
|
476
|
+
log_path = request_task.log_path
|
|
477
|
+
request_task.pid = pid
|
|
478
|
+
request_task.status = api_requests.RequestStatus.RUNNING
|
|
479
|
+
func = request_task.entrypoint
|
|
480
|
+
request_body = request_task.request_body
|
|
481
|
+
request_name = request_task.name
|
|
482
|
+
|
|
305
483
|
# Store copies of the original stdout and stderr file descriptors
|
|
306
|
-
|
|
307
|
-
#
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
484
|
+
# We do this in two steps because we should make sure to restore the
|
|
485
|
+
# original values even if we are cancelled or fail during the redirect.
|
|
486
|
+
_save_current_output()
|
|
487
|
+
|
|
488
|
+
# Append to the log file instead of overwriting it since there might be
|
|
489
|
+
# logs from previous retries.
|
|
490
|
+
with log_path.open('a', encoding='utf-8') as f:
|
|
491
|
+
# Redirect the stdout/stderr before overriding the environment and
|
|
492
|
+
# config, as there can be some logs during override that needs to be
|
|
493
|
+
# captured in the log file.
|
|
494
|
+
_redirect_output(f)
|
|
495
|
+
|
|
496
|
+
with sky_logging.add_debug_log_handler(request_id), \
|
|
497
|
+
override_request_env_and_config(
|
|
498
|
+
request_body, request_id, request_name), \
|
|
499
|
+
tempstore.tempdir():
|
|
312
500
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
313
501
|
config = skypilot_config.to_dict()
|
|
314
502
|
logger.debug(f'request config: \n'
|
|
315
|
-
f'{
|
|
316
|
-
|
|
503
|
+
f'{yaml_utils.dump_yaml_str(dict(config))}')
|
|
504
|
+
(metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
|
|
505
|
+
labels(request=request_name, pid=pid).inc())
|
|
506
|
+
with metrics_utils.time_it(name=request_name,
|
|
507
|
+
group='request_execution'):
|
|
508
|
+
return_value = func(**request_body.to_kwargs())
|
|
317
509
|
f.flush()
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
510
|
+
except KeyboardInterrupt:
|
|
511
|
+
logger.info(f'Request {request_id} cancelled by user')
|
|
512
|
+
# Kill all children processes related to this request.
|
|
513
|
+
# Each executor handles a single request, so we can safely kill all
|
|
514
|
+
# children processes related to this request.
|
|
515
|
+
# This is required as python does not pass the KeyboardInterrupt to the
|
|
516
|
+
# threads that are not main thread.
|
|
517
|
+
subprocess_utils.kill_children_processes()
|
|
518
|
+
return
|
|
519
|
+
except exceptions.ExecutionRetryableError as e:
|
|
520
|
+
logger.error(e)
|
|
521
|
+
logger.info(e.hint)
|
|
522
|
+
with api_requests.update_request(request_id) as request_task:
|
|
523
|
+
assert request_task is not None, request_id
|
|
524
|
+
# Retried request will undergo rescheduling and a new execution,
|
|
525
|
+
# clear the pid of the request.
|
|
526
|
+
request_task.pid = None
|
|
527
|
+
# Yield control to the scheduler for uniform handling of retries.
|
|
528
|
+
_restore_output()
|
|
529
|
+
raise
|
|
530
|
+
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
|
531
|
+
api_requests.set_request_failed(request_id, e)
|
|
532
|
+
# Manually reset the original stdout and stderr file descriptors early
|
|
533
|
+
# so that the "Request xxxx failed due to ..." log message will be
|
|
534
|
+
# written to the original stdout and stderr file descriptors.
|
|
535
|
+
_restore_output()
|
|
536
|
+
logger.info(f'Request {request_id} failed due to '
|
|
537
|
+
f'{common_utils.format_exception(e)}')
|
|
538
|
+
return
|
|
539
|
+
else:
|
|
540
|
+
api_requests.set_request_succeeded(
|
|
541
|
+
request_id, return_value if not ignore_return_value else None)
|
|
542
|
+
# Manually reset the original stdout and stderr file descriptors early
|
|
543
|
+
# so that the "Request xxxx failed due to ..." log message will be
|
|
544
|
+
# written to the original stdout and stderr file descriptors.
|
|
545
|
+
_restore_output()
|
|
546
|
+
logger.info(f'Request {request_id} finished')
|
|
547
|
+
finally:
|
|
548
|
+
_restore_output()
|
|
549
|
+
try:
|
|
550
|
+
# Capture the peak RSS before GC.
|
|
551
|
+
peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
|
|
552
|
+
# Clear request level cache to release all memory used by the
|
|
553
|
+
# request.
|
|
554
|
+
annotations.clear_request_level_cache()
|
|
555
|
+
with metrics_utils.time_it(name='release_memory', group='internal'):
|
|
556
|
+
common_utils.release_memory()
|
|
557
|
+
if request_name is not None:
|
|
558
|
+
_record_memory_metrics(request_name, proc, rss_begin, peak_rss)
|
|
559
|
+
except Exception as e: # pylint: disable=broad-except
|
|
560
|
+
logger.error(f'Failed to record memory metrics: '
|
|
561
|
+
f'{common_utils.format_exception(e)}')
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
_first_request = True
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def _record_memory_metrics(request_name: str, proc: psutil.Process,
|
|
568
|
+
rss_begin: int, peak_rss: int) -> None:
|
|
569
|
+
"""Record the memory metrics for a request."""
|
|
570
|
+
# Do not record full memory delta for the first request as it
|
|
571
|
+
# will loads the sky core modules and make the memory usage
|
|
572
|
+
# estimation inaccurate.
|
|
573
|
+
global _first_request
|
|
574
|
+
if _first_request:
|
|
575
|
+
_first_request = False
|
|
576
|
+
return
|
|
577
|
+
rss_end = proc.memory_info().rss
|
|
578
|
+
|
|
579
|
+
# Answer "how much RSS this request contributed?"
|
|
580
|
+
metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
|
|
581
|
+
name=request_name).observe(max(rss_end - rss_begin, 0))
|
|
582
|
+
# Estimate the memory usage by the request by capturing the
|
|
583
|
+
# peak memory delta during the request execution.
|
|
584
|
+
metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
|
|
585
|
+
name=request_name).observe(max(peak_rss - rss_begin, 0))
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
class CoroutineTask:
|
|
589
|
+
"""Wrapper of a background task runs in coroutine"""
|
|
590
|
+
|
|
591
|
+
def __init__(self, task: asyncio.Task):
|
|
592
|
+
self.task = task
|
|
593
|
+
|
|
594
|
+
async def cancel(self):
|
|
595
|
+
try:
|
|
596
|
+
self.task.cancel()
|
|
597
|
+
await self.task
|
|
598
|
+
except asyncio.CancelledError:
|
|
599
|
+
pass
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
def check_request_thread_executor_available() -> None:
|
|
603
|
+
"""Check if the request thread executor is available.
|
|
604
|
+
|
|
605
|
+
This is a best effort check to hint the client to retry other server
|
|
606
|
+
processes when there is no avaiable thread worker in current one. But
|
|
607
|
+
a request may pass this check and still cannot get worker on execution
|
|
608
|
+
time due to race condition. In this case, the client will see a failed
|
|
609
|
+
request instead of retry.
|
|
610
|
+
|
|
611
|
+
TODO(aylei): this can be refined with a refactor of our coroutine
|
|
612
|
+
execution flow.
|
|
613
|
+
"""
|
|
614
|
+
get_request_thread_executor().check_available()
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def execute_request_in_coroutine(
|
|
618
|
+
request: api_requests.Request) -> CoroutineTask:
|
|
619
|
+
"""Execute a request in current event loop.
|
|
620
|
+
|
|
621
|
+
Args:
|
|
622
|
+
request: The request to execute.
|
|
623
|
+
|
|
624
|
+
Returns:
|
|
625
|
+
A CoroutineTask handle to operate the background task.
|
|
626
|
+
"""
|
|
627
|
+
task = asyncio.create_task(_execute_request_coroutine(request))
|
|
628
|
+
return CoroutineTask(task)
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def _execute_with_config_override(func: Callable,
|
|
632
|
+
request_body: payloads.RequestBody,
|
|
633
|
+
request_id: str, request_name: str,
|
|
634
|
+
**kwargs) -> Any:
|
|
635
|
+
"""Execute a function with env and config override inside a thread."""
|
|
636
|
+
# Override the environment and config within this thread's context,
|
|
637
|
+
# which gets copied when we call to_thread.
|
|
638
|
+
with override_request_env_and_config(request_body, request_id,
|
|
639
|
+
request_name):
|
|
640
|
+
return func(**kwargs)
|
|
641
|
+
|
|
642
|
+
+async def _execute_request_coroutine(request: api_requests.Request):
+    """Execute a request in the current event loop.
+
+    Similar to _request_execution_wrapper, but executed as a coroutine in the
+    current event loop. This is designed for executing tasks that are not CPU
+    intensive, e.g. sky logs.
+    """
+    context.initialize()
+    ctx = context.get()
+    assert ctx is not None, 'Context is not initialized'
+    logger.info(f'Executing request {request.request_id} in coroutine')
+    func = request.entrypoint
+    request_body = request.request_body
+    await api_requests.update_status_async(request.request_id,
+                                           api_requests.RequestStatus.RUNNING)
+    # Redirect stdout and stderr to the request log path.
+    original_output = ctx.redirect_log(request.log_path)
+    try:
+        fut: asyncio.Future = context_utils.to_thread_with_executor(
+            get_request_thread_executor(), _execute_with_config_override, func,
+            request_body, request.request_id, request.name,
+            **request_body.to_kwargs())
+    except Exception as e:  # pylint: disable=broad-except
+        ctx.redirect_log(original_output)
+        await api_requests.set_request_failed_async(request.request_id, e)
+        logger.error(f'Failed to run request {request.request_id} due to '
+                     f'{common_utils.format_exception(e)}')
+        return
+
+    async def poll_task(request_id: str) -> bool:
+        req_status = await api_requests.get_request_status_async(request_id)
+        if req_status is None:
+            raise RuntimeError('Request not found')
+
+        if req_status.status == api_requests.RequestStatus.CANCELLED:
+            ctx.cancel()
+            return True
+
+        if fut.done():
+            try:
+                result = await fut
+                await api_requests.set_request_succeeded_async(
+                    request_id, result)
+            except asyncio.CancelledError:
+                # The task is cancelled by ctx.cancel(), where the status
+                # should already be set to CANCELLED.
+                pass
+            except Exception as e:  # pylint: disable=broad-except
+                ctx.redirect_log(original_output)
+                await api_requests.set_request_failed_async(request_id, e)
+                logger.error(f'Request {request_id} failed due to '
+                             f'{common_utils.format_exception(e)}')
+            return True
+        return False
+
+    try:
+        while True:
+            res = await poll_task(request.request_id)
+            if res:
+                break
+            await asyncio.sleep(0.5)
+    except asyncio.CancelledError:
+        # The current coroutine is cancelled due to a client disconnect; set
+        # the request status for consistency.
+        await api_requests.set_request_cancelled_async(request.request_id)
+    # pylint: disable=broad-except
+    except (Exception, KeyboardInterrupt, SystemExit) as e:
+        # Handle any other error.
+        ctx.redirect_log(original_output)
+        await api_requests.set_request_failed_async(request.request_id, e)
+        logger.error(f'Request {request.request_id} interrupted due to '
+                     f'unhandled exception: {common_utils.format_exception(e)}')
+        raise
+    finally:
+        # Always cancel the context to kill potentially running background
+        # routine.
+        ctx.cancel()
+
+
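The coroutine above keeps the event loop responsive by pushing the blocking entrypoint into a thread and then polling two signals: an external cancellation recorded in the request store, and completion of the thread future. A stripped-down, runnable version of that control flow with the database and context plumbing removed (all names are demo stand-ins; the 0.5s poll interval mirrors the code above):

```python
import asyncio
import time


async def run_and_poll(blocking_fn, cancel_requested: asyncio.Event):
    # Run the blocking entrypoint in a worker thread, off the event loop.
    fut = asyncio.ensure_future(asyncio.to_thread(blocking_fn))
    while True:
        if cancel_requested.is_set():     # external cancellation signal
            fut.cancel()
            return 'CANCELLED'
        if fut.done():                    # finished or failed in the thread
            try:
                return ('SUCCEEDED', fut.result())
            except Exception as e:        # pylint: disable=broad-except
                return ('FAILED', e)
        await asyncio.sleep(0.5)          # poll, as the executor loop does


async def main():
    cancel = asyncio.Event()
    print(await run_and_poll(lambda: time.sleep(1) or 42, cancel))


asyncio.run(main())
```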
+async def prepare_request_async(
+    request_id: str,
+    request_name: request_names.RequestName,
+    request_body: payloads.RequestBody,
+    func: Callable[P, Any],
+    request_cluster_name: Optional[str] = None,
+    schedule_type: api_requests.ScheduleType = (api_requests.ScheduleType.LONG),
+    is_skypilot_system: bool = False,
+) -> api_requests.Request:
+    """Prepare a request for execution."""
+    user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
+    if is_skypilot_system:
+        user_id = constants.SKYPILOT_SYSTEM_USER_ID
+        global_user_state.add_or_update_user(
+            models.User(id=user_id, name=user_id))
+    request = api_requests.Request(request_id=request_id,
+                                   name=server_constants.REQUEST_NAME_PREFIX +
+                                   request_name,
+                                   entrypoint=func,
+                                   request_body=request_body,
+                                   status=api_requests.RequestStatus.PENDING,
+                                   created_at=time.time(),
+                                   schedule_type=schedule_type,
+                                   user_id=user_id,
+                                   cluster_name=request_cluster_name)
+
+    if not await api_requests.create_if_not_exists_async(request):
+        raise exceptions.RequestAlreadyExistsError(
+            f'Request {request_id} already exists.')
+
+    request.log_path.touch()
+    return request
+
+
+async def schedule_request_async(request_id: str,
+                                 request_name: request_names.RequestName,
+                                 request_body: payloads.RequestBody,
+                                 func: Callable[P, Any],
+                                 request_cluster_name: Optional[str] = None,
+                                 ignore_return_value: bool = False,
+                                 schedule_type: api_requests.ScheduleType = (
+                                     api_requests.ScheduleType.LONG),
+                                 is_skypilot_system: bool = False,
+                                 precondition: Optional[
+                                     preconditions.Precondition] = None,
+                                 retryable: bool = False) -> None:
     """Enqueue a request to the request queue.

     Args:

@@ -372,32 +786,37 @@ def schedule_request(
         The precondition is waited asynchronously and does not block the
         caller.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if not api_requests.create_if_not_exists(request):
-        logger.debug(f'Request {request_id} already exists.')
-        return
+    request_task = await prepare_request_async(request_id, request_name,
+                                               request_body, func,
+                                               request_cluster_name,
+                                               schedule_type,
+                                               is_skypilot_system)
+    schedule_prepared_request(request_task, ignore_return_value, precondition,
+                              retryable)
+
+
+def schedule_prepared_request(request_task: api_requests.Request,
+                              ignore_return_value: bool = False,
+                              precondition: Optional[
+                                  preconditions.Precondition] = None,
+                              retryable: bool = False) -> None:
+    """Enqueue a prepared request to the request queue.

-
+    Args:
+        request_task: The prepared request task to schedule.
+        ignore_return_value: If True, the return value of the function will be
+            ignored.
+        precondition: If a precondition is provided, the request will only be
+            scheduled for execution when the precondition is met (returns True).
+            The precondition is waited asynchronously and does not block the
+            caller.
+        retryable: Whether the request should be retried if it fails.
+    """

     def enqueue():
-        input_tuple = (request_id, ignore_return_value)
-        logger.info(f'Queuing request: {request_id}')
-        _get_queue(schedule_type).put(input_tuple)
+        input_tuple = (request_task.request_id, ignore_return_value, retryable)
+        logger.info(f'Queuing request: {request_task.request_id}')
+        _get_queue(request_task.schedule_type).put(input_tuple)

     if precondition is not None:
         # Wait async to avoid blocking caller.
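A rough sketch of how a caller might use the new async scheduling path; the import paths, request name, and entrypoint below are assumptions for illustration, while the executor function signatures come from the diff above:

```python
import uuid

# Import paths are assumptions for this sketch and may differ from the
# actual package layout.
from sky.server.requests import executor
from sky.server.requests import payloads
from sky.server.requests import requests as api_requests


async def submit(request_name, request_body: payloads.RequestBody,
                 entrypoint) -> str:
    """Prepare and enqueue a request, returning its id for later polling."""
    request_id = str(uuid.uuid4())
    # One-shot path: prepare_request_async + schedule_prepared_request.
    await executor.schedule_request_async(
        request_id,
        request_name,
        request_body,
        entrypoint,
        schedule_type=api_requests.ScheduleType.SHORT,
        retryable=True)
    return request_id
```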
@@ -406,15 +825,21 @@ def schedule_request(
         enqueue()


-def start(
+def start(
+    config: server_config.ServerConfig
+) -> Tuple[Optional[multiprocessing.Process], List[RequestWorker]]:
     """Start the request workers.

     Request workers run in background, schedule the requests and delegate the
     request execution to executor processes.
+
+    Returns:
+        A tuple of the queue server process and the list of request worker
+        threads.
     """
     global queue_backend
     queue_backend = config.queue_backend
-
+    queue_server = None
     # Setup the queues.
     if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
         logger.info('Creating shared request queues')
@@ -431,7 +856,6 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
         queue_server = multiprocessing.Process(
             target=mp_queue.start_queue_manager, args=(queue_names, port))
         queue_server.start()
-        sub_procs.append(queue_server)
         mp_queue.wait_for_queues_to_be_ready(queue_names,
                                              queue_server,
                                              port=port)
@@ -444,20 +868,16 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:

     logger.info('Request queues created')

-
-    # Thread dispatcher is sufficient for current scale, refer to
-    # tests/load_tests/test_queue_dispatcher.py for more details.
-    # Use daemon thread for automatic cleanup.
-    thread = threading.Thread(target=worker.run, daemon=True)
-    thread.start()
-
+    workers = []
     # Start a worker for long requests.
     long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
                                 config=config.long_worker_config)
-
+    long_worker.run_in_background()
+    workers.append(long_worker)

     # Start a worker for short requests.
     short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
                                  config=config.short_worker_config)
-
-
+    short_worker.run_in_background()
+    workers.append(short_worker)
+    return queue_server, workers
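With `start()` now returning the queue server process and the worker threads instead of appending to an internal list, the caller owns their shutdown. A hypothetical caller sketch; the `RequestWorker.cancel()` call and the surrounding server loop are assumptions, only the returned tuple shape comes from the signature above:

```python
# Hypothetical startup/shutdown wiring around the new start() signature.
# `executor` refers to the module in this diff; RequestWorker.cancel() and
# serve_forever() are placeholders/assumptions, not confirmed APIs.
def run_server(config):
    queue_server, workers = executor.start(config)
    try:
        serve_forever()  # placeholder for the API server's main loop
    finally:
        for worker in workers:
            worker.cancel()  # assumed shutdown hook on RequestWorker
        if queue_server is not None:
            # Only set when the multiprocessing queue backend is used.
            queue_server.terminate()
            queue_server.join()
```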