skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/requests/requests.py
CHANGED
|
@@ -1,34 +1,42 @@
|
|
|
1
1
|
"""Utilities for REST API."""
|
|
2
|
+
import asyncio
|
|
3
|
+
import atexit
|
|
2
4
|
import contextlib
|
|
3
5
|
import dataclasses
|
|
4
6
|
import enum
|
|
5
7
|
import functools
|
|
6
|
-
import json
|
|
7
8
|
import os
|
|
8
9
|
import pathlib
|
|
9
10
|
import shutil
|
|
10
11
|
import signal
|
|
11
12
|
import sqlite3
|
|
13
|
+
import threading
|
|
12
14
|
import time
|
|
13
15
|
import traceback
|
|
14
|
-
from typing import Any, Callable, Dict, List, Optional,
|
|
16
|
+
from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
|
|
17
|
+
Tuple)
|
|
18
|
+
import uuid
|
|
15
19
|
|
|
20
|
+
import anyio
|
|
16
21
|
import colorama
|
|
17
22
|
import filelock
|
|
23
|
+
import orjson
|
|
18
24
|
|
|
19
25
|
from sky import exceptions
|
|
20
26
|
from sky import global_user_state
|
|
21
27
|
from sky import sky_logging
|
|
28
|
+
from sky import skypilot_config
|
|
29
|
+
from sky.metrics import utils as metrics_lib
|
|
22
30
|
from sky.server import common as server_common
|
|
23
31
|
from sky.server import constants as server_constants
|
|
32
|
+
from sky.server import daemons
|
|
24
33
|
from sky.server.requests import payloads
|
|
25
34
|
from sky.server.requests.serializers import decoders
|
|
26
35
|
from sky.server.requests.serializers import encoders
|
|
27
|
-
from sky.utils import
|
|
36
|
+
from sky.utils import asyncio_utils
|
|
28
37
|
from sky.utils import common_utils
|
|
29
|
-
from sky.utils import db_utils
|
|
30
|
-
from sky.utils import env_options
|
|
31
38
|
from sky.utils import ux_utils
|
|
39
|
+
from sky.utils.db import db_utils
|
|
32
40
|
|
|
33
41
|
logger = sky_logging.init_logger(__name__)
|
|
34
42
|
|
|
@@ -37,8 +45,12 @@ REQUEST_TABLE = 'requests'
|
|
|
37
45
|
COL_CLUSTER_NAME = 'cluster_name'
|
|
38
46
|
COL_USER_ID = 'user_id'
|
|
39
47
|
COL_STATUS_MSG = 'status_msg'
|
|
48
|
+
COL_SHOULD_RETRY = 'should_retry'
|
|
49
|
+
COL_FINISHED_AT = 'finished_at'
|
|
40
50
|
REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
|
|
41
51
|
|
|
52
|
+
DEFAULT_REQUESTS_RETENTION_HOURS = 24 # 1 day
|
|
53
|
+
|
|
42
54
|
# TODO(zhwu): For scalability, there are several TODOs:
|
|
43
55
|
# [x] Have a way to queue requests.
|
|
44
56
|
# [ ] Move logs to persistent place.
|
|
@@ -62,6 +74,10 @@ class RequestStatus(enum.Enum):
|
|
|
62
74
|
color = _STATUS_TO_COLOR[self]
|
|
63
75
|
return f'{color}{self.value}{colorama.Style.RESET_ALL}'
|
|
64
76
|
|
|
77
|
+
@classmethod
|
|
78
|
+
def finished_status(cls) -> List['RequestStatus']:
|
|
79
|
+
return [cls.SUCCEEDED, cls.FAILED, cls.CANCELLED]
|
|
80
|
+
|
|
65
81
|
|
|
66
82
|
_STATUS_TO_COLOR = {
|
|
67
83
|
RequestStatus.PENDING: colorama.Fore.BLUE,
|
|
@@ -85,6 +101,8 @@ REQUEST_COLUMNS = [
|
|
|
85
101
|
'schedule_type',
|
|
86
102
|
COL_USER_ID,
|
|
87
103
|
COL_STATUS_MSG,
|
|
104
|
+
COL_SHOULD_RETRY,
|
|
105
|
+
COL_FINISHED_AT,
|
|
88
106
|
]
|
|
89
107
|
|
|
90
108
|
|
|
@@ -95,27 +113,6 @@ class ScheduleType(enum.Enum):
|
|
|
95
113
|
SHORT = 'short'
|
|
96
114
|
|
|
97
115
|
|
|
98
|
-
@dataclasses.dataclass
|
|
99
|
-
class RequestPayload:
|
|
100
|
-
"""The payload for the requests."""
|
|
101
|
-
|
|
102
|
-
request_id: str
|
|
103
|
-
name: str
|
|
104
|
-
entrypoint: str
|
|
105
|
-
request_body: str
|
|
106
|
-
status: str
|
|
107
|
-
created_at: float
|
|
108
|
-
user_id: str
|
|
109
|
-
return_value: str
|
|
110
|
-
error: str
|
|
111
|
-
pid: Optional[int]
|
|
112
|
-
schedule_type: str
|
|
113
|
-
user_name: Optional[str] = None
|
|
114
|
-
# Resources the request operates on.
|
|
115
|
-
cluster_name: Optional[str] = None
|
|
116
|
-
status_msg: Optional[str] = None
|
|
117
|
-
|
|
118
|
-
|
|
119
116
|
@dataclasses.dataclass
|
|
120
117
|
class Request:
|
|
121
118
|
"""A SkyPilot API request."""
|
|
@@ -136,6 +133,10 @@ class Request:
|
|
|
136
133
|
cluster_name: Optional[str] = None
|
|
137
134
|
# Status message of the request, indicates the reason of current status.
|
|
138
135
|
status_msg: Optional[str] = None
|
|
136
|
+
# Whether the request should be retried.
|
|
137
|
+
should_retry: bool = False
|
|
138
|
+
# When the request finished.
|
|
139
|
+
finished_at: Optional[float] = None
|
|
139
140
|
|
|
140
141
|
@property
|
|
141
142
|
def log_path(self) -> pathlib.Path:
|
|
@@ -179,7 +180,7 @@ class Request:
|
|
|
179
180
|
@classmethod
|
|
180
181
|
def from_row(cls, row: Tuple[Any, ...]) -> 'Request':
|
|
181
182
|
content = dict(zip(REQUEST_COLUMNS, row))
|
|
182
|
-
return cls.decode(RequestPayload(**content))
|
|
183
|
+
return cls.decode(payloads.RequestPayload(**content))
|
|
183
184
|
|
|
184
185
|
def to_row(self) -> Tuple[Any, ...]:
|
|
185
186
|
payload = self.encode()
|
|
@@ -188,7 +189,7 @@ class Request:
|
|
|
188
189
|
row.append(getattr(payload, k))
|
|
189
190
|
return tuple(row)
|
|
190
191
|
|
|
191
|
-
def readable_encode(self) -> RequestPayload:
|
|
192
|
+
def readable_encode(self) -> payloads.RequestPayload:
|
|
192
193
|
"""Serialize the SkyPilot API request for display purposes.
|
|
193
194
|
|
|
194
195
|
This function should be called on the server side to serialize the
|
|
@@ -204,15 +205,16 @@ class Request:
|
|
|
204
205
|
"""
|
|
205
206
|
assert isinstance(self.request_body,
|
|
206
207
|
payloads.RequestBody), (self.name, self.request_body)
|
|
207
|
-
|
|
208
|
-
|
|
208
|
+
user = global_user_state.get_user(self.user_id)
|
|
209
|
+
user_name = user.name if user is not None else None
|
|
210
|
+
return payloads.RequestPayload(
|
|
209
211
|
request_id=self.request_id,
|
|
210
212
|
name=self.name,
|
|
211
213
|
entrypoint=self.entrypoint.__name__,
|
|
212
214
|
request_body=self.request_body.model_dump_json(),
|
|
213
215
|
status=self.status.value,
|
|
214
|
-
return_value=
|
|
215
|
-
error=
|
|
216
|
+
return_value=orjson.dumps(None).decode('utf-8'),
|
|
217
|
+
error=orjson.dumps(None).decode('utf-8'),
|
|
216
218
|
pid=None,
|
|
217
219
|
created_at=self.created_at,
|
|
218
220
|
schedule_type=self.schedule_type.value,
|
|
@@ -220,27 +222,31 @@ class Request:
|
|
|
220
222
|
user_name=user_name,
|
|
221
223
|
cluster_name=self.cluster_name,
|
|
222
224
|
status_msg=self.status_msg,
|
|
225
|
+
should_retry=self.should_retry,
|
|
226
|
+
finished_at=self.finished_at,
|
|
223
227
|
)
|
|
224
228
|
|
|
225
|
-
def encode(self) -> RequestPayload:
|
|
229
|
+
def encode(self) -> payloads.RequestPayload:
|
|
226
230
|
"""Serialize the SkyPilot API request."""
|
|
227
231
|
assert isinstance(self.request_body,
|
|
228
232
|
payloads.RequestBody), (self.name, self.request_body)
|
|
229
233
|
try:
|
|
230
|
-
return RequestPayload(
|
|
234
|
+
return payloads.RequestPayload(
|
|
231
235
|
request_id=self.request_id,
|
|
232
236
|
name=self.name,
|
|
233
237
|
entrypoint=encoders.pickle_and_encode(self.entrypoint),
|
|
234
238
|
request_body=encoders.pickle_and_encode(self.request_body),
|
|
235
239
|
status=self.status.value,
|
|
236
|
-
return_value=
|
|
237
|
-
error=
|
|
240
|
+
return_value=orjson.dumps(self.return_value).decode('utf-8'),
|
|
241
|
+
error=orjson.dumps(self.error).decode('utf-8'),
|
|
238
242
|
pid=self.pid,
|
|
239
243
|
created_at=self.created_at,
|
|
240
244
|
schedule_type=self.schedule_type.value,
|
|
241
245
|
user_id=self.user_id,
|
|
242
246
|
cluster_name=self.cluster_name,
|
|
243
247
|
status_msg=self.status_msg,
|
|
248
|
+
should_retry=self.should_retry,
|
|
249
|
+
finished_at=self.finished_at,
|
|
244
250
|
)
|
|
245
251
|
except (TypeError, ValueError) as e:
|
|
246
252
|
# The error is unexpected, so we don't suppress the stack trace.
|
|
@@ -255,7 +261,7 @@ class Request:
|
|
|
255
261
|
raise
|
|
256
262
|
|
|
257
263
|
@classmethod
|
|
258
|
-
def decode(cls, payload: RequestPayload) -> 'Request':
|
|
264
|
+
def decode(cls, payload: payloads.RequestPayload) -> 'Request':
|
|
259
265
|
"""Deserialize the SkyPilot API request."""
|
|
260
266
|
try:
|
|
261
267
|
return cls(
|
|
@@ -264,14 +270,16 @@ class Request:
|
|
|
264
270
|
entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
|
|
265
271
|
request_body=decoders.decode_and_unpickle(payload.request_body),
|
|
266
272
|
status=RequestStatus(payload.status),
|
|
267
|
-
return_value=
|
|
268
|
-
error=
|
|
273
|
+
return_value=orjson.loads(payload.return_value),
|
|
274
|
+
error=orjson.loads(payload.error),
|
|
269
275
|
pid=payload.pid,
|
|
270
276
|
created_at=payload.created_at,
|
|
271
277
|
schedule_type=ScheduleType(payload.schedule_type),
|
|
272
278
|
user_id=payload.user_id,
|
|
273
279
|
cluster_name=payload.cluster_name,
|
|
274
280
|
status_msg=payload.status_msg,
|
|
281
|
+
should_retry=payload.should_retry,
|
|
282
|
+
finished_at=payload.finished_at,
|
|
275
283
|
)
|
|
276
284
|
except (TypeError, ValueError) as e:
|
|
277
285
|
logger.error(
|
|
@@ -286,113 +294,104 @@ class Request:
|
|
|
286
294
|
raise
|
|
287
295
|
|
|
288
296
|
|
|
289
|
-
def
|
|
290
|
-
"""
|
|
291
|
-
|
|
292
|
-
Args:
|
|
293
|
-
cluster_name: the name of the cluster.
|
|
294
|
-
exclude_request_names: exclude requests with these names. This is to
|
|
295
|
-
prevent killing the caller request.
|
|
296
|
-
"""
|
|
297
|
-
request_ids = [
|
|
298
|
-
request_task.request_id for request_task in get_request_tasks(
|
|
299
|
-
cluster_names=[cluster_name],
|
|
300
|
-
status=[RequestStatus.PENDING, RequestStatus.RUNNING],
|
|
301
|
-
exclude_request_names=[exclude_request_name])
|
|
302
|
-
]
|
|
303
|
-
kill_requests(request_ids)
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
def refresh_cluster_status_event():
|
|
307
|
-
"""Periodically refresh the cluster status."""
|
|
308
|
-
# pylint: disable=import-outside-toplevel
|
|
309
|
-
from sky import core
|
|
310
|
-
|
|
311
|
-
# Disable logging for periodic refresh to avoid the usage message being
|
|
312
|
-
# sent multiple times.
|
|
313
|
-
os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
|
|
314
|
-
|
|
315
|
-
while True:
|
|
316
|
-
logger.info('=== Refreshing cluster status ===')
|
|
317
|
-
# This periodically refresh will hold the lock for the cluster being
|
|
318
|
-
# refreshed, but it is OK because other operations will just wait for
|
|
319
|
-
# the lock and get the just refreshed status without refreshing again.
|
|
320
|
-
core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
|
|
321
|
-
logger.info(
|
|
322
|
-
'Status refreshed. Sleeping '
|
|
323
|
-
f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
|
|
324
|
-
' seconds for the next refresh...\n')
|
|
325
|
-
time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
|
|
326
|
-
|
|
297
|
+
def get_new_request_id() -> str:
|
|
298
|
+
"""Get a new request ID."""
|
|
299
|
+
return str(uuid.uuid4())
|
|
327
300
|
|
|
328
|
-
@dataclasses.dataclass
|
|
329
|
-
class InternalRequestDaemon:
|
|
330
|
-
id: str
|
|
331
|
-
name: str
|
|
332
|
-
event_fn: Callable[[], None]
|
|
333
301
|
|
|
302
|
+
def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
|
|
303
|
+
"""Serialize the SkyPilot API request for display purposes.
|
|
334
304
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
# set to updated status automatically, without showing users the hint of
|
|
339
|
-
# cluster being stopped or down when `sky status -r` is called.
|
|
340
|
-
InternalRequestDaemon(id='skypilot-status-refresh-daemon',
|
|
341
|
-
name='status',
|
|
342
|
-
event_fn=refresh_cluster_status_event)
|
|
343
|
-
]
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
def kill_requests(request_ids: Optional[List[str]] = None,
|
|
347
|
-
user_id: Optional[str] = None) -> List[str]:
|
|
348
|
-
"""Kill a SkyPilot API request and set its status to cancelled.
|
|
349
|
-
|
|
350
|
-
Args:
|
|
351
|
-
request_ids: The request IDs to kill. If None, all requests for the
|
|
352
|
-
user are killed.
|
|
353
|
-
user_id: The user ID to kill requests for. If None, all users are
|
|
354
|
-
killed.
|
|
355
|
-
|
|
356
|
-
Returns:
|
|
357
|
-
A list of request IDs that were cancelled.
|
|
358
|
-
"""
|
|
359
|
-
if request_ids is None:
|
|
360
|
-
request_ids = [
|
|
361
|
-
request_task.request_id for request_task in get_request_tasks(
|
|
362
|
-
user_id=user_id,
|
|
363
|
-
status=[RequestStatus.RUNNING, RequestStatus.PENDING],
|
|
364
|
-
# Avoid cancelling the cancel request itself.
|
|
365
|
-
exclude_request_names=['sky.api_cancel'])
|
|
366
|
-
]
|
|
367
|
-
cancelled_request_ids = []
|
|
368
|
-
for request_id in request_ids:
|
|
369
|
-
with update_request(request_id) as request_record:
|
|
370
|
-
if request_record is None:
|
|
371
|
-
logger.debug(f'No request ID {request_id}')
|
|
372
|
-
continue
|
|
373
|
-
# Skip internal requests. The internal requests are scheduled with
|
|
374
|
-
# request_id in range(len(INTERNAL_REQUEST_EVENTS)).
|
|
375
|
-
if request_record.request_id in set(
|
|
376
|
-
event.id for event in INTERNAL_REQUEST_DAEMONS):
|
|
377
|
-
continue
|
|
378
|
-
if request_record.status > RequestStatus.RUNNING:
|
|
379
|
-
logger.debug(f'Request {request_id} already finished')
|
|
380
|
-
continue
|
|
381
|
-
if request_record.pid is not None:
|
|
382
|
-
logger.debug(f'Killing request process {request_record.pid}')
|
|
383
|
-
# Use SIGTERM instead of SIGKILL:
|
|
384
|
-
# - The executor can handle SIGTERM gracefully
|
|
385
|
-
# - After SIGTERM, the executor can reuse the request process
|
|
386
|
-
# for other requests, avoiding the overhead of forking a new
|
|
387
|
-
# process for each request.
|
|
388
|
-
os.kill(request_record.pid, signal.SIGTERM)
|
|
389
|
-
request_record.status = RequestStatus.CANCELLED
|
|
390
|
-
cancelled_request_ids.append(request_id)
|
|
391
|
-
return cancelled_request_ids
|
|
305
|
+
This function should be called on the server side to serialize the
|
|
306
|
+
request body into human readable format, e.g., the entrypoint should
|
|
307
|
+
be a string, and the pid, error, or return value are not needed.
|
|
392
308
|
|
|
309
|
+
The returned value will then be displayed on the client side in request
|
|
310
|
+
table.
|
|
393
311
|
|
|
394
|
-
|
|
395
|
-
|
|
312
|
+
We do not use `encode` for display to avoid a large amount of data being
|
|
313
|
+
sent to the client side, especially for the request table could include
|
|
314
|
+
all the requests.
|
|
315
|
+
"""
|
|
316
|
+
encoded_requests = []
|
|
317
|
+
all_users = global_user_state.get_all_users()
|
|
318
|
+
all_users_map = {user.id: user.name for user in all_users}
|
|
319
|
+
for request in requests:
|
|
320
|
+
if request.request_body is not None:
|
|
321
|
+
assert isinstance(request.request_body,
|
|
322
|
+
payloads.RequestBody), (request.name,
|
|
323
|
+
request.request_body)
|
|
324
|
+
user_name = all_users_map.get(request.user_id)
|
|
325
|
+
payload = payloads.RequestPayload(
|
|
326
|
+
request_id=request.request_id,
|
|
327
|
+
name=request.name,
|
|
328
|
+
entrypoint=request.entrypoint.__name__
|
|
329
|
+
if request.entrypoint is not None else '',
|
|
330
|
+
request_body=request.request_body.model_dump_json()
|
|
331
|
+
if request.request_body is not None else
|
|
332
|
+
orjson.dumps(None).decode('utf-8'),
|
|
333
|
+
status=request.status.value,
|
|
334
|
+
return_value=orjson.dumps(None).decode('utf-8'),
|
|
335
|
+
error=orjson.dumps(None).decode('utf-8'),
|
|
336
|
+
pid=None,
|
|
337
|
+
created_at=request.created_at,
|
|
338
|
+
schedule_type=request.schedule_type.value,
|
|
339
|
+
user_id=request.user_id,
|
|
340
|
+
user_name=user_name,
|
|
341
|
+
cluster_name=request.cluster_name,
|
|
342
|
+
status_msg=request.status_msg,
|
|
343
|
+
should_retry=request.should_retry,
|
|
344
|
+
finished_at=request.finished_at,
|
|
345
|
+
)
|
|
346
|
+
encoded_requests.append(payload)
|
|
347
|
+
return encoded_requests
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _update_request_row_fields(
|
|
351
|
+
row: Tuple[Any, ...],
|
|
352
|
+
fields: Optional[List[str]] = None) -> Tuple[Any, ...]:
|
|
353
|
+
"""Update the request row fields."""
|
|
354
|
+
if not fields:
|
|
355
|
+
return row
|
|
356
|
+
|
|
357
|
+
# Convert tuple to dictionary for easier manipulation
|
|
358
|
+
content = dict(zip(fields, row))
|
|
359
|
+
|
|
360
|
+
# Required fields in RequestPayload
|
|
361
|
+
if 'request_id' not in fields:
|
|
362
|
+
content['request_id'] = ''
|
|
363
|
+
if 'name' not in fields:
|
|
364
|
+
content['name'] = ''
|
|
365
|
+
if 'entrypoint' not in fields:
|
|
366
|
+
content['entrypoint'] = server_constants.EMPTY_PICKLED_VALUE
|
|
367
|
+
if 'request_body' not in fields:
|
|
368
|
+
content['request_body'] = server_constants.EMPTY_PICKLED_VALUE
|
|
369
|
+
if 'status' not in fields:
|
|
370
|
+
content['status'] = RequestStatus.PENDING.value
|
|
371
|
+
if 'created_at' not in fields:
|
|
372
|
+
content['created_at'] = 0
|
|
373
|
+
if 'user_id' not in fields:
|
|
374
|
+
content['user_id'] = ''
|
|
375
|
+
if 'return_value' not in fields:
|
|
376
|
+
content['return_value'] = orjson.dumps(None).decode('utf-8')
|
|
377
|
+
if 'error' not in fields:
|
|
378
|
+
content['error'] = orjson.dumps(None).decode('utf-8')
|
|
379
|
+
if 'schedule_type' not in fields:
|
|
380
|
+
content['schedule_type'] = ScheduleType.SHORT.value
|
|
381
|
+
# Optional fields in RequestPayload
|
|
382
|
+
if 'pid' not in fields:
|
|
383
|
+
content['pid'] = None
|
|
384
|
+
if 'cluster_name' not in fields:
|
|
385
|
+
content['cluster_name'] = None
|
|
386
|
+
if 'status_msg' not in fields:
|
|
387
|
+
content['status_msg'] = None
|
|
388
|
+
if 'should_retry' not in fields:
|
|
389
|
+
content['should_retry'] = False
|
|
390
|
+
if 'finished_at' not in fields:
|
|
391
|
+
content['finished_at'] = None
|
|
392
|
+
|
|
393
|
+
# Convert back to tuple in the same order as REQUEST_COLUMNS
|
|
394
|
+
return tuple(content[col] for col in REQUEST_COLUMNS)
|
|
396
395
|
|
|
397
396
|
|
|
398
397
|
def create_table(cursor, conn):
|
|
@@ -425,13 +424,45 @@ def create_table(cursor, conn):
|
|
|
425
424
|
{COL_CLUSTER_NAME} TEXT,
|
|
426
425
|
schedule_type TEXT,
|
|
427
426
|
{COL_USER_ID} TEXT,
|
|
428
|
-
{COL_STATUS_MSG} TEXT
|
|
427
|
+
{COL_STATUS_MSG} TEXT,
|
|
428
|
+
{COL_SHOULD_RETRY} INTEGER,
|
|
429
|
+
{COL_FINISHED_AT} REAL
|
|
430
|
+
)""")
|
|
429
431
|
|
|
430
432
|
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
|
|
431
433
|
'TEXT')
|
|
434
|
+
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
|
|
435
|
+
'INTEGER')
|
|
436
|
+
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
|
|
437
|
+
'REAL')
|
|
438
|
+
|
|
439
|
+
# Add an index on (status, name) to speed up queries
|
|
440
|
+
# that filter on these columns.
|
|
441
|
+
cursor.execute(f"""\
|
|
442
|
+
CREATE INDEX IF NOT EXISTS status_name_idx ON {REQUEST_TABLE} (status, name) WHERE status IN ('PENDING', 'RUNNING');
|
|
443
|
+
""")
|
|
444
|
+
# Add an index on cluster_name to speed up queries
|
|
445
|
+
# that filter on this column.
|
|
446
|
+
cursor.execute(f"""\
|
|
447
|
+
CREATE INDEX IF NOT EXISTS cluster_name_idx ON {REQUEST_TABLE} ({COL_CLUSTER_NAME}) WHERE status IN ('PENDING', 'RUNNING');
|
|
448
|
+
""")
|
|
449
|
+
# Add an index on created_at to speed up queries that sort on this column.
|
|
450
|
+
cursor.execute(f"""\
|
|
451
|
+
CREATE INDEX IF NOT EXISTS created_at_idx ON {REQUEST_TABLE} (created_at);
|
|
452
|
+
""")
|
|
432
453
|
|
|
433
454
|
|
|
434
455
|
_DB = None
|
|
456
|
+
_init_db_lock = threading.Lock()
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def _init_db_within_lock():
|
|
460
|
+
global _DB
|
|
461
|
+
if _DB is None:
|
|
462
|
+
db_path = os.path.expanduser(
|
|
463
|
+
server_constants.API_SERVER_REQUEST_DB_PATH)
|
|
464
|
+
pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
|
|
465
|
+
_DB = db_utils.SQLiteConn(db_path, create_table)
|
|
435
466
|
|
|
436
467
|
|
|
437
468
|
def init_db(func):
|
|
@@ -439,21 +470,65 @@ def init_db(func):
|
|
|
439
470
|
|
|
440
471
|
@functools.wraps(func)
|
|
441
472
|
def wrapper(*args, **kwargs):
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
473
|
+
if _DB is not None:
|
|
474
|
+
return func(*args, **kwargs)
|
|
475
|
+
with _init_db_lock:
|
|
476
|
+
_init_db_within_lock()
|
|
445
477
|
return func(*args, **kwargs)
|
|
446
478
|
|
|
447
479
|
return wrapper
|
|
448
480
|
|
|
449
481
|
|
|
482
|
+
def init_db_async(func):
|
|
483
|
+
"""Async version of init_db."""
|
|
484
|
+
|
|
485
|
+
@functools.wraps(func)
|
|
486
|
+
async def wrapper(*args, **kwargs):
|
|
487
|
+
if _DB is not None:
|
|
488
|
+
return await func(*args, **kwargs)
|
|
489
|
+
# If _DB is not initialized, init_db_async will be blocked if there
|
|
490
|
+
# is a thread initializing _DB, this is fine since it occurs on process
|
|
491
|
+
# startup.
|
|
492
|
+
with _init_db_lock:
|
|
493
|
+
_init_db_within_lock()
|
|
494
|
+
return await func(*args, **kwargs)
|
|
495
|
+
|
|
496
|
+
return wrapper
|
|
497
|
+
|
|
498
|
+
|
|
450
499
|
def reset_db_and_logs():
|
|
451
500
|
"""Create the database."""
|
|
501
|
+
logger.debug('clearing local API server database')
|
|
452
502
|
server_common.clear_local_api_server_database()
|
|
503
|
+
logger.debug(
|
|
504
|
+
f'clearing local API server logs directory at {REQUEST_LOG_PATH_PREFIX}'
|
|
505
|
+
)
|
|
453
506
|
shutil.rmtree(pathlib.Path(REQUEST_LOG_PATH_PREFIX).expanduser(),
|
|
454
507
|
ignore_errors=True)
|
|
508
|
+
logger.debug('clearing local API server client directory at '
|
|
509
|
+
f'{server_common.API_SERVER_CLIENT_DIR.expanduser()}')
|
|
455
510
|
shutil.rmtree(server_common.API_SERVER_CLIENT_DIR.expanduser(),
|
|
456
511
|
ignore_errors=True)
|
|
512
|
+
with _init_db_lock:
|
|
513
|
+
_init_db_within_lock()
|
|
514
|
+
assert _DB is not None
|
|
515
|
+
with _DB.conn:
|
|
516
|
+
cursor = _DB.conn.cursor()
|
|
517
|
+
cursor.execute('SELECT sqlite_version()')
|
|
518
|
+
row = cursor.fetchone()
|
|
519
|
+
if row is None:
|
|
520
|
+
raise RuntimeError('Failed to get SQLite version')
|
|
521
|
+
version_str = row[0]
|
|
522
|
+
version_parts = version_str.split('.')
|
|
523
|
+
assert len(version_parts) >= 2, \
|
|
524
|
+
f'Invalid version string: {version_str}'
|
|
525
|
+
major, minor = int(version_parts[0]), int(version_parts[1])
|
|
526
|
+
# SQLite 3.35.0+ supports RETURNING statements.
|
|
527
|
+
# 3.35.0 was released in March 2021.
|
|
528
|
+
if not ((major > 3) or (major == 3 and minor >= 35)):
|
|
529
|
+
raise RuntimeError(
|
|
530
|
+
f'SQLite version {version_str} is not supported. '
|
|
531
|
+
'Please upgrade to SQLite 3.35.0 or later.')
|
|
457
532
|
|
|
458
533
|
|
|
459
534
|
def request_lock_path(request_id: str) -> str:
|
|
@@ -462,69 +537,349 @@ def request_lock_path(request_id: str) -> str:
|
|
|
462
537
|
return os.path.join(lock_path, f'.{request_id}.lock')
|
|
463
538
|
|
|
464
539
|
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
540
|
+
def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
|
|
541
|
+
"""Kill all pending and running requests for a cluster.
|
|
542
|
+
|
|
543
|
+
Args:
|
|
544
|
+
cluster_name: the name of the cluster.
|
|
545
|
+
exclude_request_names: exclude requests with these names. This is to
|
|
546
|
+
prevent killing the caller request.
|
|
547
|
+
"""
|
|
548
|
+
request_ids = [
|
|
549
|
+
request_task.request_id
|
|
550
|
+
for request_task in get_request_tasks(req_filter=RequestTaskFilter(
|
|
551
|
+
status=[RequestStatus.PENDING, RequestStatus.RUNNING],
|
|
552
|
+
exclude_request_names=[exclude_request_name],
|
|
553
|
+
cluster_names=[cluster_name],
|
|
554
|
+
fields=['request_id']))
|
|
555
|
+
]
|
|
556
|
+
_kill_requests(request_ids)
|
|
473
557
|
|
|
474
558
|
|
|
475
|
-
def
|
|
559
|
+
def kill_requests(request_ids: Optional[List[str]] = None,
|
|
560
|
+
user_id: Optional[str] = None) -> List[str]:
|
|
561
|
+
"""Kill requests with a given request ID prefix."""
|
|
562
|
+
expanded_request_ids: Optional[List[str]] = None
|
|
563
|
+
if request_ids is not None:
|
|
564
|
+
expanded_request_ids = []
|
|
565
|
+
for request_id in request_ids:
|
|
566
|
+
request_tasks = get_requests_with_prefix(request_id,
|
|
567
|
+
fields=['request_id'])
|
|
568
|
+
if request_tasks is None or len(request_tasks) == 0:
|
|
569
|
+
continue
|
|
570
|
+
if len(request_tasks) > 1:
|
|
571
|
+
raise ValueError(f'Multiple requests found for '
|
|
572
|
+
f'request ID prefix: {request_id}')
|
|
573
|
+
expanded_request_ids.append(request_tasks[0].request_id)
|
|
574
|
+
return _kill_requests(request_ids=expanded_request_ids, user_id=user_id)
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
# needed for backward compatibility. Remove by v0.10.7 or v0.12.0
|
|
578
|
+
# and rename kill_requests to kill_requests_with_prefix.
|
|
579
|
+
kill_requests_with_prefix = kill_requests
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
def _should_kill_request(request_id: str,
|
|
583
|
+
request_record: Optional[Request]) -> bool:
|
|
584
|
+
if request_record is None:
|
|
585
|
+
logger.debug(f'No request ID {request_id}')
|
|
586
|
+
return False
|
|
587
|
+
# Skip internal requests. The internal requests are scheduled with
|
|
588
|
+
# request_id in range(len(INTERNAL_REQUEST_EVENTS)).
|
|
589
|
+
if request_record.request_id in set(
|
|
590
|
+
event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
|
|
591
|
+
return False
|
|
592
|
+
if request_record.status > RequestStatus.RUNNING:
|
|
593
|
+
logger.debug(f'Request {request_id} already finished')
|
|
594
|
+
return False
|
|
595
|
+
return True
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def _kill_requests(request_ids: Optional[List[str]] = None,
|
|
599
|
+
user_id: Optional[str] = None) -> List[str]:
|
|
600
|
+
"""Kill a SkyPilot API request and set its status to cancelled.
|
|
601
|
+
|
|
602
|
+
Args:
|
|
603
|
+
request_ids: The request IDs to kill. If None, all requests for the
|
|
604
|
+
user are killed.
|
|
605
|
+
user_id: The user ID to kill requests for. If None, all users are
|
|
606
|
+
killed.
|
|
607
|
+
|
|
608
|
+
Returns:
|
|
609
|
+
A list of request IDs that were cancelled.
|
|
610
|
+
"""
|
|
611
|
+
if request_ids is None:
|
|
612
|
+
request_ids = [
|
|
613
|
+
request_task.request_id
|
|
614
|
+
for request_task in get_request_tasks(req_filter=RequestTaskFilter(
|
|
615
|
+
status=[RequestStatus.PENDING, RequestStatus.RUNNING],
|
|
616
|
+
# Avoid cancelling the cancel request itself.
|
|
617
|
+
exclude_request_names=['sky.api_cancel'],
|
|
618
|
+
user_id=user_id,
|
|
619
|
+
fields=['request_id']))
|
|
620
|
+
]
|
|
621
|
+
cancelled_request_ids = []
|
|
622
|
+
for request_id in request_ids:
|
|
623
|
+
with update_request(request_id) as request_record:
|
|
624
|
+
if not _should_kill_request(request_id, request_record):
|
|
625
|
+
continue
|
|
626
|
+
if request_record.pid is not None:
|
|
627
|
+
logger.debug(f'Killing request process {request_record.pid}')
|
|
628
|
+
# Use SIGTERM instead of SIGKILL:
|
|
629
|
+
# - The executor can handle SIGTERM gracefully
|
|
630
|
+
# - After SIGTERM, the executor can reuse the request process
|
|
631
|
+
# for other requests, avoiding the overhead of forking a new
|
|
632
|
+
# process for each request.
|
|
633
|
+
os.kill(request_record.pid, signal.SIGTERM)
|
|
634
|
+
request_record.status = RequestStatus.CANCELLED
|
|
635
|
+
request_record.finished_at = time.time()
|
|
636
|
+
cancelled_request_ids.append(request_id)
|
|
637
|
+
return cancelled_request_ids
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
@init_db_async
|
|
641
|
+
@asyncio_utils.shield
|
|
642
|
+
async def kill_request_async(request_id: str) -> bool:
|
|
643
|
+
"""Kill a SkyPilot API request and set its status to cancelled.
|
|
644
|
+
|
|
645
|
+
Returns:
|
|
646
|
+
True if the request was killed, False otherwise.
|
|
647
|
+
"""
|
|
648
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
649
|
+
request = await _get_request_no_lock_async(request_id)
|
|
650
|
+
if not _should_kill_request(request_id, request):
|
|
651
|
+
return False
|
|
652
|
+
assert request is not None
|
|
653
|
+
if request.pid is not None:
|
|
654
|
+
logger.debug(f'Killing request process {request.pid}')
|
|
655
|
+
# Use SIGTERM instead of SIGKILL:
|
|
656
|
+
# - The executor can handle SIGTERM gracefully
|
|
657
|
+
# - After SIGTERM, the executor can reuse the request process
|
|
658
|
+
# for other requests, avoiding the overhead of forking a new
|
|
659
|
+
# process for each request.
|
|
660
|
+
os.kill(request.pid, signal.SIGTERM)
|
|
661
|
+
request.status = RequestStatus.CANCELLED
|
|
662
|
+
request.finished_at = time.time()
|
|
663
|
+
await _add_or_update_request_no_lock_async(request)
|
|
664
|
+
return True
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
@contextlib.contextmanager
|
|
668
|
+
@init_db
|
|
669
|
+
@metrics_lib.time_me
|
|
670
|
+
def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
|
|
671
|
+
"""Get and update a SkyPilot API request."""
|
|
672
|
+
# Acquire the lock to avoid race conditions between multiple request
|
|
673
|
+
# operations, e.g. execute and cancel.
|
|
674
|
+
with filelock.FileLock(request_lock_path(request_id)):
|
|
675
|
+
request = _get_request_no_lock(request_id)
|
|
676
|
+
yield request
|
|
677
|
+
if request is not None:
|
|
678
|
+
_add_or_update_request_no_lock(request)
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
@init_db_async
|
|
682
|
+
@metrics_lib.time_me
|
|
683
|
+
@asyncio_utils.shield
|
|
684
|
+
async def update_status_async(request_id: str, status: RequestStatus) -> None:
|
|
685
|
+
"""Update the status of a request"""
|
|
686
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
687
|
+
request = await _get_request_no_lock_async(request_id)
|
|
688
|
+
if request is not None:
|
|
689
|
+
request.status = status
|
|
690
|
+
await _add_or_update_request_no_lock_async(request)
|
|
691
|
+
|
|
692
|
+
|
|
693
|
+
@init_db_async
|
|
694
|
+
@metrics_lib.time_me
|
|
695
|
+
@asyncio_utils.shield
|
|
696
|
+
async def update_status_msg_async(request_id: str, status_msg: str) -> None:
|
|
697
|
+
"""Update the status message of a request"""
|
|
698
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
699
|
+
request = await _get_request_no_lock_async(request_id)
|
|
700
|
+
if request is not None:
|
|
701
|
+
request.status_msg = status_msg
|
|
702
|
+
await _add_or_update_request_no_lock_async(request)
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
def _get_request_no_lock(
|
|
706
|
+
request_id: str,
|
|
707
|
+
fields: Optional[List[str]] = None) -> Optional[Request]:
|
|
476
708
|
"""Get a SkyPilot API request."""
|
|
477
709
|
assert _DB is not None
|
|
478
710
|
columns_str = ', '.join(REQUEST_COLUMNS)
|
|
711
|
+
if fields:
|
|
712
|
+
columns_str = ', '.join(fields)
|
|
479
713
|
with _DB.conn:
|
|
480
714
|
cursor = _DB.conn.cursor()
|
|
481
|
-
cursor.execute(
|
|
482
|
-
|
|
483
|
-
'WHERE request_id LIKE ?', (request_id + '%',))
|
|
715
|
+
cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
|
|
716
|
+
'WHERE request_id LIKE ?'), (request_id + '%',))
|
|
484
717
|
row = cursor.fetchone()
|
|
485
718
|
if row is None:
|
|
486
719
|
return None
|
|
720
|
+
if fields:
|
|
721
|
+
row = _update_request_row_fields(row, fields)
|
|
487
722
|
return Request.from_row(row)
|
|
488
723
|
|
|
489
724
|
|
|
490
|
-
|
|
491
|
-
|
|
725
|
+
async def _get_request_no_lock_async(
|
|
726
|
+
request_id: str,
|
|
727
|
+
fields: Optional[List[str]] = None) -> Optional[Request]:
|
|
728
|
+
"""Async version of _get_request_no_lock."""
|
|
729
|
+
assert _DB is not None
|
|
730
|
+
columns_str = ', '.join(REQUEST_COLUMNS)
|
|
731
|
+
if fields:
|
|
732
|
+
columns_str = ', '.join(fields)
|
|
733
|
+
async with _DB.execute_fetchall_async(
|
|
734
|
+
(f'SELECT {columns_str} FROM {REQUEST_TABLE} '
|
|
735
|
+
'WHERE request_id LIKE ?'), (request_id + '%',)) as rows:
|
|
736
|
+
row = rows[0] if rows else None
|
|
737
|
+
if row is None:
|
|
738
|
+
return None
|
|
739
|
+
if fields:
|
|
740
|
+
row = _update_request_row_fields(row, fields)
|
|
741
|
+
return Request.from_row(row)
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
@init_db_async
|
|
745
|
+
@metrics_lib.time_me
|
|
746
|
+
async def get_latest_request_id_async() -> Optional[str]:
|
|
492
747
|
"""Get the latest request ID."""
|
|
493
748
|
assert _DB is not None
|
|
494
|
-
with _DB.
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
row = cursor.fetchone()
|
|
499
|
-
return row[0] if row else None
|
|
749
|
+
async with _DB.execute_fetchall_async(
|
|
750
|
+
(f'SELECT request_id FROM {REQUEST_TABLE} '
|
|
751
|
+
'ORDER BY created_at DESC LIMIT 1')) as rows:
|
|
752
|
+
return rows[0][0] if rows else None
|
|
500
753
|
|
|
501
754
|
|
|
502
755
|
@init_db
|
|
503
|
-
|
|
756
|
+
@metrics_lib.time_me
|
|
757
|
+
def get_request(request_id: str,
|
|
758
|
+
fields: Optional[List[str]] = None) -> Optional[Request]:
|
|
504
759
|
"""Get a SkyPilot API request."""
|
|
505
760
|
with filelock.FileLock(request_lock_path(request_id)):
|
|
506
|
-
return _get_request_no_lock(request_id)
|
|
761
|
+
return _get_request_no_lock(request_id, fields)
|
|
507
762
|
|
|
508
763
|
|
|
509
|
-
@
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
764
|
+
@init_db_async
|
|
765
|
+
@metrics_lib.time_me_async
|
|
766
|
+
@asyncio_utils.shield
|
|
767
|
+
async def get_request_async(
|
|
768
|
+
request_id: str,
|
|
769
|
+
fields: Optional[List[str]] = None) -> Optional[Request]:
|
|
770
|
+
"""Async version of get_request."""
|
|
771
|
+
# TODO(aylei): figure out how to remove FileLock here to avoid the overhead
|
|
772
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
773
|
+
return await _get_request_no_lock_async(request_id, fields)
|
|
517
774
|
|
|
518
775
|
|
|
519
776
|
@init_db
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
777
|
+
@metrics_lib.time_me
|
|
778
|
+
def get_requests_with_prefix(
|
|
779
|
+
request_id_prefix: str,
|
|
780
|
+
fields: Optional[List[str]] = None) -> Optional[List[Request]]:
|
|
781
|
+
"""Get requests with a given request ID prefix."""
|
|
782
|
+
assert _DB is not None
|
|
783
|
+
if fields:
|
|
784
|
+
columns_str = ', '.join(fields)
|
|
785
|
+
else:
|
|
786
|
+
columns_str = ', '.join(REQUEST_COLUMNS)
|
|
787
|
+
with _DB.conn:
|
|
788
|
+
cursor = _DB.conn.cursor()
|
|
789
|
+
cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
|
|
790
|
+
'WHERE request_id LIKE ?'), (request_id_prefix + '%',))
|
|
791
|
+
rows = cursor.fetchall()
|
|
792
|
+
if not rows:
|
|
793
|
+
return None
|
|
794
|
+
if fields:
|
|
795
|
+
rows = [_update_request_row_fields(row, fields) for row in rows]
|
|
796
|
+
return [Request.from_row(row) for row in rows]
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
@init_db_async
|
|
800
|
+
@metrics_lib.time_me_async
|
|
801
|
+
@asyncio_utils.shield
|
|
802
|
+
async def get_requests_async_with_prefix(
|
|
803
|
+
request_id_prefix: str,
|
|
804
|
+
fields: Optional[List[str]] = None) -> Optional[List[Request]]:
|
|
805
|
+
"""Async version of get_request_with_prefix."""
|
|
806
|
+
assert _DB is not None
|
|
807
|
+
if fields:
|
|
808
|
+
columns_str = ', '.join(fields)
|
|
809
|
+
else:
|
|
810
|
+
columns_str = ', '.join(REQUEST_COLUMNS)
|
|
811
|
+
async with _DB.execute_fetchall_async(
|
|
812
|
+
(f'SELECT {columns_str} FROM {REQUEST_TABLE} '
|
|
813
|
+
'WHERE request_id LIKE ?'), (request_id_prefix + '%',)) as rows:
|
|
814
|
+
if not rows:
|
|
815
|
+
return None
|
|
816
|
+
if fields:
|
|
817
|
+
rows = [_update_request_row_fields(row, fields) for row in rows]
|
|
818
|
+
return [Request.from_row(row) for row in rows]
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
class StatusWithMsg(NamedTuple):
|
|
822
|
+
status: RequestStatus
|
|
823
|
+
status_msg: Optional[str] = None
|
|
824
|
+
|
|
825
|
+
|
|
826
|
+
@init_db_async
|
|
827
|
+
@metrics_lib.time_me_async
|
|
828
|
+
async def get_request_status_async(
|
|
829
|
+
request_id: str,
|
|
830
|
+
include_msg: bool = False,
|
|
831
|
+
) -> Optional[StatusWithMsg]:
|
|
832
|
+
"""Get the status of a request.
|
|
833
|
+
|
|
834
|
+
Args:
|
|
835
|
+
request_id: The ID of the request.
|
|
836
|
+
include_msg: Whether to include the status message.
|
|
837
|
+
|
|
838
|
+
Returns:
|
|
839
|
+
The status of the request. If the request is not found, returns
|
|
840
|
+
None.
|
|
841
|
+
"""
|
|
842
|
+
assert _DB is not None
|
|
843
|
+
columns = 'status'
|
|
844
|
+
if include_msg:
|
|
845
|
+
columns += ', status_msg'
|
|
846
|
+
sql = f'SELECT {columns} FROM {REQUEST_TABLE} WHERE request_id LIKE ?'
|
|
847
|
+
async with _DB.execute_fetchall_async(sql, (request_id + '%',)) as rows:
|
|
848
|
+
if rows is None or len(rows) == 0:
|
|
849
|
+
return None
|
|
850
|
+
status = RequestStatus(rows[0][0])
|
|
851
|
+
status_msg = rows[0][1] if include_msg else None
|
|
852
|
+
return StatusWithMsg(status, status_msg)
|
|
853
|
+
|
|
854
|
+
|
|
855
|
+
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def create_if_not_exists_async(request: Request) -> bool:
    """Insert a request unless one with the same request_id already exists.

    Returns:
        True if a new request row was inserted, False if it already existed.
    """
    assert _DB is not None
    cols = ', '.join(REQUEST_COLUMNS)
    placeholders = ', '.join(['?'] * len(REQUEST_COLUMNS))
    insert_sql = (
        f'INSERT INTO {REQUEST_TABLE} '
        f'({cols}) VALUES '
        f'({placeholders}) ON CONFLICT(request_id) DO NOTHING RETURNING ROWID')
    # No request lock is taken here: the lock only guards against racing with
    # the cancellation codepath, and a request cannot be cancelled before it
    # is created.
    inserted = await _DB.execute_get_returning_value_async(
        insert_sql, request.to_row())
    return bool(inserted)
|
|
878
|
+
|
|
879
|
+
|
|
880
|
+
@dataclasses.dataclass
class RequestTaskFilter:
    """Filter describing which requests to fetch from the requests table.

    Args:
        status: statuses of the requests to filter on; None means any.
        cluster_names: cluster names to filter on; None means any cluster.
        user_id: the user to filter on. If None, all users are included.
        exclude_request_names: request names to exclude. Mutually exclusive
            with include_request_names.
        include_request_names: request names to filter on. Mutually exclusive
            with exclude_request_names.
        finished_before: if provided, only include requests finished before
            this timestamp.
        limit: the number of requests to show. If None, show all requests.
        fields: columns to select; if None, all REQUEST_COLUMNS are selected.
        sort: whether to order the results by creation time, newest first.

    Raises:
        ValueError: If both exclude_request_names and include_request_names
            are provided.
    """
    status: Optional[List[RequestStatus]] = None
    cluster_names: Optional[List[str]] = None
    user_id: Optional[str] = None
    exclude_request_names: Optional[List[str]] = None
    include_request_names: Optional[List[str]] = None
    finished_before: Optional[float] = None
    limit: Optional[int] = None
    fields: Optional[List[str]] = None
    sort: bool = False

    def __post_init__(self):
        # include/exclude are contradictory filters; reject the combination
        # eagerly at construction time.
        if (self.exclude_request_names is not None and
                self.include_request_names is not None):
            raise ValueError(
                'Only one of exclude_request_names or include_request_names '
                'can be provided, not both.')

    def build_query(self) -> Tuple[str, List[Any]]:
        """Compose the SELECT statement and its bound parameters.

        Returns:
            A tuple of (SQL, SQL parameters).
        """

        def _quoted(values) -> str:
            # Inline values with repr() quoting, matching how this module
            # builds its IN (...) clauses.
            return ','.join(repr(v) for v in values)

        clauses = []
        params: List[Any] = []
        if self.status is not None:
            clauses.append(
                f'status IN ({_quoted(s.value for s in self.status)})')
        if self.include_request_names is not None:
            clauses.append(f'name IN ({_quoted(self.include_request_names)})')
        if self.exclude_request_names is not None:
            clauses.append(
                f'name NOT IN ({_quoted(self.exclude_request_names)})')
        if self.cluster_names is not None:
            clauses.append(
                f'{COL_CLUSTER_NAME} IN ({_quoted(self.cluster_names)})')
        if self.user_id is not None:
            clauses.append(f'{COL_USER_ID} = ?')
            params.append(self.user_id)
        if self.finished_before is not None:
            clauses.append('finished_at < ?')
            params.append(self.finished_before)
        where = ' AND '.join(clauses)
        if where:
            where = f' WHERE {where}'
        if self.fields:
            columns = ', '.join(self.fields)
        else:
            columns = ', '.join(REQUEST_COLUMNS)
        order = ' ORDER BY created_at DESC' if self.sort else ''
        query = f'SELECT {columns} FROM {REQUEST_TABLE}{where}{order}'
        if self.limit is not None:
            query += f' LIMIT {self.limit}'
        return query, params
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
@init_db
@metrics_lib.time_me
def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
    """Fetch all requests that match the given filter.

    Args:
        req_filter: the filter to apply to the requests. Refer to
            RequestTaskFilter for the details.
    """
    assert _DB is not None
    sql, params = req_filter.build_query()
    with _DB.conn:
        cursor = _DB.conn.cursor()
        cursor.execute(sql, params)
        rows = cursor.fetchall()
        if rows is None:
            return []
    fields = req_filter.fields
    if fields:
        rows = [_update_request_row_fields(row, fields) for row in rows]
    return [Request.from_row(row) for row in rows]
|
|
985
|
+
|
|
986
|
+
|
|
987
|
+
@init_db_async
@metrics_lib.time_me_async
async def get_request_tasks_async(
        req_filter: RequestTaskFilter) -> List[Request]:
    """Async version of get_request_tasks."""
    assert _DB is not None
    async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
        if not rows:
            return []
        fields = req_filter.fields
        if fields:
            rows = [_update_request_row_fields(row, fields) for row in rows]
        return [Request.from_row(row) for row in rows]
|
|
1001
|
+
|
|
1002
|
+
|
|
1003
|
+
@init_db_async
@metrics_lib.time_me_async
async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
    """Get a list of API request ids for shell completion."""
    assert _DB is not None
    # Alive requests (PENDING, RUNNING) sort before finished ones, and within
    # each group the newest requests come first — the ordering a user most
    # likely wants when tab-completing.
    query = f"""SELECT request_id FROM {REQUEST_TABLE}
            WHERE request_id LIKE ?
            ORDER BY
                CASE
                    WHEN status IN ('PENDING', 'RUNNING') THEN 0
                    ELSE 1
                END,
                created_at DESC
            LIMIT 1000"""
    async with _DB.execute_fetchall_async(query, (f'{incomplete}%',)) as rows:
        if not rows:
            return []
        return [row[0] for row in rows]
|
|
1023
|
+
|
|
1024
|
+
|
|
1025
|
+
# Pre-built upsert statement shared by the sync and async writers below.
_add_or_update_request_sql = (
    f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
    f'({", ".join(REQUEST_COLUMNS)}) VALUES '
    f'({", ".join(["?"] * len(REQUEST_COLUMNS))})')
|
|
585
1028
|
|
|
586
1029
|
|
|
587
1030
|
def _add_or_update_request_no_lock(request: Request):
    """Add or update a REST request into the database, without locking."""
    assert _DB is not None
    row = request.to_row()
    with _DB.conn:
        _DB.conn.cursor().execute(_add_or_update_request_sql, row)
|
|
598
1036
|
|
|
599
1037
|
|
|
600
|
-
def
|
|
601
|
-
"""
|
|
1038
|
+
async def _add_or_update_request_no_lock_async(request: Request):
    """Async version of _add_or_update_request_no_lock."""
    assert _DB is not None
    row = request.to_row()
    await _DB.execute_and_commit_async(_add_or_update_request_sql, row)
|
|
1043
|
+
|
|
1044
|
+
|
|
1045
|
+
def set_exception_stacktrace(e: BaseException) -> None:
    """Attach the current traceback to ``e`` as a ``stacktrace`` attribute."""
    with ux_utils.enable_traceback():
        # format_exc() captures the exception currently being handled; the
        # caller is expected to invoke this from within an except block.
        e.stacktrace = traceback.format_exc()
|
|
1049
|
+
|
|
1050
|
+
|
|
1051
|
+
def set_request_failed(request_id: str, e: BaseException) -> None:
    """Mark a request FAILED and record the error and finish time."""
    set_exception_stacktrace(e)
    with update_request(request_id) as req:
        assert req is not None, request_id
        req.status = RequestStatus.FAILED
        req.finished_at = time.time()
        req.set_error(e)
|
|
1059
|
+
|
|
1060
|
+
|
|
1061
|
+
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_failed_async(request_id: str, e: BaseException) -> None:
    """Mark a request FAILED and record the error and finish time (async)."""
    set_exception_stacktrace(e)
    # Hold the per-request file lock while reading and writing the row so we
    # do not race with other state transitions (e.g. cancellation).
    async with filelock.AsyncFileLock(request_lock_path(request_id)):
        req = await _get_request_no_lock_async(request_id)
        assert req is not None, request_id
        req.status = RequestStatus.FAILED
        req.finished_at = time.time()
        req.set_error(e)
        await _add_or_update_request_no_lock_async(req)
|
|
1074
|
+
|
|
1075
|
+
|
|
1076
|
+
def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
    """Mark a request SUCCEEDED and store its return value, if any."""
    with update_request(request_id) as req:
        assert req is not None, request_id
        req.status = RequestStatus.SUCCEEDED
        req.finished_at = time.time()
        if result is not None:
            req.set_return_value(result)
|
|
1084
|
+
|
|
1085
|
+
|
|
1086
|
+
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_succeeded_async(request_id: str,
                                      result: Optional[Any]) -> None:
    """Mark a request SUCCEEDED and store its return value, if any (async)."""
    # Hold the per-request file lock to serialize with other transitions.
    async with filelock.AsyncFileLock(request_lock_path(request_id)):
        req = await _get_request_no_lock_async(request_id)
        assert req is not None, request_id
        req.status = RequestStatus.SUCCEEDED
        req.finished_at = time.time()
        if result is not None:
            req.set_return_value(result)
        await _add_or_update_request_no_lock_async(req)
|
|
1100
|
+
|
|
1101
|
+
|
|
1102
|
+
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_cancelled_async(request_id: str) -> None:
    """Move a pending or running request to CANCELLED."""
    async with filelock.AsyncFileLock(request_lock_path(request_id)):
        req = await _get_request_no_lock_async(request_id)
        assert req is not None, request_id
        if req.status > RequestStatus.RUNNING:
            # Already finished or cancelled; nothing to do.
            return
        req.finished_at = time.time()
        req.status = RequestStatus.CANCELLED
        await _add_or_update_request_no_lock_async(req)
|
|
1116
|
+
|
|
1117
|
+
|
|
1118
|
+
@init_db_async
@metrics_lib.time_me_async
async def _delete_requests(request_ids: List[str]):
    """Delete the given requests from the database by id.

    Args:
        request_ids: ids of the request rows to remove.
    """
    # Fix: this coroutine was previously decorated with the synchronous
    # @init_db/@metrics_lib.time_me, unlike every other async function in
    # this module; use the async variants so DB init and timing wrap the
    # actual awaited execution.
    if not request_ids:
        # SQLite rejects 'IN ()' as a syntax error; nothing to delete anyway.
        return
    # Ids are inlined with repr() quoting, consistent with how
    # RequestTaskFilter.build_query builds IN (...) clauses.
    id_list_str = ','.join(repr(request_id) for request_id in request_ids)
    assert _DB is not None
    await _DB.execute_and_commit_async(
        f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
|
|
1126
|
+
|
|
1127
|
+
|
|
1128
|
+
async def clean_finished_requests_with_retention(retention_seconds: int,
                                                 batch_size: int = 1000):
    """Clean up finished requests older than the retention period.

    This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
    from the database and cleans up their associated log files.

    Args:
        retention_seconds: Requests older than this many seconds will be
            deleted.
        batch_size: batch delete 'batch_size' requests at a time to
            avoid using too much memory at once and to let each
            db query complete in a reasonable time. All stale
            requests older than the retention period will be deleted
            regardless of the batch size.
    """
    total_deleted = 0
    while True:
        # Fetch one batch of finished requests past the retention window.
        # Only the request_id column is selected; everything else needed
        # below (log_path) is derived from it.
        reqs = await get_request_tasks_async(
            req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
                                         finished_before=time.time() -
                                         retention_seconds,
                                         limit=batch_size,
                                         fields=['request_id']))
        if len(reqs) == 0:
            break
        # To avoid leakage of the log file, logs must be deleted before the
        # request task in the database.
        futs = []
        for req in reqs:
            # req.log_path is derived from request_id,
            # so it's ok to just grab the request_id in the above query.
            futs.append(
                asyncio.create_task(
                    anyio.Path(
                        req.log_path.absolute()).unlink(missing_ok=True)))
        await asyncio.gather(*futs)

        await _delete_requests([req.request_id for req in reqs])
        total_deleted += len(reqs)
        if len(reqs) < batch_size:
            # A short batch means no more stale requests remain.
            break

    logger.info(f'Cleaned up {total_deleted} finished requests '
                f'older than {retention_seconds} seconds')
|
|
1173
|
+
|
|
1174
|
+
|
|
1175
|
+
async def requests_gc_daemon():
    """Periodically garbage-collect finished requests until cancelled."""
    while True:
        logger.info('Running requests GC daemon...')
        # Re-read the config each cycle so retention changes take effect
        # without a restart.
        skypilot_config.reload_config()
        retention_hours = skypilot_config.get_nested(
            ('api_server', 'requests_retention_hours'),
            DEFAULT_REQUESTS_RETENTION_HOURS)
        retention_seconds = retention_hours * 3600
        try:
            if retention_seconds >= 0:
                await clean_finished_requests_with_retention(retention_seconds)
            # else: a negative retention disables GC entirely.
        except asyncio.CancelledError:
            logger.info('Requests GC daemon cancelled')
            break
        except Exception as e:  # pylint: disable=broad-except
            logger.error(f'Error running requests GC daemon: {e}'
                         f'traceback: {traceback.format_exc()}')
        # Sleep at least an hour between runs to avoid too-frequent cleanup.
        await asyncio.sleep(max(retention_seconds, 3600))
|
|
1197
|
+
|
|
1198
|
+
|
|
1199
|
+
def _cleanup():
    """Close the module-level DB handle at interpreter exit, if it was opened."""
    if _DB is None:
        return
    asyncio.run(_DB.close())


atexit.register(_cleanup)
|