skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/server/requests/requests.py
CHANGED
|
@@ -5,7 +5,6 @@ import contextlib
|
|
|
5
5
|
import dataclasses
|
|
6
6
|
import enum
|
|
7
7
|
import functools
|
|
8
|
-
import json
|
|
9
8
|
import os
|
|
10
9
|
import pathlib
|
|
11
10
|
import shutil
|
|
@@ -14,24 +13,28 @@ import sqlite3
|
|
|
14
13
|
import threading
|
|
15
14
|
import time
|
|
16
15
|
import traceback
|
|
17
|
-
from typing import (Any,
|
|
18
|
-
|
|
16
|
+
from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
|
|
17
|
+
Tuple)
|
|
18
|
+
import uuid
|
|
19
19
|
|
|
20
20
|
import anyio
|
|
21
21
|
import colorama
|
|
22
22
|
import filelock
|
|
23
|
+
import orjson
|
|
23
24
|
|
|
24
25
|
from sky import exceptions
|
|
25
26
|
from sky import global_user_state
|
|
26
27
|
from sky import sky_logging
|
|
27
28
|
from sky import skypilot_config
|
|
29
|
+
from sky.metrics import utils as metrics_lib
|
|
28
30
|
from sky.server import common as server_common
|
|
29
31
|
from sky.server import constants as server_constants
|
|
30
32
|
from sky.server import daemons
|
|
31
|
-
from sky.server import metrics as metrics_lib
|
|
32
33
|
from sky.server.requests import payloads
|
|
33
34
|
from sky.server.requests.serializers import decoders
|
|
34
35
|
from sky.server.requests.serializers import encoders
|
|
36
|
+
from sky.server.requests.serializers import return_value_serializers
|
|
37
|
+
from sky.utils import asyncio_utils
|
|
35
38
|
from sky.utils import common_utils
|
|
36
39
|
from sky.utils import ux_utils
|
|
37
40
|
from sky.utils.db import db_utils
|
|
@@ -211,8 +214,8 @@ class Request:
|
|
|
211
214
|
entrypoint=self.entrypoint.__name__,
|
|
212
215
|
request_body=self.request_body.model_dump_json(),
|
|
213
216
|
status=self.status.value,
|
|
214
|
-
return_value=
|
|
215
|
-
error=
|
|
217
|
+
return_value=orjson.dumps(None).decode('utf-8'),
|
|
218
|
+
error=orjson.dumps(None).decode('utf-8'),
|
|
216
219
|
pid=None,
|
|
217
220
|
created_at=self.created_at,
|
|
218
221
|
schedule_type=self.schedule_type.value,
|
|
@@ -229,14 +232,17 @@ class Request:
|
|
|
229
232
|
assert isinstance(self.request_body,
|
|
230
233
|
payloads.RequestBody), (self.name, self.request_body)
|
|
231
234
|
try:
|
|
235
|
+
# Use version-aware serializer to handle backward compatibility
|
|
236
|
+
# for old clients that don't recognize new fields.
|
|
237
|
+
serializer = return_value_serializers.get_serializer(self.name)
|
|
232
238
|
return payloads.RequestPayload(
|
|
233
239
|
request_id=self.request_id,
|
|
234
240
|
name=self.name,
|
|
235
241
|
entrypoint=encoders.pickle_and_encode(self.entrypoint),
|
|
236
242
|
request_body=encoders.pickle_and_encode(self.request_body),
|
|
237
243
|
status=self.status.value,
|
|
238
|
-
return_value=
|
|
239
|
-
error=
|
|
244
|
+
return_value=serializer(self.return_value),
|
|
245
|
+
error=orjson.dumps(self.error).decode('utf-8'),
|
|
240
246
|
pid=self.pid,
|
|
241
247
|
created_at=self.created_at,
|
|
242
248
|
schedule_type=self.schedule_type.value,
|
|
@@ -268,8 +274,8 @@ class Request:
|
|
|
268
274
|
entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
|
|
269
275
|
request_body=decoders.decode_and_unpickle(payload.request_body),
|
|
270
276
|
status=RequestStatus(payload.status),
|
|
271
|
-
return_value=
|
|
272
|
-
error=
|
|
277
|
+
return_value=orjson.loads(payload.return_value),
|
|
278
|
+
error=orjson.loads(payload.error),
|
|
273
279
|
pid=payload.pid,
|
|
274
280
|
created_at=payload.created_at,
|
|
275
281
|
schedule_type=ScheduleType(payload.schedule_type),
|
|
@@ -292,72 +298,104 @@ class Request:
|
|
|
292
298
|
raise
|
|
293
299
|
|
|
294
300
|
|
|
295
|
-
def
|
|
296
|
-
"""
|
|
301
|
+
def get_new_request_id() -> str:
|
|
302
|
+
"""Get a new request ID."""
|
|
303
|
+
return str(uuid.uuid4())
|
|
297
304
|
|
|
298
|
-
Args:
|
|
299
|
-
cluster_name: the name of the cluster.
|
|
300
|
-
exclude_request_names: exclude requests with these names. This is to
|
|
301
|
-
prevent killing the caller request.
|
|
302
|
-
"""
|
|
303
|
-
request_ids = [
|
|
304
|
-
request_task.request_id
|
|
305
|
-
for request_task in get_request_tasks(req_filter=RequestTaskFilter(
|
|
306
|
-
cluster_names=[cluster_name],
|
|
307
|
-
status=[RequestStatus.PENDING, RequestStatus.RUNNING],
|
|
308
|
-
exclude_request_names=[exclude_request_name]))
|
|
309
|
-
]
|
|
310
|
-
kill_requests(request_ids)
|
|
311
305
|
|
|
306
|
+
def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
|
|
307
|
+
"""Serialize the SkyPilot API request for display purposes.
|
|
312
308
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
309
|
+
This function should be called on the server side to serialize the
|
|
310
|
+
request body into human readable format, e.g., the entrypoint should
|
|
311
|
+
be a string, and the pid, error, or return value are not needed.
|
|
316
312
|
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
user are killed.
|
|
320
|
-
user_id: The user ID to kill requests for. If None, all users are
|
|
321
|
-
killed.
|
|
313
|
+
The returned value will then be displayed on the client side in request
|
|
314
|
+
table.
|
|
322
315
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
316
|
+
We do not use `encode` for display to avoid a large amount of data being
|
|
317
|
+
sent to the client side, especially for the request table could include
|
|
318
|
+
all the requests.
|
|
319
|
+
"""
|
|
320
|
+
encoded_requests = []
|
|
321
|
+
all_users = global_user_state.get_all_users()
|
|
322
|
+
all_users_map = {user.id: user.name for user in all_users}
|
|
323
|
+
for request in requests:
|
|
324
|
+
if request.request_body is not None:
|
|
325
|
+
assert isinstance(request.request_body,
|
|
326
|
+
payloads.RequestBody), (request.name,
|
|
327
|
+
request.request_body)
|
|
328
|
+
user_name = all_users_map.get(request.user_id)
|
|
329
|
+
payload = payloads.RequestPayload(
|
|
330
|
+
request_id=request.request_id,
|
|
331
|
+
name=request.name,
|
|
332
|
+
entrypoint=request.entrypoint.__name__
|
|
333
|
+
if request.entrypoint is not None else '',
|
|
334
|
+
request_body=request.request_body.model_dump_json()
|
|
335
|
+
if request.request_body is not None else
|
|
336
|
+
orjson.dumps(None).decode('utf-8'),
|
|
337
|
+
status=request.status.value,
|
|
338
|
+
return_value=orjson.dumps(None).decode('utf-8'),
|
|
339
|
+
error=orjson.dumps(None).decode('utf-8'),
|
|
340
|
+
pid=None,
|
|
341
|
+
created_at=request.created_at,
|
|
342
|
+
schedule_type=request.schedule_type.value,
|
|
343
|
+
user_id=request.user_id,
|
|
344
|
+
user_name=user_name,
|
|
345
|
+
cluster_name=request.cluster_name,
|
|
346
|
+
status_msg=request.status_msg,
|
|
347
|
+
should_retry=request.should_retry,
|
|
348
|
+
finished_at=request.finished_at,
|
|
349
|
+
)
|
|
350
|
+
encoded_requests.append(payload)
|
|
351
|
+
return encoded_requests
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _update_request_row_fields(
|
|
355
|
+
row: Tuple[Any, ...],
|
|
356
|
+
fields: Optional[List[str]] = None) -> Tuple[Any, ...]:
|
|
357
|
+
"""Update the request row fields."""
|
|
358
|
+
if not fields:
|
|
359
|
+
return row
|
|
360
|
+
|
|
361
|
+
# Convert tuple to dictionary for easier manipulation
|
|
362
|
+
content = dict(zip(fields, row))
|
|
363
|
+
|
|
364
|
+
# Required fields in RequestPayload
|
|
365
|
+
if 'request_id' not in fields:
|
|
366
|
+
content['request_id'] = ''
|
|
367
|
+
if 'name' not in fields:
|
|
368
|
+
content['name'] = ''
|
|
369
|
+
if 'entrypoint' not in fields:
|
|
370
|
+
content['entrypoint'] = server_constants.EMPTY_PICKLED_VALUE
|
|
371
|
+
if 'request_body' not in fields:
|
|
372
|
+
content['request_body'] = server_constants.EMPTY_PICKLED_VALUE
|
|
373
|
+
if 'status' not in fields:
|
|
374
|
+
content['status'] = RequestStatus.PENDING.value
|
|
375
|
+
if 'created_at' not in fields:
|
|
376
|
+
content['created_at'] = 0
|
|
377
|
+
if 'user_id' not in fields:
|
|
378
|
+
content['user_id'] = ''
|
|
379
|
+
if 'return_value' not in fields:
|
|
380
|
+
content['return_value'] = orjson.dumps(None).decode('utf-8')
|
|
381
|
+
if 'error' not in fields:
|
|
382
|
+
content['error'] = orjson.dumps(None).decode('utf-8')
|
|
383
|
+
if 'schedule_type' not in fields:
|
|
384
|
+
content['schedule_type'] = ScheduleType.SHORT.value
|
|
385
|
+
# Optional fields in RequestPayload
|
|
386
|
+
if 'pid' not in fields:
|
|
387
|
+
content['pid'] = None
|
|
388
|
+
if 'cluster_name' not in fields:
|
|
389
|
+
content['cluster_name'] = None
|
|
390
|
+
if 'status_msg' not in fields:
|
|
391
|
+
content['status_msg'] = None
|
|
392
|
+
if 'should_retry' not in fields:
|
|
393
|
+
content['should_retry'] = False
|
|
394
|
+
if 'finished_at' not in fields:
|
|
395
|
+
content['finished_at'] = None
|
|
396
|
+
|
|
397
|
+
# Convert back to tuple in the same order as REQUEST_COLUMNS
|
|
398
|
+
return tuple(content[col] for col in REQUEST_COLUMNS)
|
|
361
399
|
|
|
362
400
|
|
|
363
401
|
def create_table(cursor, conn):
|
|
@@ -402,6 +440,21 @@ def create_table(cursor, conn):
|
|
|
402
440
|
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
|
|
403
441
|
'REAL')
|
|
404
442
|
|
|
443
|
+
# Add an index on (status, name) to speed up queries
|
|
444
|
+
# that filter on these columns.
|
|
445
|
+
cursor.execute(f"""\
|
|
446
|
+
CREATE INDEX IF NOT EXISTS status_name_idx ON {REQUEST_TABLE} (status, name) WHERE status IN ('PENDING', 'RUNNING');
|
|
447
|
+
""")
|
|
448
|
+
# Add an index on cluster_name to speed up queries
|
|
449
|
+
# that filter on this column.
|
|
450
|
+
cursor.execute(f"""\
|
|
451
|
+
CREATE INDEX IF NOT EXISTS cluster_name_idx ON {REQUEST_TABLE} ({COL_CLUSTER_NAME}) WHERE status IN ('PENDING', 'RUNNING');
|
|
452
|
+
""")
|
|
453
|
+
# Add an index on created_at to speed up queries that sort on this column.
|
|
454
|
+
cursor.execute(f"""\
|
|
455
|
+
CREATE INDEX IF NOT EXISTS created_at_idx ON {REQUEST_TABLE} (created_at);
|
|
456
|
+
""")
|
|
457
|
+
|
|
405
458
|
|
|
406
459
|
_DB = None
|
|
407
460
|
_init_db_lock = threading.Lock()
|
|
@@ -449,11 +502,37 @@ def init_db_async(func):
|
|
|
449
502
|
|
|
450
503
|
def reset_db_and_logs():
    """Wipe the local API server state and re-initialize the requests DB.

    Clears the local API server database, removes the request log directory
    and the API server client directory (both with errors ignored),
    re-initializes the database under the init lock, and finally verifies
    that the SQLite library in use supports the RETURNING clause.

    Raises:
        RuntimeError: If the SQLite version cannot be fetched or parsed, or
            if it is older than 3.35.0.
    """
    logger.debug('clearing local API server database')
    server_common.clear_local_api_server_database()
    logger.debug(
        f'clearing local API server logs directory at {REQUEST_LOG_PATH_PREFIX}'
    )
    shutil.rmtree(pathlib.Path(REQUEST_LOG_PATH_PREFIX).expanduser(),
                  ignore_errors=True)
    logger.debug('clearing local API server client directory at '
                 f'{server_common.API_SERVER_CLIENT_DIR.expanduser()}')
    shutil.rmtree(server_common.API_SERVER_CLIENT_DIR.expanduser(),
                  ignore_errors=True)
    with _init_db_lock:
        _init_db_within_lock()
    assert _DB is not None
    with _DB.conn:
        cursor = _DB.conn.cursor()
        cursor.execute('SELECT sqlite_version()')
        row = cursor.fetchone()
        if row is None:
            raise RuntimeError('Failed to get SQLite version')
        version_str = row[0]
        version_parts = version_str.split('.')
        # Validate explicitly instead of using `assert`, which is stripped
        # under `python -O` and would not cover non-numeric components.
        try:
            major, minor = int(version_parts[0]), int(version_parts[1])
        except (IndexError, ValueError) as err:
            raise RuntimeError(
                f'Invalid version string: {version_str}') from err
        # SQLite 3.35.0+ supports RETURNING statements.
        # 3.35.0 was released in March 2021.
        if not ((major > 3) or (major == 3 and minor >= 35)):
            raise RuntimeError(
                f'SQLite version {version_str} is not supported. '
                'Please upgrade to SQLite 3.35.0 or later.')
|
|
457
536
|
|
|
458
537
|
|
|
459
538
|
def request_lock_path(request_id: str) -> str:
|
|
@@ -462,93 +541,285 @@ def request_lock_path(request_id: str) -> str:
|
|
|
462
541
|
return os.path.join(lock_path, f'.{request_id}.lock')
|
|
463
542
|
|
|
464
543
|
|
|
544
|
+
def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
    """Kill every pending or running request that targets a cluster.

    Args:
        cluster_name: the name of the cluster.
        exclude_request_name: requests with this name are left untouched.
            This is to prevent killing the caller request.
    """
    active_on_cluster = RequestTaskFilter(
        status=[RequestStatus.PENDING, RequestStatus.RUNNING],
        exclude_request_names=[exclude_request_name],
        cluster_names=[cluster_name],
        # Only the ID is needed to kill a request.
        fields=['request_id'])
    matching_tasks = get_request_tasks(req_filter=active_on_cluster)
    _kill_requests([task.request_id for task in matching_tasks])
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def kill_requests(request_ids: Optional[List[str]] = None,
                  user_id: Optional[str] = None) -> List[str]:
    """Resolve request ID prefixes to full IDs and kill those requests.

    Args:
        request_ids: request ID prefixes. A prefix that matches no request
            is skipped. If None, None is forwarded to _kill_requests.
        user_id: forwarded unchanged to _kill_requests.

    Returns:
        The list returned by _kill_requests for the resolved IDs.

    Raises:
        ValueError: if a prefix matches more than one request.
    """
    if request_ids is None:
        return _kill_requests(request_ids=None, user_id=user_id)

    resolved_ids: List[str] = []
    for prefix in request_ids:
        matches = get_requests_with_prefix(prefix, fields=['request_id'])
        if not matches:
            # Nothing stored under this prefix.
            continue
        if len(matches) > 1:
            raise ValueError(f'Multiple requests found for '
                             f'request ID prefix: {prefix}')
        resolved_ids.append(matches[0].request_id)
    return _kill_requests(request_ids=resolved_ids, user_id=user_id)
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
# Needed for backward compatibility: kill_requests_with_prefix is kept as an
# alias of kill_requests. Remove by v0.10.7 or v0.12.0
# and rename kill_requests to kill_requests_with_prefix.
kill_requests_with_prefix = kill_requests
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
def _should_kill_request(request_id: str,
                         request_record: Optional[Request]) -> bool:
    """Decide whether a request is eligible to be killed.

    Returns:
        False if the record is missing, if it belongs to one of the internal
        daemons, or if its status is already past RUNNING; True otherwise.
    """
    if request_record is None:
        logger.debug(f'No request ID {request_id}')
        return False
    # Skip internal requests: their IDs are the ids of the entries in
    # daemons.INTERNAL_REQUEST_DAEMONS.
    internal_ids = {daemon.id for daemon in daemons.INTERNAL_REQUEST_DAEMONS}
    if request_record.request_id in internal_ids:
        return False
    if request_record.status > RequestStatus.RUNNING:
        logger.debug(f'Request {request_id} already finished')
        return False
    return True
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
def _kill_requests(request_ids: Optional[List[str]] = None,
                   user_id: Optional[str] = None) -> List[str]:
    """Kill SkyPilot API requests and set their status to cancelled.

    Args:
        request_ids: The request IDs to kill. If None, all pending and
            running requests matching `user_id` (except 'sky.api_cancel'
            requests) are killed.
        user_id: Only used when `request_ids` is None: the user whose
            requests are killed. If None, requests of all users are killed.

    Returns:
        A list of request IDs that were cancelled.
    """
    if request_ids is None:
        request_ids = [
            request_task.request_id
            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
                # Avoid cancelling the cancel request itself.
                exclude_request_names=['sky.api_cancel'],
                user_id=user_id,
                fields=['request_id']))
        ]
    cancelled_request_ids = []
    for request_id in request_ids:
        with update_request(request_id) as request_record:
            if not _should_kill_request(request_id, request_record):
                continue
            if request_record.pid is not None:
                logger.debug(f'Killing request process {request_record.pid}')
                # Use SIGTERM instead of SIGKILL:
                # - The executor can handle SIGTERM gracefully
                # - After SIGTERM, the executor can reuse the request process
                #   for other requests, avoiding the overhead of forking a new
                #   process for each request.
                try:
                    os.kill(request_record.pid, signal.SIGTERM)
                except ProcessLookupError:
                    # The recorded process is already gone. Still mark the
                    # request as cancelled instead of aborting the whole
                    # batch and leaving this record pending/running.
                    logger.debug(
                        f'Request process {request_record.pid} for request '
                        f'{request_id} no longer exists')
            request_record.status = RequestStatus.CANCELLED
            request_record.finished_at = time.time()
            cancelled_request_ids.append(request_id)
    return cancelled_request_ids
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
@init_db_async
@asyncio_utils.shield
async def kill_request_async(request_id: str) -> bool:
    """Kill a SkyPilot API request and set its status to cancelled.

    The per-request file lock is held while the record is read, the process
    is signalled and the updated record is written back.

    Returns:
        True if the request was killed, False otherwise.
    """
    async with filelock.AsyncFileLock(request_lock_path(request_id)):
        request = await _get_request_no_lock_async(request_id)
        if not _should_kill_request(request_id, request):
            return False
        assert request is not None
        if request.pid is not None:
            logger.debug(f'Killing request process {request.pid}')
            # Use SIGTERM instead of SIGKILL:
            # - The executor can handle SIGTERM gracefully
            # - After SIGTERM, the executor can reuse the request process
            #   for other requests, avoiding the overhead of forking a new
            #   process for each request.
            try:
                os.kill(request.pid, signal.SIGTERM)
            except ProcessLookupError:
                # The recorded process is already gone. Still mark the
                # request as cancelled so it does not stay pending/running.
                logger.debug(f'Request process {request.pid} for request '
                             f'{request_id} no longer exists')
        request.status = RequestStatus.CANCELLED
        request.finished_at = time.time()
        await _add_or_update_request_no_lock_async(request)
        return True
|
|
669
|
+
|
|
670
|
+
|
|
465
671
|
@contextlib.contextmanager
@init_db
@metrics_lib.time_me
def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
    """Yield a request record and persist it when the `with` body finishes.

    Yields:
        The stored request, or None if no request was found. In-place
        changes made by the caller are written back on normal exit; nothing
        is written when None was yielded.
    """
    # Hold the per-request file lock for the whole read-modify-write cycle
    # to avoid race conditions between multiple request operations,
    # e.g. execute and cancel.
    request_lock = filelock.FileLock(request_lock_path(request_id))
    with request_lock:
        record = _get_request_no_lock(request_id)
        yield record
        if record is None:
            return
        _add_or_update_request_no_lock(record)
|
|
474
683
|
|
|
475
684
|
|
|
476
|
-
@
|
|
685
|
+
# NOTE(review): uses time_me_async like the other coroutines in this module;
# the sync time_me wrapper presumably times only coroutine creation - confirm.
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def update_status_async(request_id: str, status: RequestStatus) -> None:
    """Update the status of a request.

    The record is read and written back under the per-request file lock.
    Nothing happens if no request is found for `request_id`.

    Args:
        request_id: ID of the request to update.
        status: the new status to store.
    """
    async with filelock.AsyncFileLock(request_lock_path(request_id)):
        request = await _get_request_no_lock_async(request_id)
        if request is not None:
            request.status = status
            await _add_or_update_request_no_lock_async(request)
|
|
496
695
|
|
|
497
696
|
|
|
498
|
-
|
|
499
|
-
|
|
697
|
+
# NOTE(review): uses time_me_async like the other coroutines in this module;
# the sync time_me wrapper presumably times only coroutine creation - confirm.
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def update_status_msg_async(request_id: str, status_msg: str) -> None:
    """Update the status message of a request.

    The record is read and written back under the per-request file lock.
    Nothing happens if no request is found for `request_id`.

    Args:
        request_id: ID of the request to update.
        status_msg: the new status message to store.
    """
    async with filelock.AsyncFileLock(request_lock_path(request_id)):
        request = await _get_request_no_lock_async(request_id)
        if request is not None:
            request.status_msg = status_msg
            await _add_or_update_request_no_lock_async(request)
|
|
500
707
|
|
|
501
708
|
|
|
502
|
-
def _get_request_no_lock(
|
|
709
|
+
def _get_request_no_lock(
        request_id: str,
        fields: Optional[List[str]] = None) -> Optional[Request]:
    """Fetch one request without taking the per-request file lock.

    Args:
        request_id: matched as a prefix of the stored request_id
            (`LIKE request_id + '%'`); the first row returned is used.
        fields: columns to select. If empty or None, all REQUEST_COLUMNS are
            selected.

    Returns:
        The request built from the row, or None if no row matches.
    """
    assert _DB is not None
    selected_columns = fields if fields else REQUEST_COLUMNS
    columns_sql = ', '.join(selected_columns)
    query = (f'SELECT {columns_sql} FROM {REQUEST_TABLE} '
             'WHERE request_id LIKE ?')
    with _DB.conn:
        cursor = _DB.conn.cursor()
        cursor.execute(query, (request_id + '%',))
        row = cursor.fetchone()
        if row is None:
            return None
        if fields:
            # A partial row is passed through _update_request_row_fields
            # before being handed to Request.from_row.
            row = _update_request_row_fields(row, fields)
        return Request.from_row(row)
|
|
512
727
|
|
|
513
728
|
|
|
514
|
-
async def _get_request_no_lock_async(
|
|
729
|
+
async def _get_request_no_lock_async(
        request_id: str,
        fields: Optional[List[str]] = None) -> Optional[Request]:
    """Async version of _get_request_no_lock.

    `request_id` is matched as a prefix of the stored request_id and the
    first returned row is used. When `fields` is given only those columns
    are selected.

    Returns:
        The request built from the row, or None if no row matches.
    """
    assert _DB is not None
    selected_columns = fields if fields else REQUEST_COLUMNS
    columns_sql = ', '.join(selected_columns)
    query = (f'SELECT {columns_sql} FROM {REQUEST_TABLE} '
             'WHERE request_id LIKE ?')
    async with _DB.execute_fetchall_async(query,
                                          (request_id + '%',)) as rows:
        if not rows:
            return None
        row = rows[0]
        if row is None:
            return None
        if fields:
            # A partial row is passed through _update_request_row_fields
            # before being handed to Request.from_row.
            row = _update_request_row_fields(row, fields)
        return Request.from_row(row)
|
|
523
746
|
|
|
524
747
|
|
|
525
|
-
@
|
|
748
|
+
# NOTE(review): uses time_me_async like the other coroutines in this module;
# the sync time_me wrapper presumably times only coroutine creation - confirm.
@init_db_async
@metrics_lib.time_me_async
async def get_latest_request_id_async() -> Optional[str]:
    """Get the latest request ID.

    Returns:
        The request_id of the row with the greatest created_at, or None if
        the query returns no rows.
    """
    assert _DB is not None
    async with _DB.execute_fetchall_async(
        (f'SELECT request_id FROM {REQUEST_TABLE} '
         'ORDER BY created_at DESC LIMIT 1')) as rows:
        return rows[0][0] if rows else None
|
|
536
757
|
|
|
537
758
|
|
|
538
759
|
@init_db
@metrics_lib.time_me
def get_request(request_id: str,
                fields: Optional[List[str]] = None) -> Optional[Request]:
    """Get a SkyPilot API request while holding its file lock.

    Args:
        request_id: ID passed to _get_request_no_lock.
        fields: optional subset of columns to select.

    Returns:
        The request, or None if it was not found.
    """
    lock_file = request_lock_path(request_id)
    with filelock.FileLock(lock_file):
        found = _get_request_no_lock(request_id, fields)
    return found
|
|
544
766
|
|
|
545
767
|
|
|
546
768
|
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def get_request_async(
        request_id: str,
        fields: Optional[List[str]] = None) -> Optional[Request]:
    """Async version of get_request.

    Returns:
        The request, or None if it was not found.
    """
    # TODO(aylei): figure out how to remove FileLock here to avoid the overhead
    lock_file = request_lock_path(request_id)
    async with filelock.AsyncFileLock(lock_file):
        found = await _get_request_no_lock_async(request_id, fields)
    return found
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
@init_db
@metrics_lib.time_me
def get_requests_with_prefix(
        request_id_prefix: str,
        fields: Optional[List[str]] = None) -> Optional[List[Request]]:
    """Get all requests whose request ID starts with a given prefix.

    Args:
        request_id_prefix: prefix matched with `LIKE prefix + '%'`.
        fields: columns to select. If empty or None, all REQUEST_COLUMNS are
            selected.

    Returns:
        The matching requests, or None if there are none.
    """
    assert _DB is not None
    selected_columns = fields if fields else REQUEST_COLUMNS
    columns_sql = ', '.join(selected_columns)
    query = (f'SELECT {columns_sql} FROM {REQUEST_TABLE} '
             'WHERE request_id LIKE ?')
    with _DB.conn:
        cursor = _DB.conn.cursor()
        cursor.execute(query, (request_id_prefix + '%',))
        rows = cursor.fetchall()
        if not rows:
            return None
        if fields:
            # Partial rows are passed through _update_request_row_fields
            # before being handed to Request.from_row.
            rows = [_update_request_row_fields(row, fields) for row in rows]
        return [Request.from_row(row) for row in rows]
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def get_requests_async_with_prefix(
        request_id_prefix: str,
        fields: Optional[List[str]] = None) -> Optional[List[Request]]:
    """Async version of get_requests_with_prefix.

    Returns:
        The requests whose ID starts with `request_id_prefix`, or None if
        there are none.
    """
    assert _DB is not None
    selected_columns = fields if fields else REQUEST_COLUMNS
    columns_sql = ', '.join(selected_columns)
    query = (f'SELECT {columns_sql} FROM {REQUEST_TABLE} '
             'WHERE request_id LIKE ?')
    async with _DB.execute_fetchall_async(
            query, (request_id_prefix + '%',)) as rows:
        if not rows:
            return None
        if fields:
            # Partial rows are passed through _update_request_row_fields
            # before being handed to Request.from_row.
            rows = [_update_request_row_fields(row, fields) for row in rows]
        return [Request.from_row(row) for row in rows]
|
|
552
823
|
|
|
553
824
|
|
|
554
825
|
class StatusWithMsg(NamedTuple):
|
|
@@ -585,26 +856,29 @@ async def get_request_status_async(
|
|
|
585
856
|
return StatusWithMsg(status, status_msg)
|
|
586
857
|
|
|
587
858
|
|
|
588
|
-
@init_db
|
|
589
|
-
@metrics_lib.time_me
|
|
590
|
-
def create_if_not_exists(request: Request) -> bool:
|
|
591
|
-
"""Create a SkyPilot API request if it does not exist."""
|
|
592
|
-
with filelock.FileLock(request_lock_path(request.request_id)):
|
|
593
|
-
if _get_request_no_lock(request.request_id) is not None:
|
|
594
|
-
return False
|
|
595
|
-
_add_or_update_request_no_lock(request)
|
|
596
|
-
return True
|
|
597
|
-
|
|
598
|
-
|
|
599
859
|
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def create_if_not_exists_async(request: Request) -> bool:
    """Insert a request unless one with the same request_id already exists.

    Returns:
        True if a new request is created, False if the request already exists.
    """
    assert _DB is not None
    column_list = ', '.join(REQUEST_COLUMNS)
    placeholders = ', '.join('?' for _ in REQUEST_COLUMNS)
    insert_sql = (
        f'INSERT INTO {REQUEST_TABLE} '
        f'({column_list}) VALUES '
        f'({placeholders}) ON CONFLICT(request_id) DO NOTHING RETURNING ROWID')
    row_values = request.to_row()
    # Execute the SQL statement without getting the request lock.
    # The request lock is used to prevent racing with cancellation codepath,
    # but a request cannot be cancelled before it is created.
    returned = await _DB.execute_get_returning_value_async(
        insert_sql, row_values)
    # A falsy result means the insert was skipped by ON CONFLICT.
    return bool(returned)
|
|
608
882
|
|
|
609
883
|
|
|
610
884
|
@dataclasses.dataclass
|
|
@@ -622,6 +896,7 @@ class RequestTaskFilter:
|
|
|
622
896
|
Mutually exclusive with exclude_request_names.
|
|
623
897
|
finished_before: if provided, only include requests finished before this
|
|
624
898
|
timestamp.
|
|
899
|
+
limit: the number of requests to show. If None, show all requests.
|
|
625
900
|
|
|
626
901
|
Raises:
|
|
627
902
|
ValueError: If both exclude_request_names and include_request_names are
|
|
@@ -633,6 +908,9 @@ class RequestTaskFilter:
|
|
|
633
908
|
exclude_request_names: Optional[List[str]] = None
|
|
634
909
|
include_request_names: Optional[List[str]] = None
|
|
635
910
|
finished_before: Optional[float] = None
|
|
911
|
+
limit: Optional[int] = None
|
|
912
|
+
fields: Optional[List[str]] = None
|
|
913
|
+
sort: bool = False
|
|
636
914
|
|
|
637
915
|
def __post_init__(self):
|
|
638
916
|
if (self.exclude_request_names is not None and
|
|
@@ -653,6 +931,10 @@ class RequestTaskFilter:
|
|
|
653
931
|
status_list_str = ','.join(
|
|
654
932
|
repr(status.value) for status in self.status)
|
|
655
933
|
filters.append(f'status IN ({status_list_str})')
|
|
934
|
+
if self.include_request_names is not None:
|
|
935
|
+
request_names_str = ','.join(
|
|
936
|
+
repr(name) for name in self.include_request_names)
|
|
937
|
+
filters.append(f'name IN ({request_names_str})')
|
|
656
938
|
if self.exclude_request_names is not None:
|
|
657
939
|
exclude_request_names_str = ','.join(
|
|
658
940
|
repr(name) for name in self.exclude_request_names)
|
|
@@ -664,10 +946,6 @@ class RequestTaskFilter:
|
|
|
664
946
|
if self.user_id is not None:
|
|
665
947
|
filters.append(f'{COL_USER_ID} = ?')
|
|
666
948
|
filter_params.append(self.user_id)
|
|
667
|
-
if self.include_request_names is not None:
|
|
668
|
-
request_names_str = ','.join(
|
|
669
|
-
repr(name) for name in self.include_request_names)
|
|
670
|
-
filters.append(f'name IN ({request_names_str})')
|
|
671
949
|
if self.finished_before is not None:
|
|
672
950
|
filters.append('finished_at < ?')
|
|
673
951
|
filter_params.append(self.finished_before)
|
|
@@ -675,8 +953,16 @@ class RequestTaskFilter:
|
|
|
675
953
|
if filter_str:
|
|
676
954
|
filter_str = f' WHERE {filter_str}'
|
|
677
955
|
columns_str = ', '.join(REQUEST_COLUMNS)
|
|
678
|
-
|
|
679
|
-
|
|
956
|
+
if self.fields:
|
|
957
|
+
columns_str = ', '.join(self.fields)
|
|
958
|
+
sort_str = ''
|
|
959
|
+
if self.sort:
|
|
960
|
+
sort_str = ' ORDER BY created_at DESC'
|
|
961
|
+
query_str = (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str}'
|
|
962
|
+
f'{sort_str}')
|
|
963
|
+
if self.limit is not None:
|
|
964
|
+
query_str += f' LIMIT {self.limit}'
|
|
965
|
+
return query_str, filter_params
|
|
680
966
|
|
|
681
967
|
|
|
682
968
|
@init_db
|
|
@@ -695,6 +981,10 @@ def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
|
|
|
695
981
|
rows = cursor.fetchall()
|
|
696
982
|
if rows is None:
|
|
697
983
|
return []
|
|
984
|
+
if req_filter.fields:
|
|
985
|
+
rows = [
|
|
986
|
+
_update_request_row_fields(row, req_filter.fields) for row in rows
|
|
987
|
+
]
|
|
698
988
|
return [Request.from_row(row) for row in rows]
|
|
699
989
|
|
|
700
990
|
|
|
@@ -707,6 +997,10 @@ async def get_request_tasks_async(
|
|
|
707
997
|
async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
|
|
708
998
|
if not rows:
|
|
709
999
|
return []
|
|
1000
|
+
if req_filter.fields:
|
|
1001
|
+
rows = [
|
|
1002
|
+
_update_request_row_fields(row, req_filter.fields) for row in rows
|
|
1003
|
+
]
|
|
710
1004
|
return [Request.from_row(row) for row in rows]
|
|
711
1005
|
|
|
712
1006
|
|
|
@@ -752,11 +1046,15 @@ async def _add_or_update_request_no_lock_async(request: Request):
|
|
|
752
1046
|
request.to_row())
|
|
753
1047
|
|
|
754
1048
|
|
|
755
|
-
def
|
|
756
|
-
"""Set a request to failed and populate the error message."""
|
|
1049
|
+
def set_exception_stacktrace(e: BaseException) -> None:
    """Attach the formatted current traceback to `e` as `e.stacktrace`.

    The text comes from traceback.format_exc(), produced inside
    ux_utils.enable_traceback().
    """
    with ux_utils.enable_traceback():
        formatted_trace = traceback.format_exc()
        setattr(e, 'stacktrace', formatted_trace)
|
|
1053
|
+
|
|
1054
|
+
|
|
1055
|
+
def set_request_failed(request_id: str, e: BaseException) -> None:
|
|
1056
|
+
"""Set a request to failed and populate the error message."""
|
|
1057
|
+
set_exception_stacktrace(e)
|
|
760
1058
|
with update_request(request_id) as request_task:
|
|
761
1059
|
assert request_task is not None, request_id
|
|
762
1060
|
request_task.status = RequestStatus.FAILED
|
|
@@ -764,6 +1062,21 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
|
|
|
764
1062
|
request_task.set_error(e)
|
|
765
1063
|
|
|
766
1064
|
|
|
1065
|
+
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_failed_async(request_id: str, e: BaseException) -> None:
    """Mark a request as FAILED and record the exception on it.

    The stacktrace is attached to `e` first; the record is then updated and
    written back under the per-request file lock. The request must exist.
    """
    set_exception_stacktrace(e)
    request_lock = filelock.AsyncFileLock(request_lock_path(request_id))
    async with request_lock:
        failed_request = await _get_request_no_lock_async(request_id)
        assert failed_request is not None, request_id
        failed_request.status = RequestStatus.FAILED
        failed_request.finished_at = time.time()
        failed_request.set_error(e)
        await _add_or_update_request_no_lock_async(failed_request)
|
|
1078
|
+
|
|
1079
|
+
|
|
767
1080
|
def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
|
|
768
1081
|
"""Set a request to succeeded and populate the result."""
|
|
769
1082
|
with update_request(request_id) as request_task:
|
|
@@ -774,25 +1087,50 @@ def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
|
|
|
774
1087
|
request_task.set_return_value(result)
|
|
775
1088
|
|
|
776
1089
|
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
1090
|
+
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_succeeded_async(request_id: str,
                                      result: Optional[Any]) -> None:
    """Mark a request as SUCCEEDED and store its result, if any.

    The record is updated and written back under the per-request file lock.
    The request must exist. A `result` of None leaves the return value
    untouched.
    """
    request_lock = filelock.AsyncFileLock(request_lock_path(request_id))
    async with request_lock:
        finished_request = await _get_request_no_lock_async(request_id)
        assert finished_request is not None, request_id
        finished_request.status = RequestStatus.SUCCEEDED
        finished_request.finished_at = time.time()
        if result is not None:
            finished_request.set_return_value(result)
        await _add_or_update_request_no_lock_async(finished_request)
|
|
1104
|
+
|
|
1105
|
+
|
|
1106
|
+
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_cancelled_async(request_id: str) -> None:
    """Mark a pending or running request as CANCELLED.

    The request must exist. A request whose status is already past RUNNING
    is left unchanged.
    """
    request_lock = filelock.AsyncFileLock(request_lock_path(request_id))
    async with request_lock:
        target = await _get_request_no_lock_async(request_id)
        assert target is not None, request_id
        already_done = target.status > RequestStatus.RUNNING
        if already_done:
            # Already finished or cancelled.
            return
        target.finished_at = time.time()
        target.status = RequestStatus.CANCELLED
        await _add_or_update_request_no_lock_async(target)
|
|
783
1120
|
|
|
784
1121
|
|
|
785
1122
|
# NOTE(review): uses time_me_async like the other coroutines in this module;
# the sync time_me wrapper presumably times only coroutine creation - confirm.
@init_db
@metrics_lib.time_me_async
async def _delete_requests(request_ids: List[str]):
    """Clean up requests by their IDs.

    Args:
        request_ids: IDs of the rows to delete from the request table. The
            IDs are embedded into the statement via repr().
    """
    id_list_str = ','.join(repr(request_id) for request_id in request_ids)
    assert _DB is not None
    await _DB.execute_and_commit_async(
        f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
|
|
793
1130
|
|
|
794
1131
|
|
|
795
|
-
async def clean_finished_requests_with_retention(retention_seconds: int
|
|
1132
|
+
async def clean_finished_requests_with_retention(retention_seconds: int,
|
|
1133
|
+
batch_size: int = 1000):
|
|
796
1134
|
"""Clean up finished requests older than the retention period.
|
|
797
1135
|
|
|
798
1136
|
This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
|
|
@@ -801,24 +1139,40 @@ async def clean_finished_requests_with_retention(retention_seconds: int):
|
|
|
801
1139
|
Args:
|
|
802
1140
|
retention_seconds: Requests older than this many seconds will be
|
|
803
1141
|
deleted.
|
|
1142
|
+
batch_size: batch delete 'batch_size' requests at a time to
|
|
1143
|
+
avoid using too much memory at once and to let each
|
|
1144
|
+
db query complete in a reasonable time. All stale
|
|
1145
|
+
requests older than the retention period will be deleted
|
|
1146
|
+
regardless of the batch size.
|
|
804
1147
|
"""
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
1148
|
+
total_deleted = 0
|
|
1149
|
+
while True:
|
|
1150
|
+
reqs = await get_request_tasks_async(
|
|
1151
|
+
req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
|
|
1152
|
+
finished_before=time.time() -
|
|
1153
|
+
retention_seconds,
|
|
1154
|
+
limit=batch_size,
|
|
1155
|
+
fields=['request_id']))
|
|
1156
|
+
if len(reqs) == 0:
|
|
1157
|
+
break
|
|
1158
|
+
futs = []
|
|
1159
|
+
for req in reqs:
|
|
1160
|
+
# req.log_path is derived from request_id,
|
|
1161
|
+
# so it's ok to just grab the request_id in the above query.
|
|
1162
|
+
futs.append(
|
|
1163
|
+
asyncio.create_task(
|
|
1164
|
+
anyio.Path(
|
|
1165
|
+
req.log_path.absolute()).unlink(missing_ok=True)))
|
|
1166
|
+
await asyncio.gather(*futs)
|
|
1167
|
+
|
|
1168
|
+
await _delete_requests([req.request_id for req in reqs])
|
|
1169
|
+
total_deleted += len(reqs)
|
|
1170
|
+
if len(reqs) < batch_size:
|
|
1171
|
+
break
|
|
818
1172
|
|
|
819
1173
|
# To avoid leakage of the log file, logs must be deleted before the
|
|
820
1174
|
# request task in the database.
|
|
821
|
-
logger.info(f'Cleaned up {
|
|
1175
|
+
logger.info(f'Cleaned up {total_deleted} finished requests '
|
|
822
1176
|
f'older than {retention_seconds} seconds')
|
|
823
1177
|
|
|
824
1178
|
|