skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/stream_utils.py
CHANGED
|
@@ -8,10 +8,13 @@ from typing import AsyncGenerator, Deque, List, Optional
|
|
|
8
8
|
import aiofiles
|
|
9
9
|
import fastapi
|
|
10
10
|
|
|
11
|
+
from sky import global_user_state
|
|
11
12
|
from sky import sky_logging
|
|
12
13
|
from sky.server.requests import requests as requests_lib
|
|
14
|
+
from sky.utils import common_utils
|
|
13
15
|
from sky.utils import message_utils
|
|
14
16
|
from sky.utils import rich_utils
|
|
17
|
+
from sky.utils import status_lib
|
|
15
18
|
|
|
16
19
|
logger = sky_logging.init_logger(__name__)
|
|
17
20
|
|
|
@@ -22,6 +25,14 @@ logger = sky_logging.init_logger(__name__)
|
|
|
22
25
|
_BUFFER_SIZE = 8 * 1024 # 8KB
|
|
23
26
|
_BUFFER_TIMEOUT = 0.02 # 20ms
|
|
24
27
|
_HEARTBEAT_INTERVAL = 30
|
|
28
|
+
_READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
|
|
29
|
+
|
|
30
|
+
# If a SHORT request has been stuck in pending for
|
|
31
|
+
# _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
|
|
32
|
+
_SHORT_REQUEST_SPINNER_TIMEOUT = 2
|
|
33
|
+
|
|
34
|
+
LONG_REQUEST_POLL_INTERVAL = 1
|
|
35
|
+
DEFAULT_POLL_INTERVAL = 0.1
|
|
25
36
|
|
|
26
37
|
|
|
27
38
|
async def _yield_log_file_with_payloads_skipped(
|
|
@@ -37,34 +48,51 @@ async def _yield_log_file_with_payloads_skipped(
|
|
|
37
48
|
yield line_str
|
|
38
49
|
|
|
39
50
|
|
|
40
|
-
async def log_streamer(
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
51
|
+
async def log_streamer(
|
|
52
|
+
request_id: Optional[str],
|
|
53
|
+
log_path: Optional[pathlib.Path] = None,
|
|
54
|
+
plain_logs: bool = False,
|
|
55
|
+
tail: Optional[int] = None,
|
|
56
|
+
follow: bool = True,
|
|
57
|
+
cluster_name: Optional[str] = None,
|
|
58
|
+
polling_interval: float = DEFAULT_POLL_INTERVAL
|
|
59
|
+
) -> AsyncGenerator[str, None]:
|
|
45
60
|
"""Streams the logs of a request.
|
|
46
61
|
|
|
47
62
|
Args:
|
|
48
63
|
request_id: The request ID to check whether the log tailing process
|
|
49
64
|
should be stopped.
|
|
50
|
-
log_path: The path to the log file
|
|
65
|
+
log_path: The path to the log file or directory containing the log
|
|
66
|
+
files. If it is a directory, all *.log files in the directory will be
|
|
67
|
+
streamed.
|
|
51
68
|
plain_logs: Whether to show plain logs.
|
|
52
69
|
tail: The number of lines to tail. If None, tail the whole file.
|
|
53
70
|
follow: Whether to follow the log file.
|
|
71
|
+
cluster_name: The cluster name to check status for provision logs.
|
|
72
|
+
If provided and cluster status is UP, streaming will terminate.
|
|
54
73
|
"""
|
|
55
74
|
|
|
56
75
|
if request_id is not None:
|
|
76
|
+
start_time = asyncio.get_event_loop().time()
|
|
57
77
|
status_msg = rich_utils.EncodedStatusMessage(
|
|
58
78
|
f'[dim]Checking request: {request_id}[/dim]')
|
|
59
|
-
request_task = await requests_lib.get_request_async(request_id
|
|
79
|
+
request_task = await requests_lib.get_request_async(request_id,
|
|
80
|
+
fields=[
|
|
81
|
+
'request_id',
|
|
82
|
+
'name',
|
|
83
|
+
'schedule_type',
|
|
84
|
+
'status',
|
|
85
|
+
'status_msg'
|
|
86
|
+
])
|
|
60
87
|
|
|
61
88
|
if request_task is None:
|
|
62
89
|
raise fastapi.HTTPException(
|
|
63
90
|
status_code=404, detail=f'Request {request_id} not found')
|
|
64
91
|
request_id = request_task.request_id
|
|
65
92
|
|
|
66
|
-
#
|
|
67
|
-
# request
|
|
93
|
+
# By default, do not show the waiting spinner for SHORT requests.
|
|
94
|
+
# If the request has been stuck in pending for
|
|
95
|
+
# _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
|
|
68
96
|
show_request_waiting_spinner = (not plain_logs and
|
|
69
97
|
request_task.schedule_type
|
|
70
98
|
== requests_lib.ScheduleType.LONG)
|
|
@@ -77,9 +105,23 @@ async def log_streamer(request_id: Optional[str],
|
|
|
77
105
|
f'scheduled: {request_id}')
|
|
78
106
|
req_status = request_task.status
|
|
79
107
|
req_msg = request_task.status_msg
|
|
108
|
+
del request_task
|
|
109
|
+
# Slowly back off the database polling up to every 1 second, to avoid
|
|
110
|
+
# overloading the CPU and DB.
|
|
111
|
+
backoff = common_utils.Backoff(initial_backoff=polling_interval,
|
|
112
|
+
max_backoff_factor=10,
|
|
113
|
+
multiplier=1.2)
|
|
80
114
|
while req_status < requests_lib.RequestStatus.RUNNING:
|
|
115
|
+
current_time = asyncio.get_event_loop().time()
|
|
116
|
+
# Show the waiting spinner for a SHORT request if it has been stuck
|
|
117
|
+
# in pending for _SHORT_REQUEST_SPINNER_TIMEOUT seconds
|
|
118
|
+
if not show_request_waiting_spinner and (
|
|
119
|
+
current_time - start_time > _SHORT_REQUEST_SPINNER_TIMEOUT):
|
|
120
|
+
show_request_waiting_spinner = True
|
|
121
|
+
yield status_msg.init()
|
|
122
|
+
yield status_msg.start()
|
|
81
123
|
if req_msg is not None:
|
|
82
|
-
waiting_msg =
|
|
124
|
+
waiting_msg = req_msg
|
|
83
125
|
if show_request_waiting_spinner:
|
|
84
126
|
yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
|
|
85
127
|
elif plain_logs and waiting_msg != last_waiting_msg:
|
|
@@ -92,7 +134,7 @@ async def log_streamer(request_id: Optional[str],
|
|
|
92
134
|
# TODO(aylei): we should use a better mechanism to avoid busy
|
|
93
135
|
# polling the DB, which can be a bottleneck for high-concurrency
|
|
94
136
|
# requests.
|
|
95
|
-
await asyncio.sleep(
|
|
137
|
+
await asyncio.sleep(backoff.current_backoff())
|
|
96
138
|
status_with_msg = await requests_lib.get_request_status_async(
|
|
97
139
|
request_id, include_msg=True)
|
|
98
140
|
req_status = status_with_msg.status
|
|
@@ -102,17 +144,42 @@ async def log_streamer(request_id: Optional[str],
|
|
|
102
144
|
if show_request_waiting_spinner:
|
|
103
145
|
yield status_msg.stop()
|
|
104
146
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
147
|
+
# worker node provision logs
|
|
148
|
+
if log_path is not None and log_path.is_dir():
|
|
149
|
+
# Get all *.log files in the log_path dir
|
|
150
|
+
log_files = sorted(log_path.glob('*.log'))
|
|
151
|
+
|
|
152
|
+
for log_file_path in log_files:
|
|
153
|
+
# Add header before each file (similar to tail -f behavior)
|
|
154
|
+
header = f'\n==> {log_file_path} <==\n\n'
|
|
155
|
+
yield header
|
|
156
|
+
|
|
157
|
+
async with aiofiles.open(log_file_path, 'rb') as f:
|
|
158
|
+
async for chunk in _tail_log_file(f, request_id, plain_logs,
|
|
159
|
+
tail, follow, cluster_name,
|
|
160
|
+
polling_interval):
|
|
161
|
+
yield chunk
|
|
162
|
+
|
|
163
|
+
# api server request logs (if request_id is provided) or
|
|
164
|
+
# head node provision logs (if cluster_name is provided)
|
|
165
|
+
else:
|
|
166
|
+
assert log_path is not None, (request_id, cluster_name)
|
|
167
|
+
async with aiofiles.open(log_path, 'rb') as f:
|
|
168
|
+
async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
|
|
169
|
+
follow, cluster_name,
|
|
170
|
+
polling_interval):
|
|
171
|
+
yield chunk
|
|
109
172
|
|
|
110
173
|
|
|
111
|
-
async def _tail_log_file(
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
174
|
+
async def _tail_log_file(
|
|
175
|
+
f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
176
|
+
request_id: Optional[str] = None,
|
|
177
|
+
plain_logs: bool = False,
|
|
178
|
+
tail: Optional[int] = None,
|
|
179
|
+
follow: bool = True,
|
|
180
|
+
cluster_name: Optional[str] = None,
|
|
181
|
+
polling_interval: float = DEFAULT_POLL_INTERVAL
|
|
182
|
+
) -> AsyncGenerator[str, None]:
|
|
116
183
|
"""Tail the opened log file, buffer the lines and flush in chunks."""
|
|
117
184
|
|
|
118
185
|
if tail is not None:
|
|
@@ -128,6 +195,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
128
195
|
yield line_str
|
|
129
196
|
|
|
130
197
|
last_heartbeat_time = asyncio.get_event_loop().time()
|
|
198
|
+
last_status_check_time = asyncio.get_event_loop().time()
|
|
131
199
|
|
|
132
200
|
# Buffer the lines in memory and flush them in chunks to improve log
|
|
133
201
|
# tailing throughput.
|
|
@@ -135,6 +203,9 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
135
203
|
buffer_bytes = 0
|
|
136
204
|
last_flush_time = asyncio.get_event_loop().time()
|
|
137
205
|
|
|
206
|
+
# Read file in chunks instead of line-by-line for better performance
|
|
207
|
+
incomplete_line = b'' # Buffer for incomplete lines across chunks
|
|
208
|
+
|
|
138
209
|
async def flush_buffer() -> AsyncGenerator[str, None]:
|
|
139
210
|
nonlocal buffer, buffer_bytes, last_flush_time
|
|
140
211
|
if buffer:
|
|
@@ -155,16 +226,41 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
155
226
|
async for chunk in flush_buffer():
|
|
156
227
|
yield chunk
|
|
157
228
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
229
|
+
# Read file in chunks for better I/O performance
|
|
230
|
+
file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
|
|
231
|
+
if not file_chunk:
|
|
232
|
+
# Process any remaining incomplete line
|
|
233
|
+
if incomplete_line:
|
|
234
|
+
line_str = incomplete_line.decode('utf-8')
|
|
235
|
+
if plain_logs:
|
|
236
|
+
is_payload, line_str = message_utils.decode_payload(
|
|
237
|
+
line_str, raise_for_mismatch=False)
|
|
238
|
+
if not is_payload:
|
|
239
|
+
buffer.append(line_str)
|
|
240
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
241
|
+
else:
|
|
242
|
+
buffer.append(line_str)
|
|
243
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
244
|
+
incomplete_line = b''
|
|
245
|
+
|
|
246
|
+
# Avoid checking the status too frequently to avoid overloading the
|
|
247
|
+
# DB.
|
|
248
|
+
should_check_status = (current_time -
|
|
249
|
+
last_status_check_time) >= polling_interval
|
|
250
|
+
if not follow:
|
|
251
|
+
# We will only hit this path once, but we should make sure to
|
|
252
|
+
# check the status so that we display the final request status
|
|
253
|
+
# if the request is complete.
|
|
254
|
+
should_check_status = True
|
|
255
|
+
if request_id is not None and should_check_status:
|
|
256
|
+
last_status_check_time = current_time
|
|
161
257
|
req_status = await requests_lib.get_request_status_async(
|
|
162
258
|
request_id)
|
|
163
259
|
if req_status.status > requests_lib.RequestStatus.RUNNING:
|
|
164
260
|
if (req_status.status ==
|
|
165
261
|
requests_lib.RequestStatus.CANCELLED):
|
|
166
262
|
request_task = await requests_lib.get_request_async(
|
|
167
|
-
request_id)
|
|
263
|
+
request_id, fields=['name', 'should_retry'])
|
|
168
264
|
if request_task.should_retry:
|
|
169
265
|
buffer.append(
|
|
170
266
|
message_utils.encode_payload(
|
|
@@ -173,10 +269,44 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
173
269
|
buffer.append(
|
|
174
270
|
f'{request_task.name!r} request {request_id}'
|
|
175
271
|
' cancelled\n')
|
|
272
|
+
del request_task
|
|
176
273
|
break
|
|
177
274
|
if not follow:
|
|
275
|
+
# The below checks (cluster status, heartbeat) are not needed
|
|
276
|
+
# for non-follow logs.
|
|
178
277
|
break
|
|
179
|
-
|
|
278
|
+
# Provision logs pass in cluster_name, check cluster status
|
|
279
|
+
# periodically to see if provisioning is done.
|
|
280
|
+
if cluster_name is not None:
|
|
281
|
+
if should_check_status:
|
|
282
|
+
last_status_check_time = current_time
|
|
283
|
+
cluster_status = await (
|
|
284
|
+
global_user_state.get_status_from_cluster_name_async(
|
|
285
|
+
cluster_name))
|
|
286
|
+
if cluster_status is None:
|
|
287
|
+
logger.debug(
|
|
288
|
+
'Stop tailing provision logs for cluster'
|
|
289
|
+
f' status for cluster {cluster_name} not found')
|
|
290
|
+
break
|
|
291
|
+
# if the cluster is not in INIT state (UP or STOPPED),
|
|
292
|
+
# stop tailing provision logs
|
|
293
|
+
if cluster_status != status_lib.ClusterStatus.INIT:
|
|
294
|
+
logger.debug(
|
|
295
|
+
f'Stop tailing provision logs for cluster'
|
|
296
|
+
f' {cluster_name} has status {cluster_status} '
|
|
297
|
+
'(not in INIT state)')
|
|
298
|
+
break
|
|
299
|
+
req_filter = requests_lib.RequestTaskFilter(
|
|
300
|
+
status=[requests_lib.RequestStatus.RUNNING],
|
|
301
|
+
cluster_names=[cluster_name],
|
|
302
|
+
include_request_names=['sky.launch'],
|
|
303
|
+
fields=['cluster_name'])
|
|
304
|
+
req_tasks = await requests_lib.get_request_tasks_async(
|
|
305
|
+
req_filter)
|
|
306
|
+
# if the cluster is in INIT state and there is no ongoing
|
|
307
|
+
# launch request, stop tailing provision logs
|
|
308
|
+
if len(req_tasks) == 0:
|
|
309
|
+
break
|
|
180
310
|
if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
|
|
181
311
|
# Currently just used to keep the connection busy, refer to
|
|
182
312
|
# https://github.com/skypilot-org/skypilot/issues/5750 for
|
|
@@ -196,38 +326,82 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
|
|
196
326
|
# performance but it helps avoid unnecessary heartbeat strings
|
|
197
327
|
# being printed when the client runs in an old version.
|
|
198
328
|
last_heartbeat_time = asyncio.get_event_loop().time()
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
329
|
+
|
|
330
|
+
# Combine with any incomplete line from previous chunk
|
|
331
|
+
file_chunk = incomplete_line + file_chunk
|
|
332
|
+
incomplete_line = b''
|
|
333
|
+
|
|
334
|
+
# Split chunk into lines, preserving line structure
|
|
335
|
+
lines_bytes = file_chunk.split(b'\n')
|
|
336
|
+
|
|
337
|
+
# If chunk doesn't end with newline, the last element is incomplete
|
|
338
|
+
if file_chunk and not file_chunk.endswith(b'\n'):
|
|
339
|
+
incomplete_line = lines_bytes[-1]
|
|
340
|
+
lines_bytes = lines_bytes[:-1]
|
|
341
|
+
else:
|
|
342
|
+
# If ends with \n, split creates an empty last element we should
|
|
343
|
+
# ignore
|
|
344
|
+
if lines_bytes and lines_bytes[-1] == b'':
|
|
345
|
+
lines_bytes = lines_bytes[:-1]
|
|
346
|
+
|
|
347
|
+
# Process all complete lines in this chunk
|
|
348
|
+
for line_bytes in lines_bytes:
|
|
349
|
+
# Reconstruct line with newline (since split removed it)
|
|
350
|
+
line_str = line_bytes.decode('utf-8') + '\n'
|
|
351
|
+
|
|
352
|
+
if plain_logs:
|
|
353
|
+
is_payload, line_str = message_utils.decode_payload(
|
|
354
|
+
line_str, raise_for_mismatch=False)
|
|
355
|
+
# TODO(aylei): implement heartbeat mechanism for plain logs,
|
|
356
|
+
# sending invisible characters might be okay.
|
|
357
|
+
if is_payload:
|
|
358
|
+
continue
|
|
359
|
+
|
|
360
|
+
buffer.append(line_str)
|
|
361
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
209
362
|
|
|
210
363
|
# Flush remaining lines in the buffer.
|
|
211
364
|
async for chunk in flush_buffer():
|
|
212
365
|
yield chunk
|
|
213
366
|
|
|
214
367
|
|
|
368
|
+
def stream_response_for_long_request(
|
|
369
|
+
request_id: str,
|
|
370
|
+
logs_path: pathlib.Path,
|
|
371
|
+
background_tasks: fastapi.BackgroundTasks,
|
|
372
|
+
kill_request_on_disconnect: bool = True,
|
|
373
|
+
) -> fastapi.responses.StreamingResponse:
|
|
374
|
+
"""Stream the logs of a long request."""
|
|
375
|
+
return stream_response(
|
|
376
|
+
request_id,
|
|
377
|
+
logs_path,
|
|
378
|
+
background_tasks,
|
|
379
|
+
polling_interval=LONG_REQUEST_POLL_INTERVAL,
|
|
380
|
+
kill_request_on_disconnect=kill_request_on_disconnect,
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
|
|
215
384
|
def stream_response(
|
|
216
|
-
request_id: str,
|
|
217
|
-
|
|
385
|
+
request_id: str,
|
|
386
|
+
logs_path: pathlib.Path,
|
|
387
|
+
background_tasks: fastapi.BackgroundTasks,
|
|
388
|
+
polling_interval: float = DEFAULT_POLL_INTERVAL,
|
|
389
|
+
kill_request_on_disconnect: bool = True,
|
|
218
390
|
) -> fastapi.responses.StreamingResponse:
|
|
219
391
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
392
|
+
if kill_request_on_disconnect:
|
|
393
|
+
|
|
394
|
+
async def on_disconnect():
|
|
395
|
+
logger.info(f'User terminated the connection for request '
|
|
396
|
+
f'{request_id}')
|
|
397
|
+
await requests_lib.kill_request_async(request_id)
|
|
224
398
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
399
|
+
# The background task will be run after returning a response.
|
|
400
|
+
# https://fastapi.tiangolo.com/tutorial/background-tasks/
|
|
401
|
+
background_tasks.add_task(on_disconnect)
|
|
228
402
|
|
|
229
403
|
return fastapi.responses.StreamingResponse(
|
|
230
|
-
log_streamer(request_id, logs_path),
|
|
404
|
+
log_streamer(request_id, logs_path, polling_interval=polling_interval),
|
|
231
405
|
media_type='text/plain',
|
|
232
406
|
headers={
|
|
233
407
|
'Cache-Control': 'no-cache, no-transform',
|
sky/server/uvicorn.py
CHANGED
|
@@ -19,6 +19,7 @@ from uvicorn.supervisors import multiprocess
|
|
|
19
19
|
|
|
20
20
|
from sky import sky_logging
|
|
21
21
|
from sky.server import daemons
|
|
22
|
+
from sky.server import metrics as metrics_lib
|
|
22
23
|
from sky.server import state
|
|
23
24
|
from sky.server.requests import requests as requests_lib
|
|
24
25
|
from sky.skylet import constants
|
|
@@ -45,11 +46,11 @@ except ValueError:
|
|
|
45
46
|
|
|
46
47
|
# TODO(aylei): use decorator to register requests that need to be proactively
|
|
47
48
|
# cancelled instead of hardcoding here.
|
|
48
|
-
_RETRIABLE_REQUEST_NAMES =
|
|
49
|
+
_RETRIABLE_REQUEST_NAMES = {
|
|
49
50
|
'sky.logs',
|
|
50
51
|
'sky.jobs.logs',
|
|
51
52
|
'sky.serve.logs',
|
|
52
|
-
|
|
53
|
+
}
|
|
53
54
|
|
|
54
55
|
|
|
55
56
|
def add_timestamp_prefix_for_server_logs() -> None:
|
|
@@ -150,37 +151,38 @@ class Server(uvicorn.Server):
|
|
|
150
151
|
requests_lib.RequestStatus.PENDING,
|
|
151
152
|
requests_lib.RequestStatus.RUNNING,
|
|
152
153
|
]
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
154
|
+
requests = [(request_task.request_id, request_task.name)
|
|
155
|
+
for request_task in requests_lib.get_request_tasks(
|
|
156
|
+
req_filter=requests_lib.RequestTaskFilter(
|
|
157
|
+
status=statuses, fields=['request_id', 'name']))
|
|
158
|
+
]
|
|
159
|
+
if not requests:
|
|
156
160
|
break
|
|
157
|
-
logger.info(f'{len(
|
|
161
|
+
logger.info(f'{len(requests)} on-going requests '
|
|
158
162
|
'found, waiting for them to finish...')
|
|
159
163
|
# Proactively cancel internal requests and logs requests since
|
|
160
164
|
# they can run for infinite time.
|
|
161
|
-
internal_request_ids =
|
|
165
|
+
internal_request_ids = {
|
|
162
166
|
d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
|
|
163
|
-
|
|
167
|
+
}
|
|
164
168
|
if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
|
|
165
169
|
logger.warning('Timeout waiting for on-going requests to '
|
|
166
170
|
'finish, cancelling all on-going requests.')
|
|
167
|
-
for
|
|
168
|
-
self.interrupt_request_for_retry(
|
|
171
|
+
for request_id, _ in requests:
|
|
172
|
+
self.interrupt_request_for_retry(request_id)
|
|
169
173
|
break
|
|
170
174
|
interrupted = 0
|
|
171
|
-
for
|
|
172
|
-
if
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
elif req.name in _RETRIABLE_REQUEST_NAMES:
|
|
176
|
-
self.interrupt_request_for_retry(req.request_id)
|
|
175
|
+
for request_id, name in requests:
|
|
176
|
+
if (name in _RETRIABLE_REQUEST_NAMES or
|
|
177
|
+
request_id in internal_request_ids):
|
|
178
|
+
self.interrupt_request_for_retry(request_id)
|
|
177
179
|
interrupted += 1
|
|
178
180
|
# TODO(aylei): interrupt pending requests to accelerate the
|
|
179
181
|
# shutdown.
|
|
180
182
|
# If some requests are not interrupted, wait for them to finish,
|
|
181
183
|
# otherwise we just check again immediately to accelerate the
|
|
182
184
|
# shutdown process.
|
|
183
|
-
if interrupted < len(
|
|
185
|
+
if interrupted < len(requests):
|
|
184
186
|
time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
|
|
185
187
|
|
|
186
188
|
def interrupt_request_for_retry(self, request_id: str) -> None:
|
|
@@ -212,8 +214,17 @@ class Server(uvicorn.Server):
|
|
|
212
214
|
# Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
|
|
213
215
|
event_loop.set_debug(True)
|
|
214
216
|
event_loop.slow_callback_duration = lag_threshold
|
|
215
|
-
|
|
216
|
-
|
|
217
|
+
stop_monitor = threading.Event()
|
|
218
|
+
monitor = threading.Thread(target=metrics_lib.process_monitor,
|
|
219
|
+
args=('server', stop_monitor),
|
|
220
|
+
daemon=True)
|
|
221
|
+
monitor.start()
|
|
222
|
+
try:
|
|
223
|
+
with self.capture_signals():
|
|
224
|
+
asyncio.run(self.serve(*args, **kwargs))
|
|
225
|
+
finally:
|
|
226
|
+
stop_monitor.set()
|
|
227
|
+
monitor.join()
|
|
217
228
|
|
|
218
229
|
|
|
219
230
|
def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):
|
sky/setup_files/MANIFEST.in
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
include sky/backends/monkey_patches/*.py
|
|
2
|
-
exclude sky/
|
|
2
|
+
exclude sky/catalog/data_fetchers/analyze.py
|
|
3
3
|
include sky/provision/kubernetes/manifests/*
|
|
4
4
|
include sky/provision/azure/*
|
|
5
5
|
include sky/setup_files/*
|
|
@@ -21,3 +21,8 @@ include sky/users/*.conf
|
|
|
21
21
|
include sky/utils/*.sh
|
|
22
22
|
include sky/setup_files/alembic.ini
|
|
23
23
|
recursive-include sky/schemas/db *
|
|
24
|
+
|
|
25
|
+
# SkyPilot templates package
|
|
26
|
+
recursive-include sky_templates/ray *
|
|
27
|
+
recursive-include sky_templates *.py
|
|
28
|
+
include sky_templates/README.md
|
sky/setup_files/alembic.ini
CHANGED
|
@@ -98,6 +98,14 @@ version_table = alembic_version_spot_jobs_db
|
|
|
98
98
|
version_locations = %(here)s/../schemas/db/serve_state
|
|
99
99
|
version_table = alembic_version_serve_state_db
|
|
100
100
|
|
|
101
|
+
[sky_config_db]
|
|
102
|
+
version_locations = %(here)s/../schemas/db/skypilot_config
|
|
103
|
+
version_table = alembic_version_sky_config_db
|
|
104
|
+
|
|
105
|
+
[kv_cache_db]
|
|
106
|
+
version_locations = %(here)s/../schemas/db/kv_cache
|
|
107
|
+
version_table = alembic_version_kv_cache_db
|
|
108
|
+
|
|
101
109
|
[post_write_hooks]
|
|
102
110
|
# post_write_hooks defines scripts or Python functions that are run
|
|
103
111
|
# on newly generated revision scripts. See the documentation for further
|