skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/requests/executor.py
CHANGED
|
@@ -31,6 +31,7 @@ import time
|
|
|
31
31
|
import typing
|
|
32
32
|
from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
|
|
33
33
|
|
|
34
|
+
import psutil
|
|
34
35
|
import setproctitle
|
|
35
36
|
|
|
36
37
|
from sky import exceptions
|
|
@@ -38,6 +39,7 @@ from sky import global_user_state
|
|
|
38
39
|
from sky import models
|
|
39
40
|
from sky import sky_logging
|
|
40
41
|
from sky import skypilot_config
|
|
42
|
+
from sky.metrics import utils as metrics_utils
|
|
41
43
|
from sky.server import common as server_common
|
|
42
44
|
from sky.server import config as server_config
|
|
43
45
|
from sky.server import constants as server_constants
|
|
@@ -45,7 +47,9 @@ from sky.server import metrics as metrics_lib
|
|
|
45
47
|
from sky.server.requests import payloads
|
|
46
48
|
from sky.server.requests import preconditions
|
|
47
49
|
from sky.server.requests import process
|
|
50
|
+
from sky.server.requests import request_names
|
|
48
51
|
from sky.server.requests import requests as api_requests
|
|
52
|
+
from sky.server.requests import threads
|
|
49
53
|
from sky.server.requests.queues import local_queue
|
|
50
54
|
from sky.server.requests.queues import mp_queue
|
|
51
55
|
from sky.skylet import constants
|
|
@@ -79,6 +83,31 @@ logger = sky_logging.init_logger(__name__)
|
|
|
79
83
|
# platforms, including macOS.
|
|
80
84
|
multiprocessing.set_start_method('spawn', force=True)
|
|
81
85
|
|
|
86
|
+
# An upper limit of max threads for request execution per server process that
|
|
87
|
+
# unlikely to be reached to allow higher concurrency while still prevent the
|
|
88
|
+
# server process become overloaded.
|
|
89
|
+
_REQUEST_THREADS_LIMIT = 128
|
|
90
|
+
|
|
91
|
+
_REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
|
|
92
|
+
# A dedicated thread pool executor for synced requests execution in coroutine to
|
|
93
|
+
# avoid:
|
|
94
|
+
# 1. blocking the event loop;
|
|
95
|
+
# 2. exhausting the default thread pool executor of event loop;
|
|
96
|
+
_REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
|
|
100
|
+
"""Lazy init and return the request thread executor for current process."""
|
|
101
|
+
global _REQUEST_THREAD_EXECUTOR
|
|
102
|
+
if _REQUEST_THREAD_EXECUTOR is not None:
|
|
103
|
+
return _REQUEST_THREAD_EXECUTOR
|
|
104
|
+
with _REQUEST_THREAD_EXECUTOR_LOCK:
|
|
105
|
+
if _REQUEST_THREAD_EXECUTOR is None:
|
|
106
|
+
_REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
|
|
107
|
+
name='request_thread_executor',
|
|
108
|
+
max_workers=_REQUEST_THREADS_LIMIT)
|
|
109
|
+
return _REQUEST_THREAD_EXECUTOR
|
|
110
|
+
|
|
82
111
|
|
|
83
112
|
class RequestQueue:
|
|
84
113
|
"""The queue for the requests, either redis or multiprocessing.
|
|
@@ -130,6 +159,10 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
|
|
|
130
159
|
def executor_initializer(proc_group: str):
|
|
131
160
|
setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
|
|
132
161
|
f'{multiprocessing.current_process().pid}')
|
|
162
|
+
# Executor never stops, unless the whole process is killed.
|
|
163
|
+
threading.Thread(target=metrics_lib.process_monitor,
|
|
164
|
+
args=(f'worker:{proc_group}', threading.Event()),
|
|
165
|
+
daemon=True).start()
|
|
133
166
|
|
|
134
167
|
|
|
135
168
|
class RequestWorker:
|
|
@@ -182,10 +215,11 @@ class RequestWorker:
|
|
|
182
215
|
time.sleep(0.1)
|
|
183
216
|
return
|
|
184
217
|
request_id, ignore_return_value, _ = request_element
|
|
185
|
-
request = api_requests.get_request(request_id)
|
|
218
|
+
request = api_requests.get_request(request_id, fields=['status'])
|
|
186
219
|
assert request is not None, f'Request with ID {request_id} is None'
|
|
187
220
|
if request.status == api_requests.RequestStatus.CANCELLED:
|
|
188
221
|
return
|
|
222
|
+
del request
|
|
189
223
|
logger.info(f'[{self}] Submitting request: {request_id}')
|
|
190
224
|
# Start additional process to run the request, so that it can be
|
|
191
225
|
# cancelled when requested by a user.
|
|
@@ -196,6 +230,12 @@ class RequestWorker:
|
|
|
196
230
|
fut = executor.submit_until_success(
|
|
197
231
|
_request_execution_wrapper, request_id, ignore_return_value,
|
|
198
232
|
self.num_db_connections_per_worker)
|
|
233
|
+
# Decrement the free executor count when a request starts
|
|
234
|
+
if metrics_utils.METRICS_ENABLED:
|
|
235
|
+
if self.schedule_type == api_requests.ScheduleType.LONG:
|
|
236
|
+
metrics_utils.SKY_APISERVER_LONG_EXECUTORS.dec()
|
|
237
|
+
elif self.schedule_type == api_requests.ScheduleType.SHORT:
|
|
238
|
+
metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.dec()
|
|
199
239
|
# Monitor the result of the request execution.
|
|
200
240
|
threading.Thread(target=self.handle_task_result,
|
|
201
241
|
args=(fut, request_element),
|
|
@@ -230,9 +270,23 @@ class RequestWorker:
|
|
|
230
270
|
queue.put(request_element)
|
|
231
271
|
except exceptions.ExecutionRetryableError as e:
|
|
232
272
|
time.sleep(e.retry_wait_seconds)
|
|
273
|
+
# Reset the request status to PENDING so it can be picked up again.
|
|
274
|
+
# Assume retryable since the error is ExecutionRetryableError.
|
|
275
|
+
request_id, _, _ = request_element
|
|
276
|
+
with api_requests.update_request(request_id) as request_task:
|
|
277
|
+
assert request_task is not None, request_id
|
|
278
|
+
request_task.status = api_requests.RequestStatus.PENDING
|
|
233
279
|
# Reschedule the request.
|
|
234
280
|
queue = _get_queue(self.schedule_type)
|
|
235
281
|
queue.put(request_element)
|
|
282
|
+
logger.info(f'Rescheduled request {request_id} for retry')
|
|
283
|
+
finally:
|
|
284
|
+
# Increment the free executor count when a request finishes
|
|
285
|
+
if metrics_utils.METRICS_ENABLED:
|
|
286
|
+
if self.schedule_type == api_requests.ScheduleType.LONG:
|
|
287
|
+
metrics_utils.SKY_APISERVER_LONG_EXECUTORS.inc()
|
|
288
|
+
elif self.schedule_type == api_requests.ScheduleType.SHORT:
|
|
289
|
+
metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.inc()
|
|
236
290
|
|
|
237
291
|
def run(self) -> None:
|
|
238
292
|
# Handle the SIGTERM signal to abort the executor process gracefully.
|
|
@@ -254,6 +308,16 @@ class RequestWorker:
|
|
|
254
308
|
burst_workers=self.burstable_parallelism,
|
|
255
309
|
initializer=executor_initializer,
|
|
256
310
|
initargs=(proc_group,))
|
|
311
|
+
# Initialize the appropriate gauge for the number of free executors
|
|
312
|
+
total_executors = (self.garanteed_parallelism +
|
|
313
|
+
self.burstable_parallelism)
|
|
314
|
+
if metrics_utils.METRICS_ENABLED:
|
|
315
|
+
if self.schedule_type == api_requests.ScheduleType.LONG:
|
|
316
|
+
metrics_utils.SKY_APISERVER_LONG_EXECUTORS.set(
|
|
317
|
+
total_executors)
|
|
318
|
+
elif self.schedule_type == api_requests.ScheduleType.SHORT:
|
|
319
|
+
metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.set(
|
|
320
|
+
total_executors)
|
|
257
321
|
while not self._cancel_event.is_set():
|
|
258
322
|
self.process_request(executor, queue)
|
|
259
323
|
# TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
|
|
@@ -277,43 +341,56 @@ def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
|
|
|
277
341
|
|
|
278
342
|
@contextlib.contextmanager
|
|
279
343
|
def override_request_env_and_config(
|
|
280
|
-
request_body: payloads.RequestBody,
|
|
281
|
-
|
|
344
|
+
request_body: payloads.RequestBody, request_id: str,
|
|
345
|
+
request_name: str) -> Generator[None, None, None]:
|
|
282
346
|
"""Override the environment and SkyPilot config for a request."""
|
|
283
347
|
original_env = os.environ.copy()
|
|
284
|
-
# Unset SKYPILOT_DEBUG by default, to avoid the value set on the API server
|
|
285
|
-
# affecting client requests. If set on the client side, it will be
|
|
286
|
-
# overridden by the request body.
|
|
287
|
-
os.environ.pop('SKYPILOT_DEBUG', None)
|
|
288
|
-
os.environ.update(request_body.env_vars)
|
|
289
|
-
# Note: may be overridden by AuthProxyMiddleware.
|
|
290
|
-
# TODO(zhwu): we need to make the entire request a context available to the
|
|
291
|
-
# entire request execution, so that we can access info like user through
|
|
292
|
-
# the execution.
|
|
293
|
-
user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
|
|
294
|
-
name=request_body.env_vars[constants.USER_ENV_VAR])
|
|
295
|
-
global_user_state.add_or_update_user(user)
|
|
296
|
-
# Refetch the user to get the latest user info, including the created_at
|
|
297
|
-
# field.
|
|
298
|
-
user = global_user_state.get_user(user.id)
|
|
299
|
-
|
|
300
|
-
# Force color to be enabled.
|
|
301
|
-
os.environ['CLICOLOR_FORCE'] = '1'
|
|
302
|
-
server_common.reload_for_new_request(
|
|
303
|
-
client_entrypoint=request_body.entrypoint,
|
|
304
|
-
client_command=request_body.entrypoint_command,
|
|
305
|
-
using_remote_api_server=request_body.using_remote_api_server,
|
|
306
|
-
user=user,
|
|
307
|
-
request_id=request_id)
|
|
308
348
|
try:
|
|
349
|
+
# Unset SKYPILOT_DEBUG by default, to avoid the value set on the API
|
|
350
|
+
# server affecting client requests. If set on the client side, it will
|
|
351
|
+
# be overridden by the request body.
|
|
352
|
+
os.environ.pop('SKYPILOT_DEBUG', None)
|
|
353
|
+
# Remove the db connection uri from client supplied env vars, as the
|
|
354
|
+
# client should not set the db string on server side.
|
|
355
|
+
request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
|
|
356
|
+
os.environ.update(request_body.env_vars)
|
|
357
|
+
# Note: may be overridden by AuthProxyMiddleware.
|
|
358
|
+
# TODO(zhwu): we need to make the entire request a context available to
|
|
359
|
+
# the entire request execution, so that we can access info like user
|
|
360
|
+
# through the execution.
|
|
361
|
+
user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
|
|
362
|
+
name=request_body.env_vars[constants.USER_ENV_VAR])
|
|
363
|
+
_, user = global_user_state.add_or_update_user(user, return_user=True)
|
|
364
|
+
|
|
365
|
+
# Force color to be enabled.
|
|
366
|
+
os.environ['CLICOLOR_FORCE'] = '1'
|
|
367
|
+
server_common.reload_for_new_request(
|
|
368
|
+
client_entrypoint=request_body.entrypoint,
|
|
369
|
+
client_command=request_body.entrypoint_command,
|
|
370
|
+
using_remote_api_server=request_body.using_remote_api_server,
|
|
371
|
+
user=user,
|
|
372
|
+
request_id=request_id)
|
|
309
373
|
logger.debug(
|
|
310
374
|
f'override path: {request_body.override_skypilot_config_path}')
|
|
311
375
|
with skypilot_config.override_skypilot_config(
|
|
312
376
|
request_body.override_skypilot_config,
|
|
313
377
|
request_body.override_skypilot_config_path):
|
|
314
|
-
#
|
|
315
|
-
#
|
|
316
|
-
|
|
378
|
+
# Skip permission check for sky.workspaces.get request
|
|
379
|
+
# as it is used to determine which workspaces the user
|
|
380
|
+
# has access to.
|
|
381
|
+
if request_name != 'sky.workspaces.get':
|
|
382
|
+
try:
|
|
383
|
+
# Reject requests that the user does not have permission
|
|
384
|
+
# to access.
|
|
385
|
+
workspaces_core.reject_request_for_unauthorized_workspace(
|
|
386
|
+
user)
|
|
387
|
+
except exceptions.PermissionDeniedError as e:
|
|
388
|
+
logger.debug(
|
|
389
|
+
f'{request_id} permission denied to workspace: '
|
|
390
|
+
f'{skypilot_config.get_active_workspace()}: {e}')
|
|
391
|
+
raise e
|
|
392
|
+
logger.debug(
|
|
393
|
+
f'{request_id} permission granted to {request_name} request')
|
|
317
394
|
yield
|
|
318
395
|
finally:
|
|
319
396
|
# We need to call the save_timeline() since atexit will not be
|
|
@@ -327,29 +404,6 @@ def override_request_env_and_config(
|
|
|
327
404
|
os.environ.update(original_env)
|
|
328
405
|
|
|
329
406
|
|
|
330
|
-
def _redirect_output(file: TextIO) -> Tuple[int, int]:
|
|
331
|
-
"""Redirect stdout and stderr to the log file."""
|
|
332
|
-
fd = file.fileno() # Get the file descriptor from the file object
|
|
333
|
-
# Store copies of the original stdout and stderr file descriptors
|
|
334
|
-
original_stdout = os.dup(sys.stdout.fileno())
|
|
335
|
-
original_stderr = os.dup(sys.stderr.fileno())
|
|
336
|
-
|
|
337
|
-
# Copy this fd to stdout and stderr
|
|
338
|
-
os.dup2(fd, sys.stdout.fileno())
|
|
339
|
-
os.dup2(fd, sys.stderr.fileno())
|
|
340
|
-
return original_stdout, original_stderr
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
def _restore_output(original_stdout: int, original_stderr: int) -> None:
|
|
344
|
-
"""Restore stdout and stderr to their original file descriptors."""
|
|
345
|
-
os.dup2(original_stdout, sys.stdout.fileno())
|
|
346
|
-
os.dup2(original_stderr, sys.stderr.fileno())
|
|
347
|
-
|
|
348
|
-
# Close the duplicate file descriptors
|
|
349
|
-
os.close(original_stdout)
|
|
350
|
-
os.close(original_stderr)
|
|
351
|
-
|
|
352
|
-
|
|
353
407
|
def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
|
|
354
408
|
raise KeyboardInterrupt
|
|
355
409
|
|
|
@@ -367,76 +421,226 @@ def _request_execution_wrapper(request_id: str,
|
|
|
367
421
|
4. Handle the SIGTERM signal to abort the request gracefully.
|
|
368
422
|
5. Maintain the lifecycle of the temp dir used by the request.
|
|
369
423
|
"""
|
|
424
|
+
pid = multiprocessing.current_process().pid
|
|
425
|
+
proc = psutil.Process(pid)
|
|
426
|
+
rss_begin = proc.memory_info().rss
|
|
370
427
|
db_utils.set_max_connections(num_db_connections_per_worker)
|
|
371
428
|
# Handle the SIGTERM signal to abort the request processing gracefully.
|
|
372
|
-
signal.signal(
|
|
429
|
+
# Only set up signal handlers in the main thread, as signal.signal() raises
|
|
430
|
+
# ValueError if called from a non-main thread (e.g., in tests).
|
|
431
|
+
if threading.current_thread() is threading.main_thread():
|
|
432
|
+
signal.signal(signal.SIGTERM, _sigterm_handler)
|
|
373
433
|
|
|
374
|
-
pid = multiprocessing.current_process().pid
|
|
375
434
|
logger.info(f'Running request {request_id} with pid {pid}')
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
435
|
+
|
|
436
|
+
original_stdout = original_stderr = None
|
|
437
|
+
|
|
438
|
+
def _save_current_output() -> None:
|
|
439
|
+
"""Save the current stdout and stderr file descriptors."""
|
|
440
|
+
nonlocal original_stdout, original_stderr
|
|
441
|
+
original_stdout = os.dup(sys.stdout.fileno())
|
|
442
|
+
original_stderr = os.dup(sys.stderr.fileno())
|
|
443
|
+
|
|
444
|
+
def _redirect_output(file: TextIO) -> None:
|
|
445
|
+
"""Redirect stdout and stderr to the log file."""
|
|
446
|
+
# Get the file descriptor from the file object
|
|
447
|
+
fd = file.fileno()
|
|
448
|
+
# Copy this fd to stdout and stderr
|
|
449
|
+
os.dup2(fd, sys.stdout.fileno())
|
|
450
|
+
os.dup2(fd, sys.stderr.fileno())
|
|
451
|
+
|
|
452
|
+
def _restore_output() -> None:
|
|
453
|
+
"""Restore stdout and stderr to their original file descriptors."""
|
|
454
|
+
nonlocal original_stdout, original_stderr
|
|
455
|
+
if original_stdout is not None:
|
|
456
|
+
os.dup2(original_stdout, sys.stdout.fileno())
|
|
457
|
+
os.close(original_stdout)
|
|
458
|
+
original_stdout = None
|
|
459
|
+
|
|
460
|
+
if original_stderr is not None:
|
|
461
|
+
os.dup2(original_stderr, sys.stderr.fileno())
|
|
462
|
+
os.close(original_stderr)
|
|
463
|
+
original_stderr = None
|
|
464
|
+
|
|
465
|
+
request_name = None
|
|
466
|
+
try:
|
|
467
|
+
# As soon as the request is updated with the executor PID, we can
|
|
468
|
+
# receive SIGTERM from cancellation. So, we update the request inside
|
|
469
|
+
# the try block to ensure we have the KeyboardInterrupt handling.
|
|
470
|
+
with api_requests.update_request(request_id) as request_task:
|
|
471
|
+
assert request_task is not None, request_id
|
|
472
|
+
if request_task.status != api_requests.RequestStatus.PENDING:
|
|
473
|
+
logger.debug(f'Request is already {request_task.status.value}, '
|
|
474
|
+
f'skipping execution')
|
|
475
|
+
return
|
|
476
|
+
log_path = request_task.log_path
|
|
477
|
+
request_task.pid = pid
|
|
478
|
+
request_task.status = api_requests.RequestStatus.RUNNING
|
|
479
|
+
func = request_task.entrypoint
|
|
480
|
+
request_body = request_task.request_body
|
|
481
|
+
request_name = request_task.name
|
|
482
|
+
|
|
388
483
|
# Store copies of the original stdout and stderr file descriptors
|
|
389
|
-
|
|
390
|
-
#
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
484
|
+
# We do this in two steps because we should make sure to restore the
|
|
485
|
+
# original values even if we are cancelled or fail during the redirect.
|
|
486
|
+
_save_current_output()
|
|
487
|
+
|
|
488
|
+
# Append to the log file instead of overwriting it since there might be
|
|
489
|
+
# logs from previous retries.
|
|
490
|
+
with log_path.open('a', encoding='utf-8') as f:
|
|
491
|
+
# Redirect the stdout/stderr before overriding the environment and
|
|
492
|
+
# config, as there can be some logs during override that need to be
|
|
493
|
+
# captured in the log file.
|
|
494
|
+
_redirect_output(f)
|
|
495
|
+
|
|
394
496
|
with sky_logging.add_debug_log_handler(request_id), \
|
|
395
|
-
override_request_env_and_config(
|
|
497
|
+
override_request_env_and_config(
|
|
498
|
+
request_body, request_id, request_name), \
|
|
396
499
|
tempstore.tempdir():
|
|
397
500
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
398
501
|
config = skypilot_config.to_dict()
|
|
399
502
|
logger.debug(f'request config: \n'
|
|
400
503
|
f'{yaml_utils.dump_yaml_str(dict(config))}')
|
|
401
|
-
|
|
402
|
-
|
|
504
|
+
(metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
|
|
505
|
+
labels(request=request_name, pid=pid).inc())
|
|
506
|
+
with metrics_utils.time_it(name=request_name,
|
|
507
|
+
group='request_execution'):
|
|
403
508
|
return_value = func(**request_body.to_kwargs())
|
|
404
509
|
f.flush()
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
510
|
+
except KeyboardInterrupt:
|
|
511
|
+
logger.info(f'Request {request_id} cancelled by user')
|
|
512
|
+
# Kill all children processes related to this request.
|
|
513
|
+
# Each executor handles a single request, so we can safely kill all
|
|
514
|
+
# children processes related to this request.
|
|
515
|
+
# This is required as python does not pass the KeyboardInterrupt to the
|
|
516
|
+
# threads that are not the main thread.
|
|
517
|
+
subprocess_utils.kill_children_processes()
|
|
518
|
+
return
|
|
519
|
+
except exceptions.ExecutionRetryableError as e:
|
|
520
|
+
logger.error(e)
|
|
521
|
+
logger.info(e.hint)
|
|
522
|
+
with api_requests.update_request(request_id) as request_task:
|
|
523
|
+
assert request_task is not None, request_id
|
|
524
|
+
# Retried request will undergo rescheduling and a new execution,
|
|
525
|
+
# clear the pid of the request.
|
|
526
|
+
request_task.pid = None
|
|
527
|
+
# Yield control to the scheduler for uniform handling of retries.
|
|
528
|
+
_restore_output()
|
|
529
|
+
raise
|
|
530
|
+
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
|
531
|
+
api_requests.set_request_failed(request_id, e)
|
|
532
|
+
# Manually reset the original stdout and stderr file descriptors early
|
|
533
|
+
# so that the "Request xxxx failed due to ..." log message will be
|
|
534
|
+
# written to the original stdout and stderr file descriptors.
|
|
535
|
+
_restore_output()
|
|
536
|
+
logger.info(f'Request {request_id} failed due to '
|
|
537
|
+
f'{common_utils.format_exception(e)}')
|
|
538
|
+
return
|
|
539
|
+
else:
|
|
540
|
+
api_requests.set_request_succeeded(
|
|
541
|
+
request_id, return_value if not ignore_return_value else None)
|
|
542
|
+
# Manually reset the original stdout and stderr file descriptors early
|
|
543
|
+
# so that the "Request xxxx finished" log message will be
|
|
544
|
+
# written to the original stdout and stderr file descriptors.
|
|
545
|
+
_restore_output()
|
|
546
|
+
logger.info(f'Request {request_id} finished')
|
|
547
|
+
finally:
|
|
548
|
+
_restore_output()
|
|
549
|
+
try:
|
|
550
|
+
# Capture the peak RSS before GC.
|
|
551
|
+
peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
|
|
552
|
+
# Clear request level cache to release all memory used by the
|
|
553
|
+
# request.
|
|
554
|
+
annotations.clear_request_level_cache()
|
|
555
|
+
with metrics_utils.time_it(name='release_memory', group='internal'):
|
|
556
|
+
common_utils.release_memory()
|
|
557
|
+
if request_name is not None:
|
|
558
|
+
_record_memory_metrics(request_name, proc, rss_begin, peak_rss)
|
|
559
|
+
except Exception as e: # pylint: disable=broad-except
|
|
560
|
+
logger.error(f'Failed to record memory metrics: '
|
|
561
|
+
f'{common_utils.format_exception(e)}')
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
_first_request = True
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def _record_memory_metrics(request_name: str, proc: psutil.Process,
|
|
568
|
+
rss_begin: int, peak_rss: int) -> None:
|
|
569
|
+
"""Record the memory metrics for a request."""
|
|
570
|
+
# Do not record full memory delta for the first request as it
|
|
571
|
+
# will load the sky core modules and make the memory usage
|
|
572
|
+
# estimation inaccurate.
|
|
573
|
+
global _first_request
|
|
574
|
+
if _first_request:
|
|
575
|
+
_first_request = False
|
|
576
|
+
return
|
|
577
|
+
rss_end = proc.memory_info().rss
|
|
578
|
+
|
|
579
|
+
# Answer "how much RSS this request contributed?"
|
|
580
|
+
metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
|
|
581
|
+
name=request_name).observe(max(rss_end - rss_begin, 0))
|
|
582
|
+
# Estimate the memory usage by the request by capturing the
|
|
583
|
+
# peak memory delta during the request execution.
|
|
584
|
+
metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
|
|
585
|
+
name=request_name).observe(max(peak_rss - rss_begin, 0))
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
class CoroutineTask:
|
|
589
|
+
"""Wrapper of a background task that runs in a coroutine."""
|
|
590
|
+
|
|
591
|
+
def __init__(self, task: asyncio.Task):
|
|
592
|
+
self.task = task
|
|
593
|
+
|
|
594
|
+
async def cancel(self):
|
|
595
|
+
try:
|
|
596
|
+
self.task.cancel()
|
|
597
|
+
await self.task
|
|
598
|
+
except asyncio.CancelledError:
|
|
599
|
+
pass
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
def check_request_thread_executor_available() -> None:
|
|
603
|
+
"""Check if the request thread executor is available.
|
|
437
604
|
|
|
605
|
+
This is a best effort check to hint the client to retry other server
|
|
606
|
+
processes when there is no available thread worker in the current one. But
|
|
607
|
+
a request may pass this check and still fail to get a worker at execution
|
|
608
|
+
time due to race condition. In this case, the client will see a failed
|
|
609
|
+
request instead of retry.
|
|
438
610
|
|
|
439
|
-
|
|
611
|
+
TODO(aylei): this can be refined with a refactor of our coroutine
|
|
612
|
+
execution flow.
|
|
613
|
+
"""
|
|
614
|
+
get_request_thread_executor().check_available()
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def execute_request_in_coroutine(
|
|
618
|
+
request: api_requests.Request) -> CoroutineTask:
|
|
619
|
+
"""Execute a request in current event loop.
|
|
620
|
+
|
|
621
|
+
Args:
|
|
622
|
+
request: The request to execute.
|
|
623
|
+
|
|
624
|
+
Returns:
|
|
625
|
+
A CoroutineTask handle to operate the background task.
|
|
626
|
+
"""
|
|
627
|
+
task = asyncio.create_task(_execute_request_coroutine(request))
|
|
628
|
+
return CoroutineTask(task)
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def _execute_with_config_override(func: Callable,
|
|
632
|
+
request_body: payloads.RequestBody,
|
|
633
|
+
request_id: str, request_name: str,
|
|
634
|
+
**kwargs) -> Any:
|
|
635
|
+
"""Execute a function with env and config override inside a thread."""
|
|
636
|
+
# Override the environment and config within this thread's context,
|
|
637
|
+
# which gets copied when we call to_thread.
|
|
638
|
+
with override_request_env_and_config(request_body, request_id,
|
|
639
|
+
request_name):
|
|
640
|
+
return func(**kwargs)
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
async def _execute_request_coroutine(request: api_requests.Request):
|
|
440
644
|
"""Execute a request in current event loop.
|
|
441
645
|
|
|
442
646
|
Similar to _request_execution_wrapper, but executed as coroutine in current
|
|
@@ -449,39 +653,43 @@ async def execute_request_coroutine(request: api_requests.Request):
|
|
|
449
653
|
logger.info(f'Executing request {request.request_id} in coroutine')
|
|
450
654
|
func = request.entrypoint
|
|
451
655
|
request_body = request.request_body
|
|
452
|
-
|
|
453
|
-
|
|
656
|
+
await api_requests.update_status_async(request.request_id,
|
|
657
|
+
api_requests.RequestStatus.RUNNING)
|
|
454
658
|
# Redirect stdout and stderr to the request log path.
|
|
455
659
|
original_output = ctx.redirect_log(request.log_path)
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
660
|
+
try:
|
|
661
|
+
fut: asyncio.Future = context_utils.to_thread_with_executor(
|
|
662
|
+
get_request_thread_executor(), _execute_with_config_override, func,
|
|
663
|
+
request_body, request.request_id, request.name,
|
|
664
|
+
**request_body.to_kwargs())
|
|
665
|
+
except Exception as e: # pylint: disable=broad-except
|
|
666
|
+
ctx.redirect_log(original_output)
|
|
667
|
+
await api_requests.set_request_failed_async(request.request_id, e)
|
|
668
|
+
logger.error(f'Failed to run request {request.request_id} due to '
|
|
669
|
+
f'{common_utils.format_exception(e)}')
|
|
670
|
+
return
|
|
464
671
|
|
|
465
672
|
async def poll_task(request_id: str) -> bool:
|
|
466
|
-
|
|
467
|
-
if
|
|
673
|
+
req_status = await api_requests.get_request_status_async(request_id)
|
|
674
|
+
if req_status is None:
|
|
468
675
|
raise RuntimeError('Request not found')
|
|
469
676
|
|
|
470
|
-
if
|
|
677
|
+
if req_status.status == api_requests.RequestStatus.CANCELLED:
|
|
471
678
|
ctx.cancel()
|
|
472
679
|
return True
|
|
473
680
|
|
|
474
681
|
if fut.done():
|
|
475
682
|
try:
|
|
476
683
|
result = await fut
|
|
477
|
-
api_requests.
|
|
684
|
+
await api_requests.set_request_succeeded_async(
|
|
685
|
+
request_id, result)
|
|
478
686
|
except asyncio.CancelledError:
|
|
479
687
|
# The task is cancelled by ctx.cancel(), where the status
|
|
480
688
|
# should already be set to CANCELLED.
|
|
481
689
|
pass
|
|
482
690
|
except Exception as e: # pylint: disable=broad-except
|
|
483
691
|
ctx.redirect_log(original_output)
|
|
484
|
-
api_requests.
|
|
692
|
+
await api_requests.set_request_failed_async(request_id, e)
|
|
485
693
|
logger.error(f'Request {request_id} failed due to '
|
|
486
694
|
f'{common_utils.format_exception(e)}')
|
|
487
695
|
return True
|
|
@@ -496,22 +704,25 @@ async def execute_request_coroutine(request: api_requests.Request):
|
|
|
496
704
|
except asyncio.CancelledError:
|
|
497
705
|
# Current coroutine is cancelled due to client disconnect, set the
|
|
498
706
|
# request status for consistency.
|
|
499
|
-
api_requests.
|
|
707
|
+
await api_requests.set_request_cancelled_async(request.request_id)
|
|
500
708
|
pass
|
|
501
709
|
# pylint: disable=broad-except
|
|
502
710
|
except (Exception, KeyboardInterrupt, SystemExit) as e:
|
|
503
711
|
# Handle any other error
|
|
504
712
|
ctx.redirect_log(original_output)
|
|
505
|
-
|
|
506
|
-
api_requests.set_request_failed(request.request_id, e)
|
|
713
|
+
await api_requests.set_request_failed_async(request.request_id, e)
|
|
507
714
|
logger.error(f'Request {request.request_id} interrupted due to '
|
|
508
715
|
f'unhandled exception: {common_utils.format_exception(e)}')
|
|
509
716
|
raise
|
|
717
|
+
finally:
|
|
718
|
+
# Always cancel the context to kill potentially running background
|
|
719
|
+
# routine.
|
|
720
|
+
ctx.cancel()
|
|
510
721
|
|
|
511
722
|
|
|
512
|
-
def
|
|
723
|
+
async def prepare_request_async(
|
|
513
724
|
request_id: str,
|
|
514
|
-
request_name:
|
|
725
|
+
request_name: request_names.RequestName,
|
|
515
726
|
request_body: payloads.RequestBody,
|
|
516
727
|
func: Callable[P, Any],
|
|
517
728
|
request_cluster_name: Optional[str] = None,
|
|
@@ -535,7 +746,7 @@ def prepare_request(
|
|
|
535
746
|
user_id=user_id,
|
|
536
747
|
cluster_name=request_cluster_name)
|
|
537
748
|
|
|
538
|
-
if not api_requests.
|
|
749
|
+
if not await api_requests.create_if_not_exists_async(request):
|
|
539
750
|
raise exceptions.RequestAlreadyExistsError(
|
|
540
751
|
f'Request {request_id} already exists.')
|
|
541
752
|
|
|
@@ -543,17 +754,18 @@ def prepare_request(
|
|
|
543
754
|
return request
|
|
544
755
|
|
|
545
756
|
|
|
546
|
-
def
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
757
|
+
async def schedule_request_async(request_id: str,
|
|
758
|
+
request_name: request_names.RequestName,
|
|
759
|
+
request_body: payloads.RequestBody,
|
|
760
|
+
func: Callable[P, Any],
|
|
761
|
+
request_cluster_name: Optional[str] = None,
|
|
762
|
+
ignore_return_value: bool = False,
|
|
763
|
+
schedule_type: api_requests.ScheduleType = (
|
|
764
|
+
api_requests.ScheduleType.LONG),
|
|
765
|
+
is_skypilot_system: bool = False,
|
|
766
|
+
precondition: Optional[
|
|
767
|
+
preconditions.Precondition] = None,
|
|
768
|
+
retryable: bool = False) -> None:
|
|
557
769
|
"""Enqueue a request to the request queue.
|
|
558
770
|
|
|
559
771
|
Args:
|
|
@@ -574,13 +786,37 @@ def schedule_request(request_id: str,
|
|
|
574
786
|
The precondition is waited asynchronously and does not block the
|
|
575
787
|
caller.
|
|
576
788
|
"""
|
|
577
|
-
|
|
578
|
-
|
|
789
|
+
request_task = await prepare_request_async(request_id, request_name,
|
|
790
|
+
request_body, func,
|
|
791
|
+
request_cluster_name,
|
|
792
|
+
schedule_type,
|
|
793
|
+
is_skypilot_system)
|
|
794
|
+
schedule_prepared_request(request_task, ignore_return_value, precondition,
|
|
795
|
+
retryable)
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
def schedule_prepared_request(request_task: api_requests.Request,
|
|
799
|
+
ignore_return_value: bool = False,
|
|
800
|
+
precondition: Optional[
|
|
801
|
+
preconditions.Precondition] = None,
|
|
802
|
+
retryable: bool = False) -> None:
|
|
803
|
+
"""Enqueue a request to the request queue.
|
|
804
|
+
|
|
805
|
+
Args:
|
|
806
|
+
request_task: The prepared request task to schedule.
|
|
807
|
+
ignore_return_value: If True, the return value of the function will be
|
|
808
|
+
ignored.
|
|
809
|
+
precondition: If a precondition is provided, the request will only be
|
|
810
|
+
scheduled for execution when the precondition is met (returns True).
|
|
811
|
+
The precondition is waited asynchronously and does not block the
|
|
812
|
+
caller.
|
|
813
|
+
retryable: Whether the request should be retried if it fails.
|
|
814
|
+
"""
|
|
579
815
|
|
|
580
816
|
def enqueue():
|
|
581
|
-
input_tuple = (request_id, ignore_return_value, retryable)
|
|
582
|
-
logger.info(f'Queuing request: {request_id}')
|
|
583
|
-
_get_queue(schedule_type).put(input_tuple)
|
|
817
|
+
input_tuple = (request_task.request_id, ignore_return_value, retryable)
|
|
818
|
+
logger.info(f'Queuing request: {request_task.request_id}')
|
|
819
|
+
_get_queue(request_task.schedule_type).put(input_tuple)
|
|
584
820
|
|
|
585
821
|
if precondition is not None:
|
|
586
822
|
# Wait async to avoid blocking caller.
|