skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/server/requests/executor.py
CHANGED
|
@@ -31,6 +31,7 @@ import time
|
|
|
31
31
|
import typing
|
|
32
32
|
from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
|
|
33
33
|
|
|
34
|
+
import psutil
|
|
34
35
|
import setproctitle
|
|
35
36
|
|
|
36
37
|
from sky import exceptions
|
|
@@ -38,14 +39,18 @@ from sky import global_user_state
|
|
|
38
39
|
from sky import models
|
|
39
40
|
from sky import sky_logging
|
|
40
41
|
from sky import skypilot_config
|
|
42
|
+
from sky.metrics import utils as metrics_utils
|
|
41
43
|
from sky.server import common as server_common
|
|
42
44
|
from sky.server import config as server_config
|
|
43
45
|
from sky.server import constants as server_constants
|
|
44
46
|
from sky.server import metrics as metrics_lib
|
|
47
|
+
from sky.server import plugins
|
|
45
48
|
from sky.server.requests import payloads
|
|
46
49
|
from sky.server.requests import preconditions
|
|
47
50
|
from sky.server.requests import process
|
|
51
|
+
from sky.server.requests import request_names
|
|
48
52
|
from sky.server.requests import requests as api_requests
|
|
53
|
+
from sky.server.requests import threads
|
|
49
54
|
from sky.server.requests.queues import local_queue
|
|
50
55
|
from sky.server.requests.queues import mp_queue
|
|
51
56
|
from sky.skylet import constants
|
|
@@ -79,6 +84,31 @@ logger = sky_logging.init_logger(__name__)
|
|
|
79
84
|
# platforms, including macOS.
|
|
80
85
|
multiprocessing.set_start_method('spawn', force=True)
|
|
81
86
|
|
|
87
|
+
# An upper limit on request execution threads per server process that is
|
|
88
|
+
# unlikely to be reached, allowing higher concurrency while still preventing
|
|
89
|
+
# the server process from becoming overloaded.
|
|
90
|
+
_REQUEST_THREADS_LIMIT = 128
|
|
91
|
+
|
|
92
|
+
_REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
|
|
93
|
+
# A dedicated thread pool executor for synced requests execution in coroutine to
|
|
94
|
+
# avoid:
|
|
95
|
+
# 1. blocking the event loop;
|
|
96
|
+
# 2. exhausting the default thread pool executor of event loop;
|
|
97
|
+
_REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
|
|
101
|
+
"""Lazy init and return the request thread executor for current process."""
|
|
102
|
+
global _REQUEST_THREAD_EXECUTOR
|
|
103
|
+
if _REQUEST_THREAD_EXECUTOR is not None:
|
|
104
|
+
return _REQUEST_THREAD_EXECUTOR
|
|
105
|
+
with _REQUEST_THREAD_EXECUTOR_LOCK:
|
|
106
|
+
if _REQUEST_THREAD_EXECUTOR is None:
|
|
107
|
+
_REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
|
|
108
|
+
name='request_thread_executor',
|
|
109
|
+
max_workers=_REQUEST_THREADS_LIMIT)
|
|
110
|
+
return _REQUEST_THREAD_EXECUTOR
|
|
111
|
+
|
|
82
112
|
|
|
83
113
|
class RequestQueue:
|
|
84
114
|
"""The queue for the requests, either redis or multiprocessing.
|
|
@@ -130,6 +160,12 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
|
|
|
130
160
|
def executor_initializer(proc_group: str):
|
|
131
161
|
setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
|
|
132
162
|
f'{multiprocessing.current_process().pid}')
|
|
163
|
+
# Load plugins for executor process.
|
|
164
|
+
plugins.load_plugins(plugins.ExtensionContext())
|
|
165
|
+
# Executor never stops, unless the whole process is killed.
|
|
166
|
+
threading.Thread(target=metrics_lib.process_monitor,
|
|
167
|
+
args=(f'worker:{proc_group}', threading.Event()),
|
|
168
|
+
daemon=True).start()
|
|
133
169
|
|
|
134
170
|
|
|
135
171
|
class RequestWorker:
|
|
@@ -182,10 +218,11 @@ class RequestWorker:
|
|
|
182
218
|
time.sleep(0.1)
|
|
183
219
|
return
|
|
184
220
|
request_id, ignore_return_value, _ = request_element
|
|
185
|
-
request = api_requests.get_request(request_id)
|
|
221
|
+
request = api_requests.get_request(request_id, fields=['status'])
|
|
186
222
|
assert request is not None, f'Request with ID {request_id} is None'
|
|
187
223
|
if request.status == api_requests.RequestStatus.CANCELLED:
|
|
188
224
|
return
|
|
225
|
+
del request
|
|
189
226
|
logger.info(f'[{self}] Submitting request: {request_id}')
|
|
190
227
|
# Start additional process to run the request, so that it can be
|
|
191
228
|
# cancelled when requested by a user.
|
|
@@ -196,6 +233,12 @@ class RequestWorker:
|
|
|
196
233
|
fut = executor.submit_until_success(
|
|
197
234
|
_request_execution_wrapper, request_id, ignore_return_value,
|
|
198
235
|
self.num_db_connections_per_worker)
|
|
236
|
+
# Decrement the free executor count when a request starts
|
|
237
|
+
if metrics_utils.METRICS_ENABLED:
|
|
238
|
+
if self.schedule_type == api_requests.ScheduleType.LONG:
|
|
239
|
+
metrics_utils.SKY_APISERVER_LONG_EXECUTORS.dec()
|
|
240
|
+
elif self.schedule_type == api_requests.ScheduleType.SHORT:
|
|
241
|
+
metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.dec()
|
|
199
242
|
# Monitor the result of the request execution.
|
|
200
243
|
threading.Thread(target=self.handle_task_result,
|
|
201
244
|
args=(fut, request_element),
|
|
@@ -230,9 +273,23 @@ class RequestWorker:
|
|
|
230
273
|
queue.put(request_element)
|
|
231
274
|
except exceptions.ExecutionRetryableError as e:
|
|
232
275
|
time.sleep(e.retry_wait_seconds)
|
|
276
|
+
# Reset the request status to PENDING so it can be picked up again.
|
|
277
|
+
# Assume retryable since the error is ExecutionRetryableError.
|
|
278
|
+
request_id, _, _ = request_element
|
|
279
|
+
with api_requests.update_request(request_id) as request_task:
|
|
280
|
+
assert request_task is not None, request_id
|
|
281
|
+
request_task.status = api_requests.RequestStatus.PENDING
|
|
233
282
|
# Reschedule the request.
|
|
234
283
|
queue = _get_queue(self.schedule_type)
|
|
235
284
|
queue.put(request_element)
|
|
285
|
+
logger.info(f'Rescheduled request {request_id} for retry')
|
|
286
|
+
finally:
|
|
287
|
+
# Increment the free executor count when a request finishes
|
|
288
|
+
if metrics_utils.METRICS_ENABLED:
|
|
289
|
+
if self.schedule_type == api_requests.ScheduleType.LONG:
|
|
290
|
+
metrics_utils.SKY_APISERVER_LONG_EXECUTORS.inc()
|
|
291
|
+
elif self.schedule_type == api_requests.ScheduleType.SHORT:
|
|
292
|
+
metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.inc()
|
|
236
293
|
|
|
237
294
|
def run(self) -> None:
|
|
238
295
|
# Handle the SIGTERM signal to abort the executor process gracefully.
|
|
@@ -254,6 +311,16 @@ class RequestWorker:
|
|
|
254
311
|
burst_workers=self.burstable_parallelism,
|
|
255
312
|
initializer=executor_initializer,
|
|
256
313
|
initargs=(proc_group,))
|
|
314
|
+
# Initialize the appropriate gauge for the number of free executors
|
|
315
|
+
total_executors = (self.garanteed_parallelism +
|
|
316
|
+
self.burstable_parallelism)
|
|
317
|
+
if metrics_utils.METRICS_ENABLED:
|
|
318
|
+
if self.schedule_type == api_requests.ScheduleType.LONG:
|
|
319
|
+
metrics_utils.SKY_APISERVER_LONG_EXECUTORS.set(
|
|
320
|
+
total_executors)
|
|
321
|
+
elif self.schedule_type == api_requests.ScheduleType.SHORT:
|
|
322
|
+
metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.set(
|
|
323
|
+
total_executors)
|
|
257
324
|
while not self._cancel_event.is_set():
|
|
258
325
|
self.process_request(executor, queue)
|
|
259
326
|
# TODO(aylei): better to distinguish between KeyboardInterrupt and SIGTERM.
|
|
@@ -277,43 +344,56 @@ def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
|
|
|
277
344
|
|
|
278
345
|
@contextlib.contextmanager
|
|
279
346
|
def override_request_env_and_config(
|
|
280
|
-
request_body: payloads.RequestBody,
|
|
281
|
-
|
|
347
|
+
request_body: payloads.RequestBody, request_id: str,
|
|
348
|
+
request_name: str) -> Generator[None, None, None]:
|
|
282
349
|
"""Override the environment and SkyPilot config for a request."""
|
|
283
350
|
original_env = os.environ.copy()
|
|
284
|
-
# Unset SKYPILOT_DEBUG by default, to avoid the value set on the API server
|
|
285
|
-
# affecting client requests. If set on the client side, it will be
|
|
286
|
-
# overridden by the request body.
|
|
287
|
-
os.environ.pop('SKYPILOT_DEBUG', None)
|
|
288
|
-
os.environ.update(request_body.env_vars)
|
|
289
|
-
# Note: may be overridden by AuthProxyMiddleware.
|
|
290
|
-
# TODO(zhwu): we need to make the entire request a context available to the
|
|
291
|
-
# entire request execution, so that we can access info like user through
|
|
292
|
-
# the execution.
|
|
293
|
-
user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
|
|
294
|
-
name=request_body.env_vars[constants.USER_ENV_VAR])
|
|
295
|
-
global_user_state.add_or_update_user(user)
|
|
296
|
-
# Refetch the user to get the latest user info, including the created_at
|
|
297
|
-
# field.
|
|
298
|
-
user = global_user_state.get_user(user.id)
|
|
299
|
-
|
|
300
|
-
# Force color to be enabled.
|
|
301
|
-
os.environ['CLICOLOR_FORCE'] = '1'
|
|
302
|
-
server_common.reload_for_new_request(
|
|
303
|
-
client_entrypoint=request_body.entrypoint,
|
|
304
|
-
client_command=request_body.entrypoint_command,
|
|
305
|
-
using_remote_api_server=request_body.using_remote_api_server,
|
|
306
|
-
user=user,
|
|
307
|
-
request_id=request_id)
|
|
308
351
|
try:
|
|
352
|
+
# Unset SKYPILOT_DEBUG by default, to avoid the value set on the API
|
|
353
|
+
# server affecting client requests. If set on the client side, it will
|
|
354
|
+
# be overridden by the request body.
|
|
355
|
+
os.environ.pop('SKYPILOT_DEBUG', None)
|
|
356
|
+
# Remove the db connection uri from client supplied env vars, as the
|
|
357
|
+
# client should not set the db string on server side.
|
|
358
|
+
request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
|
|
359
|
+
os.environ.update(request_body.env_vars)
|
|
360
|
+
# Note: may be overridden by AuthProxyMiddleware.
|
|
361
|
+
# TODO(zhwu): we need to make the entire request a context available to
|
|
362
|
+
# the entire request execution, so that we can access info like user
|
|
363
|
+
# through the execution.
|
|
364
|
+
user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
|
|
365
|
+
name=request_body.env_vars[constants.USER_ENV_VAR])
|
|
366
|
+
_, user = global_user_state.add_or_update_user(user, return_user=True)
|
|
367
|
+
|
|
368
|
+
# Force color to be enabled.
|
|
369
|
+
os.environ['CLICOLOR_FORCE'] = '1'
|
|
370
|
+
server_common.reload_for_new_request(
|
|
371
|
+
client_entrypoint=request_body.entrypoint,
|
|
372
|
+
client_command=request_body.entrypoint_command,
|
|
373
|
+
using_remote_api_server=request_body.using_remote_api_server,
|
|
374
|
+
user=user,
|
|
375
|
+
request_id=request_id)
|
|
309
376
|
logger.debug(
|
|
310
377
|
f'override path: {request_body.override_skypilot_config_path}')
|
|
311
378
|
with skypilot_config.override_skypilot_config(
|
|
312
379
|
request_body.override_skypilot_config,
|
|
313
380
|
request_body.override_skypilot_config_path):
|
|
314
|
-
#
|
|
315
|
-
#
|
|
316
|
-
|
|
381
|
+
# Skip permission check for sky.workspaces.get request
|
|
382
|
+
# as it is used to determine which workspaces the user
|
|
383
|
+
# has access to.
|
|
384
|
+
if request_name != 'sky.workspaces.get':
|
|
385
|
+
try:
|
|
386
|
+
# Reject requests that the user does not have permission
|
|
387
|
+
# to access.
|
|
388
|
+
workspaces_core.reject_request_for_unauthorized_workspace(
|
|
389
|
+
user)
|
|
390
|
+
except exceptions.PermissionDeniedError as e:
|
|
391
|
+
logger.debug(
|
|
392
|
+
f'{request_id} permission denied to workspace: '
|
|
393
|
+
f'{skypilot_config.get_active_workspace()}: {e}')
|
|
394
|
+
raise e
|
|
395
|
+
logger.debug(
|
|
396
|
+
f'{request_id} permission granted to {request_name} request')
|
|
317
397
|
yield
|
|
318
398
|
finally:
|
|
319
399
|
# We need to call the save_timeline() since atexit will not be
|
|
@@ -327,29 +407,6 @@ def override_request_env_and_config(
|
|
|
327
407
|
os.environ.update(original_env)
|
|
328
408
|
|
|
329
409
|
|
|
330
|
-
def _redirect_output(file: TextIO) -> Tuple[int, int]:
|
|
331
|
-
"""Redirect stdout and stderr to the log file."""
|
|
332
|
-
fd = file.fileno() # Get the file descriptor from the file object
|
|
333
|
-
# Store copies of the original stdout and stderr file descriptors
|
|
334
|
-
original_stdout = os.dup(sys.stdout.fileno())
|
|
335
|
-
original_stderr = os.dup(sys.stderr.fileno())
|
|
336
|
-
|
|
337
|
-
# Copy this fd to stdout and stderr
|
|
338
|
-
os.dup2(fd, sys.stdout.fileno())
|
|
339
|
-
os.dup2(fd, sys.stderr.fileno())
|
|
340
|
-
return original_stdout, original_stderr
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
def _restore_output(original_stdout: int, original_stderr: int) -> None:
|
|
344
|
-
"""Restore stdout and stderr to their original file descriptors."""
|
|
345
|
-
os.dup2(original_stdout, sys.stdout.fileno())
|
|
346
|
-
os.dup2(original_stderr, sys.stderr.fileno())
|
|
347
|
-
|
|
348
|
-
# Close the duplicate file descriptors
|
|
349
|
-
os.close(original_stdout)
|
|
350
|
-
os.close(original_stderr)
|
|
351
|
-
|
|
352
|
-
|
|
353
410
|
def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
|
|
354
411
|
raise KeyboardInterrupt
|
|
355
412
|
|
|
@@ -367,76 +424,226 @@ def _request_execution_wrapper(request_id: str,
|
|
|
367
424
|
4. Handle the SIGTERM signal to abort the request gracefully.
|
|
368
425
|
5. Maintain the lifecycle of the temp dir used by the request.
|
|
369
426
|
"""
|
|
427
|
+
pid = multiprocessing.current_process().pid
|
|
428
|
+
proc = psutil.Process(pid)
|
|
429
|
+
rss_begin = proc.memory_info().rss
|
|
370
430
|
db_utils.set_max_connections(num_db_connections_per_worker)
|
|
371
431
|
# Handle the SIGTERM signal to abort the request processing gracefully.
|
|
372
|
-
signal.signal(
|
|
432
|
+
# Only set up signal handlers in the main thread, as signal.signal() raises
|
|
433
|
+
# ValueError if called from a non-main thread (e.g., in tests).
|
|
434
|
+
if threading.current_thread() is threading.main_thread():
|
|
435
|
+
signal.signal(signal.SIGTERM, _sigterm_handler)
|
|
373
436
|
|
|
374
|
-
pid = multiprocessing.current_process().pid
|
|
375
437
|
logger.info(f'Running request {request_id} with pid {pid}')
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
438
|
+
|
|
439
|
+
original_stdout = original_stderr = None
|
|
440
|
+
|
|
441
|
+
def _save_current_output() -> None:
|
|
442
|
+
"""Save the current stdout and stderr file descriptors."""
|
|
443
|
+
nonlocal original_stdout, original_stderr
|
|
444
|
+
original_stdout = os.dup(sys.stdout.fileno())
|
|
445
|
+
original_stderr = os.dup(sys.stderr.fileno())
|
|
446
|
+
|
|
447
|
+
def _redirect_output(file: TextIO) -> None:
|
|
448
|
+
"""Redirect stdout and stderr to the log file."""
|
|
449
|
+
# Get the file descriptor from the file object
|
|
450
|
+
fd = file.fileno()
|
|
451
|
+
# Copy this fd to stdout and stderr
|
|
452
|
+
os.dup2(fd, sys.stdout.fileno())
|
|
453
|
+
os.dup2(fd, sys.stderr.fileno())
|
|
454
|
+
|
|
455
|
+
def _restore_output() -> None:
|
|
456
|
+
"""Restore stdout and stderr to their original file descriptors."""
|
|
457
|
+
nonlocal original_stdout, original_stderr
|
|
458
|
+
if original_stdout is not None:
|
|
459
|
+
os.dup2(original_stdout, sys.stdout.fileno())
|
|
460
|
+
os.close(original_stdout)
|
|
461
|
+
original_stdout = None
|
|
462
|
+
|
|
463
|
+
if original_stderr is not None:
|
|
464
|
+
os.dup2(original_stderr, sys.stderr.fileno())
|
|
465
|
+
os.close(original_stderr)
|
|
466
|
+
original_stderr = None
|
|
467
|
+
|
|
468
|
+
request_name = None
|
|
469
|
+
try:
|
|
470
|
+
# As soon as the request is updated with the executor PID, we can
|
|
471
|
+
# receive SIGTERM from cancellation. So, we update the request inside
|
|
472
|
+
# the try block to ensure we have the KeyboardInterrupt handling.
|
|
473
|
+
with api_requests.update_request(request_id) as request_task:
|
|
474
|
+
assert request_task is not None, request_id
|
|
475
|
+
if request_task.status != api_requests.RequestStatus.PENDING:
|
|
476
|
+
logger.debug(f'Request is already {request_task.status.value}, '
|
|
477
|
+
f'skipping execution')
|
|
478
|
+
return
|
|
479
|
+
log_path = request_task.log_path
|
|
480
|
+
request_task.pid = pid
|
|
481
|
+
request_task.status = api_requests.RequestStatus.RUNNING
|
|
482
|
+
func = request_task.entrypoint
|
|
483
|
+
request_body = request_task.request_body
|
|
484
|
+
request_name = request_task.name
|
|
485
|
+
|
|
388
486
|
# Store copies of the original stdout and stderr file descriptors
|
|
389
|
-
|
|
390
|
-
#
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
487
|
+
# We do this in two steps because we should make sure to restore the
|
|
488
|
+
# original values even if we are cancelled or fail during the redirect.
|
|
489
|
+
_save_current_output()
|
|
490
|
+
|
|
491
|
+
# Append to the log file instead of overwriting it since there might be
|
|
492
|
+
# logs from previous retries.
|
|
493
|
+
with log_path.open('a', encoding='utf-8') as f:
|
|
494
|
+
# Redirect the stdout/stderr before overriding the environment and
|
|
495
|
+
# config, as there can be some logs during override that need to be
|
|
496
|
+
# captured in the log file.
|
|
497
|
+
_redirect_output(f)
|
|
498
|
+
|
|
394
499
|
with sky_logging.add_debug_log_handler(request_id), \
|
|
395
|
-
override_request_env_and_config(
|
|
500
|
+
override_request_env_and_config(
|
|
501
|
+
request_body, request_id, request_name), \
|
|
396
502
|
tempstore.tempdir():
|
|
397
503
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
398
504
|
config = skypilot_config.to_dict()
|
|
399
505
|
logger.debug(f'request config: \n'
|
|
400
506
|
f'{yaml_utils.dump_yaml_str(dict(config))}')
|
|
401
|
-
|
|
402
|
-
|
|
507
|
+
(metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
|
|
508
|
+
labels(request=request_name, pid=pid).inc())
|
|
509
|
+
with metrics_utils.time_it(name=request_name,
|
|
510
|
+
group='request_execution'):
|
|
403
511
|
return_value = func(**request_body.to_kwargs())
|
|
404
512
|
f.flush()
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
513
|
+
except KeyboardInterrupt:
|
|
514
|
+
logger.info(f'Request {request_id} cancelled by user')
|
|
515
|
+
# Kill all children processes related to this request.
|
|
516
|
+
# Each executor handles a single request, so we can safely kill all
|
|
517
|
+
# children processes related to this request.
|
|
518
|
+
# This is required as python does not pass the KeyboardInterrupt to the
|
|
519
|
+
# threads that are not main thread.
|
|
520
|
+
subprocess_utils.kill_children_processes()
|
|
521
|
+
return
|
|
522
|
+
except exceptions.ExecutionRetryableError as e:
|
|
523
|
+
logger.error(e)
|
|
524
|
+
logger.info(e.hint)
|
|
525
|
+
with api_requests.update_request(request_id) as request_task:
|
|
526
|
+
assert request_task is not None, request_id
|
|
527
|
+
# Retried request will undergo rescheduling and a new execution,
|
|
528
|
+
# clear the pid of the request.
|
|
529
|
+
request_task.pid = None
|
|
530
|
+
# Yield control to the scheduler for uniform handling of retries.
|
|
531
|
+
_restore_output()
|
|
532
|
+
raise
|
|
533
|
+
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
|
534
|
+
api_requests.set_request_failed(request_id, e)
|
|
535
|
+
# Manually reset the original stdout and stderr file descriptors early
|
|
536
|
+
# so that the "Request xxxx failed due to ..." log message will be
|
|
537
|
+
# written to the original stdout and stderr file descriptors.
|
|
538
|
+
_restore_output()
|
|
539
|
+
logger.error(f'Request {request_id} failed due to '
|
|
540
|
+
f'{common_utils.format_exception(e)}')
|
|
541
|
+
return
|
|
542
|
+
else:
|
|
543
|
+
api_requests.set_request_succeeded(
|
|
544
|
+
request_id, return_value if not ignore_return_value else None)
|
|
545
|
+
# Manually reset the original stdout and stderr file descriptors early
|
|
546
|
+
# so that the "Request xxxx finished" log message will be
|
|
547
|
+
# written to the original stdout and stderr file descriptors.
|
|
548
|
+
_restore_output()
|
|
549
|
+
logger.info(f'Request {request_id} finished')
|
|
550
|
+
finally:
|
|
551
|
+
_restore_output()
|
|
552
|
+
try:
|
|
553
|
+
# Capture the peak RSS before GC.
|
|
554
|
+
peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
|
|
555
|
+
# Clear request level cache to release all memory used by the
|
|
556
|
+
# request.
|
|
557
|
+
annotations.clear_request_level_cache()
|
|
558
|
+
with metrics_utils.time_it(name='release_memory', group='internal'):
|
|
559
|
+
common_utils.release_memory()
|
|
560
|
+
if request_name is not None:
|
|
561
|
+
_record_memory_metrics(request_name, proc, rss_begin, peak_rss)
|
|
562
|
+
except Exception as e: # pylint: disable=broad-except
|
|
563
|
+
logger.error(f'Failed to record memory metrics: '
|
|
564
|
+
f'{common_utils.format_exception(e)}')
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
# True until _record_memory_metrics() has been called once in this process.
_first_request = True


def _record_memory_metrics(request_name: str, proc: psutil.Process,
                           rss_begin: int, peak_rss: int) -> None:
    """Report the memory metrics of one request.

    The first call in a process only clears the ``_first_request`` flag and
    records nothing: the first request loads the sky core modules, which
    would make the memory usage estimation inaccurate.

    Args:
        request_name: Value used for the ``name`` label of both metrics.
        proc: Process whose current RSS is sampled.
        rss_begin: RSS (bytes) measured before the request started.
        peak_rss: Peak RSS (bytes) captured for the request.
    """
    global _first_request
    if _first_request:
        _first_request = False
        return

    current_rss = proc.memory_info().rss

    # "How much RSS did this request contribute?" Negative deltas are
    # clamped to zero.
    rss_incr_metric = metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
        name=request_name)
    rss_incr_metric.observe(max(current_rss - rss_begin, 0))

    # Memory usage estimate of the request: the peak memory delta observed
    # during its execution, likewise clamped to zero.
    usage_metric = (
        metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
            name=request_name))
    usage_metric.observe(max(peak_rss - rss_begin, 0))
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
class CoroutineTask:
    """Handle for a background task that runs as a coroutine."""

    def __init__(self, task: asyncio.Task):
        # The wrapped asyncio task; cancel() operates on it.
        self.task = task

    async def cancel(self):
        """Cancel the wrapped task and wait for it to finish.

        The ``asyncio.CancelledError`` raised while awaiting the task is
        swallowed. Any other exception raised by the task propagates to the
        caller.
        """
        background = self.task
        # Requesting cancellation does not itself raise CancelledError; only
        # the await below can.
        background.cancel()
        try:
            await background
        except asyncio.CancelledError:
            pass
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
def check_request_thread_executor_available() -> None:
    """Check whether the request thread executor is available.

    This is a best-effort check whose purpose is to hint the client to retry
    against another server process when the current one has no available
    thread worker. Because of a race condition, a request may pass this
    check and still fail to get a worker at execution time; in that case the
    client sees a failed request instead of a retry.

    TODO(aylei): this can be refined with a refactor of our coroutine
    execution flow.
    """
    executor = get_request_thread_executor()
    executor.check_available()
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
def execute_request_in_coroutine(
        request: api_requests.Request) -> CoroutineTask:
    """Start executing a request as a background task in the current loop.

    The work is scheduled with ``asyncio.create_task``, so this must be
    called while an event loop is running.

    Args:
        request: The request to execute.

    Returns:
        A CoroutineTask handle for operating on the background task.
    """
    coroutine = _execute_request_coroutine(request)
    background_task = asyncio.create_task(coroutine)
    return CoroutineTask(background_task)
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def _execute_with_config_override(func: Callable,
                                  request_body: payloads.RequestBody,
                                  request_id: str, request_name: str,
                                  **kwargs) -> Any:
    """Run ``func(**kwargs)`` inside a thread with env and config overridden.

    Args:
        func: The callable to invoke.
        request_body: Request payload passed to the override context.
        request_id: ID of the request passed to the override context.
        request_name: Name of the request passed to the override context.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns.
    """
    # The environment and config are overridden within this thread's
    # context, which gets copied when we call to_thread.
    request_overrides = override_request_env_and_config(
        request_body, request_id, request_name)
    with request_overrides:
        return func(**kwargs)
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
async def _execute_request_coroutine(request: api_requests.Request):
|
|
440
647
|
"""Execute a request in current event loop.
|
|
441
648
|
|
|
442
649
|
Similar to _request_execution_wrapper, but executed as coroutine in current
|
|
@@ -449,39 +656,43 @@ async def execute_request_coroutine(request: api_requests.Request):
|
|
|
449
656
|
logger.info(f'Executing request {request.request_id} in coroutine')
|
|
450
657
|
func = request.entrypoint
|
|
451
658
|
request_body = request.request_body
|
|
452
|
-
|
|
453
|
-
|
|
659
|
+
await api_requests.update_status_async(request.request_id,
|
|
660
|
+
api_requests.RequestStatus.RUNNING)
|
|
454
661
|
# Redirect stdout and stderr to the request log path.
|
|
455
662
|
original_output = ctx.redirect_log(request.log_path)
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
663
|
+
try:
|
|
664
|
+
fut: asyncio.Future = context_utils.to_thread_with_executor(
|
|
665
|
+
get_request_thread_executor(), _execute_with_config_override, func,
|
|
666
|
+
request_body, request.request_id, request.name,
|
|
667
|
+
**request_body.to_kwargs())
|
|
668
|
+
except Exception as e: # pylint: disable=broad-except
|
|
669
|
+
ctx.redirect_log(original_output)
|
|
670
|
+
await api_requests.set_request_failed_async(request.request_id, e)
|
|
671
|
+
logger.error(f'Failed to run request {request.request_id} due to '
|
|
672
|
+
f'{common_utils.format_exception(e)}')
|
|
673
|
+
return
|
|
464
674
|
|
|
465
675
|
async def poll_task(request_id: str) -> bool:
|
|
466
|
-
|
|
467
|
-
if
|
|
676
|
+
req_status = await api_requests.get_request_status_async(request_id)
|
|
677
|
+
if req_status is None:
|
|
468
678
|
raise RuntimeError('Request not found')
|
|
469
679
|
|
|
470
|
-
if
|
|
680
|
+
if req_status.status == api_requests.RequestStatus.CANCELLED:
|
|
471
681
|
ctx.cancel()
|
|
472
682
|
return True
|
|
473
683
|
|
|
474
684
|
if fut.done():
|
|
475
685
|
try:
|
|
476
686
|
result = await fut
|
|
477
|
-
api_requests.
|
|
687
|
+
await api_requests.set_request_succeeded_async(
|
|
688
|
+
request_id, result)
|
|
478
689
|
except asyncio.CancelledError:
|
|
479
690
|
# The task is cancelled by ctx.cancel(), where the status
|
|
480
691
|
# should already be set to CANCELLED.
|
|
481
692
|
pass
|
|
482
693
|
except Exception as e: # pylint: disable=broad-except
|
|
483
694
|
ctx.redirect_log(original_output)
|
|
484
|
-
api_requests.
|
|
695
|
+
await api_requests.set_request_failed_async(request_id, e)
|
|
485
696
|
logger.error(f'Request {request_id} failed due to '
|
|
486
697
|
f'{common_utils.format_exception(e)}')
|
|
487
698
|
return True
|
|
@@ -496,22 +707,25 @@ async def execute_request_coroutine(request: api_requests.Request):
|
|
|
496
707
|
except asyncio.CancelledError:
|
|
497
708
|
# Current coroutine is cancelled due to client disconnect, set the
|
|
498
709
|
# request status for consistency.
|
|
499
|
-
api_requests.
|
|
710
|
+
await api_requests.set_request_cancelled_async(request.request_id)
|
|
500
711
|
pass
|
|
501
712
|
# pylint: disable=broad-except
|
|
502
713
|
except (Exception, KeyboardInterrupt, SystemExit) as e:
|
|
503
714
|
# Handle any other error
|
|
504
715
|
ctx.redirect_log(original_output)
|
|
505
|
-
|
|
506
|
-
api_requests.set_request_failed(request.request_id, e)
|
|
716
|
+
await api_requests.set_request_failed_async(request.request_id, e)
|
|
507
717
|
logger.error(f'Request {request.request_id} interrupted due to '
|
|
508
718
|
f'unhandled exception: {common_utils.format_exception(e)}')
|
|
509
719
|
raise
|
|
720
|
+
finally:
|
|
721
|
+
# Always cancel the context to kill potentially running background
|
|
722
|
+
# routine.
|
|
723
|
+
ctx.cancel()
|
|
510
724
|
|
|
511
725
|
|
|
512
|
-
def
|
|
726
|
+
async def prepare_request_async(
|
|
513
727
|
request_id: str,
|
|
514
|
-
request_name:
|
|
728
|
+
request_name: request_names.RequestName,
|
|
515
729
|
request_body: payloads.RequestBody,
|
|
516
730
|
func: Callable[P, Any],
|
|
517
731
|
request_cluster_name: Optional[str] = None,
|
|
@@ -535,7 +749,7 @@ def prepare_request(
|
|
|
535
749
|
user_id=user_id,
|
|
536
750
|
cluster_name=request_cluster_name)
|
|
537
751
|
|
|
538
|
-
if not api_requests.
|
|
752
|
+
if not await api_requests.create_if_not_exists_async(request):
|
|
539
753
|
raise exceptions.RequestAlreadyExistsError(
|
|
540
754
|
f'Request {request_id} already exists.')
|
|
541
755
|
|
|
@@ -543,17 +757,18 @@ def prepare_request(
|
|
|
543
757
|
return request
|
|
544
758
|
|
|
545
759
|
|
|
546
|
-
def
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
760
|
+
async def schedule_request_async(request_id: str,
|
|
761
|
+
request_name: request_names.RequestName,
|
|
762
|
+
request_body: payloads.RequestBody,
|
|
763
|
+
func: Callable[P, Any],
|
|
764
|
+
request_cluster_name: Optional[str] = None,
|
|
765
|
+
ignore_return_value: bool = False,
|
|
766
|
+
schedule_type: api_requests.ScheduleType = (
|
|
767
|
+
api_requests.ScheduleType.LONG),
|
|
768
|
+
is_skypilot_system: bool = False,
|
|
769
|
+
precondition: Optional[
|
|
770
|
+
preconditions.Precondition] = None,
|
|
771
|
+
retryable: bool = False) -> None:
|
|
557
772
|
"""Enqueue a request to the request queue.
|
|
558
773
|
|
|
559
774
|
Args:
|
|
@@ -574,13 +789,37 @@ def schedule_request(request_id: str,
|
|
|
574
789
|
The precondition is waited asynchronously and does not block the
|
|
575
790
|
caller.
|
|
576
791
|
"""
|
|
577
|
-
|
|
578
|
-
|
|
792
|
+
request_task = await prepare_request_async(request_id, request_name,
|
|
793
|
+
request_body, func,
|
|
794
|
+
request_cluster_name,
|
|
795
|
+
schedule_type,
|
|
796
|
+
is_skypilot_system)
|
|
797
|
+
schedule_prepared_request(request_task, ignore_return_value, precondition,
|
|
798
|
+
retryable)
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
def schedule_prepared_request(request_task: api_requests.Request,
|
|
802
|
+
ignore_return_value: bool = False,
|
|
803
|
+
precondition: Optional[
|
|
804
|
+
preconditions.Precondition] = None,
|
|
805
|
+
retryable: bool = False) -> None:
|
|
806
|
+
"""Enqueue a request to the request queue
|
|
807
|
+
|
|
808
|
+
Args:
|
|
809
|
+
request_task: The prepared request task to schedule.
|
|
810
|
+
ignore_return_value: If True, the return value of the function will be
|
|
811
|
+
ignored.
|
|
812
|
+
precondition: If a precondition is provided, the request will only be
|
|
813
|
+
scheduled for execution when the precondition is met (returns True).
|
|
814
|
+
The precondition is waited asynchronously and does not block the
|
|
815
|
+
caller.
|
|
816
|
+
retryable: Whether the request should be retried if it fails.
|
|
817
|
+
"""
|
|
579
818
|
|
|
580
819
|
def enqueue():
|
|
581
|
-
input_tuple = (request_id, ignore_return_value, retryable)
|
|
582
|
-
logger.info(f'Queuing request: {request_id}')
|
|
583
|
-
_get_queue(schedule_type).put(input_tuple)
|
|
820
|
+
input_tuple = (request_task.request_id, ignore_return_value, retryable)
|
|
821
|
+
logger.info(f'Queuing request: {request_task.request_id}')
|
|
822
|
+
_get_queue(request_task.schedule_type).put(input_tuple)
|
|
584
823
|
|
|
585
824
|
if precondition is not None:
|
|
586
825
|
# Wait async to avoid blocking caller.
|