skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/server/common.py
CHANGED

```diff
@@ -17,7 +17,6 @@ import time
 import typing
 from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
                     Tuple, TypeVar, Union)
-from urllib import parse
 import uuid
 
 import cachetools
@@ -342,18 +341,7 @@ def get_server_url(host: Optional[str] = None) -> str:
 @annotations.lru_cache(scope='global')
 def get_dashboard_url(server_url: str,
                       starting_page: Optional[str] = None) -> str:
-
-    # format of https://username:password@example.com:8080/path
-    # We need to remove the username and password and only
-    # return `https://example.com:8080/path`
-    parsed = parse.urlparse(server_url)
-    # Reconstruct the URL without credentials but keeping the scheme
-    dashboard_url = f'{parsed.scheme}://{parsed.hostname}'
-    if parsed.port:
-        dashboard_url = f'{dashboard_url}:{parsed.port}'
-    if parsed.path:
-        dashboard_url = f'{dashboard_url}{parsed.path}'
-    dashboard_url = dashboard_url.rstrip('/')
+    dashboard_url = server_url.rstrip('/')
     dashboard_url = f'{dashboard_url}/dashboard'
     if starting_page:
         dashboard_url = f'{dashboard_url}/{starting_page}'
@@ -490,6 +478,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
 def handle_request_error(response: 'requests.Response') -> None:
     # Keep the original HTTPError if the response code >= 400
     response.raise_for_status()
+
     # Other status codes are not expected neither, e.g. we do not expect to
     # handle redirection here.
     if response.status_code != 200:
@@ -515,6 +504,19 @@ def get_request_id(response: 'requests.Response') -> RequestId[T]:
     return RequestId[T](request_id)
 
 
+def get_stream_request_id(
+        response: 'requests.Response') -> Optional[RequestId[T]]:
+    """This is same as the above function, but just for `sdk.stream_and_get`.
+    We do this because `/api/stream` may choose the latest request id, and
+    we need to keep track of that information. Request id in this case can
+    be None."""
+    handle_request_error(response)
+    request_id = response.headers.get(server_constants.STREAM_REQUEST_HEADER)
+    if request_id is not None:
+        return RequestId[T](request_id)
+    return None
+
+
 def _start_api_server(deploy: bool = False,
                       host: str = '127.0.0.1',
                       foreground: bool = False,
@@ -537,14 +539,27 @@ def _start_api_server(deploy: bool = False,
             'is not a local URL')
 
     # Check available memory before starting the server.
-
-
-
-
-
-
-
-
+    # Skip this warning if postgres is used, as:
+    # 1) that's almost certainly a remote API server;
+    # 2) the actual consolidation mode config is stashed in the database,
+    #    and the value of `job_utils.is_consolidation_mode` will not be
+    #    the actual value in the db, but only None as in this case, the
+    #    whole YAML config is really just `db: <URI>`.
+    if skypilot_config.get_nested(('db',), None) is None:
+        avail_mem_size_gb: float = common_utils.get_mem_size_gb()
+        # pylint: disable=import-outside-toplevel
+        import sky.jobs.utils as job_utils
+        max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                      if job_utils.is_consolidation_mode(
+                          on_api_restart=True) else
+                      server_constants.MIN_AVAIL_MEM_GB)
+        if avail_mem_size_gb <= max_memory:
+            logger.warning(
+                f'{colorama.Fore.YELLOW}Your SkyPilot API server machine '
+                f'only has {avail_mem_size_gb:.1f}GB memory available. '
+                f'At least {max_memory}GB is recommended to support higher '
+                'load with better performance.'
+                f'{colorama.Style.RESET_ALL}')
 
     args = [sys.executable, *API_SERVER_CMD.split()]
     if deploy:
@@ -762,6 +777,7 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
             os.path.expanduser(constants.API_SERVER_CREATION_LOCK_PATH)):
         # Check again if server is already running. Other processes may
         # have started the server while we were waiting for the lock.
+        get_api_server_status.cache_clear()  # type: ignore[attr-defined]
         api_server_info = get_api_server_status(endpoint)
         if api_server_info.status == ApiServerStatus.UNHEALTHY:
             _start_api_server(deploy, host, foreground, metrics,
@@ -823,7 +839,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
     for task_config in task_configs:
         if task_config is None:
             continue
-        file_mounts_mapping = task_config.
+        file_mounts_mapping = task_config.pop('file_mounts_mapping', {})
         if not file_mounts_mapping:
             # We did not mount any files to new paths on the remote server
             # so no need to resolve filepaths.
@@ -895,12 +911,18 @@ def reload_for_new_request(client_entrypoint: Optional[str],
                            client_command: Optional[str],
                            using_remote_api_server: bool, user: 'models.User',
                            request_id: str) -> None:
-    """Reload modules, global variables, and usage message for a new request.
+    """Reload modules, global variables, and usage message for a new request.
+
+    Must be called within the request's context.
+    """
     # This should be called first to make sure the logger is up-to-date.
     sky_logging.reload_logger()
 
     # Reload the skypilot config to make sure the latest config is used.
-
+    # We don't need to grab the lock here because this function is only
+    # run once we are inside the request's context, so there shouldn't
+    # be any race conditions when reloading the config.
+    skypilot_config.reload_config()
 
     # Reset the client entrypoint and command for the usage message.
     common_utils.set_request_context(
@@ -931,6 +953,7 @@ def clear_local_api_server_database() -> None:
     db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
     for extension in ['', '-shm', '-wal']:
        try:
+            logger.debug(f'Removing database file {db_path}{extension}')
            os.remove(f'{db_path}{extension}')
        except FileNotFoundError:
            logger.debug(f'Database file {db_path}{extension} not found.')
```
sky/server/config.py
CHANGED

```diff
@@ -6,6 +6,7 @@ from typing import Optional
 
 from sky import sky_logging
 from sky.server import constants as server_constants
+from sky.server import daemons
 from sky.utils import common_utils
 
 # Constants based on profiling the peak memory usage while serving various
@@ -19,8 +20,9 @@ from sky.utils import common_utils
 # TODO(aylei): maintaining these constants is error-prone, we may need to
 # automatically tune parallelism at runtime according to system usage stats
 # in the future.
-
-
+# TODO(luca): The future is now! ^^^
+LONG_WORKER_MEM_GB = 0.4
+SHORT_WORKER_MEM_GB = 0.3
 # To control the number of long workers.
 _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
 # Limit the number of long workers of local API server, since local server is
@@ -35,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
 _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
 # Minimal number of long workers to ensure responsiveness.
 _MIN_LONG_WORKERS = 1
-# Minimal number of short workers
-
-_MIN_SHORT_WORKERS = 2
+# Minimal number of idle short workers to ensure responsiveness.
+_MIN_IDLE_SHORT_WORKERS = 1
 
 # Default number of burstable workers for local API server. A heuristic number
 # that is large enough for most local cases.
@@ -74,9 +75,11 @@ class ServerConfig:
     queue_backend: QueueBackend
 
 
-def compute_server_config(
-
-
+def compute_server_config(
+        deploy: bool,
+        max_db_connections: Optional[int] = None,
+        quiet: bool = False,
+        reserved_memory_mb: Optional[float] = None) -> ServerConfig:
     """Compute the server config based on environment.
 
     We have different assumptions for the resources in different deployment
@@ -110,7 +113,11 @@ def compute_server_config(deploy: bool,
     process after API server was introduced.
     """
     cpu_count = common_utils.get_cpu_count()
+    logger.debug(f'CPU count: {cpu_count}')
     mem_size_gb = common_utils.get_mem_size_gb()
+    if reserved_memory_mb is not None:
+        mem_size_gb -= (reserved_memory_mb / 1024)
+    logger.debug(f'Memory size: {mem_size_gb}GB')
     max_parallel_for_long = _max_long_worker_parallism(cpu_count,
                                                        mem_size_gb,
                                                        local=not deploy)
@@ -140,7 +147,12 @@ def compute_server_config(deploy: bool,
         burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
     # Runs in low resource mode if the available memory is less than
     # server_constants.MIN_AVAIL_MEM_GB.
-
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    if not deploy and mem_size_gb < max_memory:
         # Permanent worker process may have significant memory consumption
         # (~350MB per worker) after running commands like `sky check`, so we
         # don't start any permanent workers in low resource local mode. This
@@ -151,25 +163,29 @@ def compute_server_config(deploy: bool,
         # permanently because it never exits.
         max_parallel_for_long = 0
         max_parallel_for_short = 0
-
-
-
-
+        if not quiet:
+            logger.warning(
+                'SkyPilot API server will run in low resource mode because '
+                'the available memory is less than '
+                f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
     elif max_db_connections is not None:
         if max_parallel_all_workers > max_db_connections:
-
-
-
-
-
+            if not quiet:
+                logger.warning(
+                    f'Max parallel all workers ({max_parallel_all_workers}) '
+                    'is greater than max db connections '
+                    f'({max_db_connections}). Increase the number of max db '
+                    f'connections to at least {max_parallel_all_workers} for '
+                    'optimal performance.')
     else:
         num_db_connections_per_worker = 1
 
-
-
-
-
-
+    if not quiet:
+        logger.info(
+            f'SkyPilot API server will start {num_server_workers} server '
+            f'processes with {max_parallel_for_long} background workers for '
+            f'long requests and will allow at max {max_parallel_for_short} '
+            'short requests in parallel.')
     return ServerConfig(
         num_server_workers=num_server_workers,
         queue_backend=queue_backend,
@@ -190,10 +206,15 @@ def _max_long_worker_parallism(cpu_count: int,
                                local=False) -> int:
     """Max parallelism for long workers."""
     # Reserve min available memory to avoid OOM.
-
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    available_mem = max(0, mem_size_gb - max_memory)
     cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
     mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
-
+                                 LONG_WORKER_MEM_GB)
     n = max(_MIN_LONG_WORKERS,
             min(cpu_based_max_parallel, mem_based_max_parallel))
     if local:
@@ -201,12 +222,25 @@ def _max_long_worker_parallism(cpu_count: int,
         return n
 
 
+def _get_min_short_workers() -> int:
+    """Min number of short workers."""
+    daemon_count = 0
+    for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
+        if not daemon.should_skip():
+            daemon_count += 1
+    return _MIN_IDLE_SHORT_WORKERS + daemon_count
+
+
 def _max_short_worker_parallism(mem_size_gb: float,
                                 long_worker_parallism: int) -> int:
     """Max parallelism for short workers."""
     # Reserve memory for long workers and min available memory.
-
-
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
     available_mem = max(0, mem_size_gb - reserved_mem)
-    n = max(
+    n = max(_get_min_short_workers(), int(available_mem / SHORT_WORKER_MEM_GB))
     return n
```
sky/server/constants.py
CHANGED

```diff
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION =
+API_VERSION = 25
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
@@ -34,6 +34,7 @@ VERSION_HEADER = 'X-SkyPilot-Version'
 REQUEST_NAME_PREFIX = 'sky.'
 # The memory (GB) that SkyPilot tries to not use to prevent OOM.
 MIN_AVAIL_MEM_GB = 2
+MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE = 4
 # Default encoder/decoder handler name.
 DEFAULT_HANDLER_NAME = 'default'
 # The path to the API request database.
@@ -60,3 +61,10 @@ DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
 
 # The interval (seconds) for the event to be restarted in the background.
 DAEMON_RESTART_INTERVAL_SECONDS = 20
+
+# Cookie header for stream request id.
+STREAM_REQUEST_HEADER = 'X-SkyPilot-Stream-Request-ID'
+
+# Valid empty values for pickled fields (base64-encoded pickled None)
+# base64.b64encode(pickle.dumps(None)).decode('utf-8')
+EMPTY_PICKLED_VALUE = 'gAROLg=='
```
sky/server/daemons.py
CHANGED

```diff
@@ -1,18 +1,30 @@
 """Internal server daemons that run in the background."""
+import atexit
 import dataclasses
 import os
 import time
+import typing
 from typing import Callable
 
 from sky import sky_logging
 from sky import skypilot_config
+from sky.adaptors import common as adaptors_common
 from sky.server import constants as server_constants
+from sky.server.requests import request_names
+from sky.skylet import constants
 from sky.utils import annotations
-from sky.utils import
+from sky.utils import common_utils
 from sky.utils import env_options
+from sky.utils import locks
+from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
 
+if typing.TYPE_CHECKING:
+    import pathlib
+else:
+    pathlib = adaptors_common.LazyImport('pathlib')
+
 logger = sky_logging.init_logger(__name__)
 
 
@@ -25,7 +37,7 @@ class InternalRequestDaemon:
     """Internal daemon that runs an event in the background."""
 
     id: str
-    name:
+    name: request_names.RequestName
     event_fn: Callable[[], None]
     default_log_level: str = 'INFO'
     should_skip: Callable[[], bool] = _default_should_skip
@@ -37,9 +49,11 @@ class InternalRequestDaemon:
         try:
             # Refresh config within the while loop.
             # Since this is a long running daemon,
-            #
+            # reload_for_new_request()
            # is not called in between the event runs.
-
+            # We don't need to grab the lock here because each of the daemons
+            # run in their own process and thus have their own request context.
+            skypilot_config.reload_config()
             # Get the configured log level for the daemon inside the event loop
             # in case the log level changes after the API server is started.
             level_str = skypilot_config.get_nested(
@@ -69,10 +83,6 @@ class InternalRequestDaemon:
                 sky_logging.reload_logger()
                 level = self.refresh_log_level()
                 self.event_fn()
-                # Clear request level cache after each run to avoid
-                # using too much memory.
-                annotations.clear_request_level_cache()
-                timeline.save_timeline()
             except Exception:  # pylint: disable=broad-except
                 # It is OK to fail to run the event, as the event is not
                 # critical, but we should log the error.
@@ -82,18 +92,28 @@ class InternalRequestDaemon:
                     f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
                     'seconds...')
                 time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
+            finally:
+                # Clear request level cache after each run to avoid
+                # using too much memory.
+                annotations.clear_request_level_cache()
+                timeline.save_timeline()
+                # Kill all children processes related to this request.
+                # Each executor handles a single request, so we can safely
+                # kill all children processes related to this request.
+                subprocess_utils.kill_children_processes()
+                common_utils.release_memory()
 
 
 def refresh_cluster_status_event():
     """Periodically refresh the cluster status."""
     # pylint: disable=import-outside-toplevel
-    from sky import
+    from sky.backends import backend_utils
 
     logger.info('=== Refreshing cluster status ===')
     # This periodically refresh will hold the lock for the cluster being
     # refreshed, but it is OK because other operations will just wait for
     # the lock and get the just refreshed status without refreshing again.
-
+    backend_utils.refresh_cluster_records()
     logger.info('Status refreshed. Sleeping '
                 f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
                 ' seconds for the next refresh...\n')
@@ -117,25 +137,75 @@ def refresh_volume_status_event():
         time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
 
 
+_managed_job_consolidation_mode_lock = None
+
+
+# Attempt to gracefully release the lock when the process exits.
+# If this fails, it's okay, the lock will be released when the process dies.
+def _release_managed_job_consolidation_mode_lock() -> None:
+    global _managed_job_consolidation_mode_lock
+    if _managed_job_consolidation_mode_lock is not None:
+        _managed_job_consolidation_mode_lock.release()
+        _managed_job_consolidation_mode_lock = None
+
+
+atexit.register(_release_managed_job_consolidation_mode_lock)
+
+
 def managed_job_status_refresh_event():
     """Refresh the managed job status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
+    from sky.jobs import constants as managed_job_constants
     from sky.jobs import utils as managed_job_utils
-    from sky.utils import controller_utils
 
-
-
-
-
+    global _managed_job_consolidation_mode_lock
+    if _managed_job_consolidation_mode_lock is None:
+        _managed_job_consolidation_mode_lock = locks.get_lock(
+            managed_job_constants.CONSOLIDATION_MODE_LOCK_ID)
+
+    # Touch the signal file here to avoid conflict with
+    # update_managed_jobs_statuses. Although we run
+    # ha_recovery_for_consolidation_mode before checking the job statuses
+    # (events.ManagedJobEvent), update_managed_jobs_statuses is also called in
+    # cancel_jobs_by_id.
+    # We also need to make sure that new controllers are not started until we
+    # acquire the consolidation mode lock, since if we have controllers on both
+    # the new and old API server during a rolling update, calling
+    # update_managed_jobs_statuses on the old API server could lead to
+    # FAILED_CONTROLLER.
+    signal_file = pathlib.Path(
+        constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser()
+    try:
+        signal_file.touch()
+
+        # Make sure the lock is acquired for this process before proceeding to
+        # do recovery. This will block if another API server is still running,
+        # but should proceed once it is terminated and releases the lock.
+        if not _managed_job_consolidation_mode_lock.is_locked():
+            logger.info('Acquiring the consolidation mode lock: '
+                        f'{_managed_job_consolidation_mode_lock}')
+            _managed_job_consolidation_mode_lock.acquire()
+            logger.info('Lock acquired!')
+        # We don't explicitly release the lock until the process exits.
+        # Even if _release_managed_job_consolidation_mode_lock is not called,
+        # the lock should be released when the process dies (either due to the
+        # advisory file lock being released or the postgres session dying).
+
+        # We run the recovery logic before checking the job statuses as those
+        # two are conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for
+        # details.
         managed_job_utils.ha_recovery_for_consolidation_mode()
+    finally:
+        # Now, we should be sure that this is the only API server, we have
+        # started the new controllers and unclaimed all the jobs, and we are
+        # ready to update the job statuses.
+        signal_file.unlink()
 
     # After recovery, we start the event loop.
     from sky.skylet import events
     refresh_event = events.ManagedJobEvent()
-    scheduling_event = events.ManagedJobSchedulingEvent()
     logger.info('=== Running managed job event ===')
     refresh_event.run()
-    scheduling_event.run()
     time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
 
 
@@ -150,14 +220,10 @@ def _serve_status_refresh_event(pool: bool):
     """Refresh the sky serve status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
     from sky.serve import serve_utils
-    from sky.utils import controller_utils
 
     # We run the recovery logic before starting the event loop as those two are
     # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-
-    if controller_utils.high_availability_specified(
-            controller.value.cluster_name):
-        serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
+    serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
 
     # After recovery, we start the event loop.
     from sky.skylet import events
@@ -196,26 +262,31 @@ INTERNAL_REQUEST_DAEMONS = [
     # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
     # set to updated status automatically, without showing users the hint of
     # cluster being stopped or down when `sky status -r` is called.
-    InternalRequestDaemon(
-
-
-
+    InternalRequestDaemon(
+        id='skypilot-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
+        event_fn=refresh_cluster_status_event,
+        default_log_level='DEBUG'),
     # Volume status refresh daemon to update the volume status periodically.
-    InternalRequestDaemon(
-
-
+    InternalRequestDaemon(
+        id='skypilot-volume-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_VOLUME_REFRESH,
+        event_fn=refresh_volume_status_event),
     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                          name=
+                          name=request_names.RequestName.
+                          REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH,
                           event_fn=managed_job_status_refresh_event,
                           should_skip=should_skip_managed_job_status_refresh),
-    InternalRequestDaemon(
-
-
-
-
-
-
-
+    InternalRequestDaemon(
+        id='sky-serve-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH,
+        event_fn=sky_serve_status_refresh_event,
+        should_skip=should_skip_sky_serve_status_refresh),
+    InternalRequestDaemon(
+        id='pool-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_POOL_STATUS_REFRESH,
+        event_fn=pool_status_refresh_event,
+        should_skip=should_skip_pool_status_refresh),
 ]
 
 
```
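The consolidation-mode lock in `daemons.py` follows an acquire-once, release-at-exit pattern: a module-level lock handle, an `atexit` hook for graceful release, and a blocking `acquire()` that serializes old and new API servers during a rolling update. A minimal self-contained sketch of the same pattern (`DemoLock` is a stand-in for SkyPilot's `sky.utils.locks` objects; every name here is illustrative):

```python
# Acquire-once, release-at-exit lock pattern, sketched with stdlib pieces.
import atexit
import threading


class DemoLock:
    """Toy lock; the real one is a file or database advisory lock."""

    def __init__(self) -> None:
        self._lock = threading.Lock()

    def is_locked(self) -> bool:
        return self._lock.locked()

    def acquire(self) -> None:
        self._lock.acquire()  # blocks until the previous holder releases

    def release(self) -> None:
        if self._lock.locked():
            self._lock.release()


_lock = None


def _release_lock_at_exit() -> None:
    # Best-effort graceful release; if this never runs, the OS (or the
    # database session) releases the real lock when the process dies.
    global _lock
    if _lock is not None:
        _lock.release()
        _lock = None


atexit.register(_release_lock_at_exit)


def daemon_event() -> None:
    global _lock
    if _lock is None:
        _lock = DemoLock()
    if not _lock.is_locked():
        _lock.acquire()  # serialize with any still-running old server
    # ... safe to run recovery and status refresh from here on ...


daemon_event()
```

Holding the lock for the life of the process, rather than per event run, is what guarantees that only one API server updates managed-job statuses at a time.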