skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/controller_utils.py
CHANGED

@@ -23,10 +23,10 @@ from sky.clouds import gcp
 from sky.data import data_utils
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
-from sky.jobs import state as managed_job_state
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
+from sky.server import config as server_config
 from sky.setup_files import dependencies
 from sky.skylet import constants
 from sky.skylet import log_lib
@@ -72,7 +72,8 @@ class _ControllerSpec:
     """Spec for skypilot controllers."""
     controller_type: str
     name: str
-    cluster_name: str
+    _cluster_name_func: Callable[[], str]
+    _cluster_name_from_server: Optional[str]  # For client-side only
     in_progress_hint: Callable[[bool], str]
     decline_cancel_hint: str
     _decline_down_when_failed_to_fetch_status_hint: str
@@ -93,6 +94,24 @@ class _ControllerSpec:
         return self._check_cluster_name_hint.format(
             cluster_name=self.cluster_name)
 
+    @property
+    def cluster_name(self) -> str:
+        """The cluster name of the controller.
+
+        On the server-side, the cluster name is the actual cluster name,
+        which is read from common.(JOB|SKY_SERVE)_CONTROLLER_NAME.
+
+        On the client-side, the cluster name may not be accurate,
+        as we may not know the exact name, because we are missing
+        the server-side common.SERVER_ID. We have to wait until
+        we get the actual cluster name from the server.
+        """
+        return (self._cluster_name_from_server if self._cluster_name_from_server
+                is not None else self._cluster_name_func())
+
+    def set_cluster_name_from_server(self, cluster_name: str) -> None:
+        self._cluster_name_from_server = cluster_name
+
 
 # TODO: refactor controller class to not be an enum.
 class Controllers(enum.Enum):
@@ -102,7 +121,8 @@ class Controllers(enum.Enum):
     JOBS_CONTROLLER = _ControllerSpec(
         controller_type='jobs',
         name='managed jobs controller',
-        cluster_name=common.JOB_CONTROLLER_NAME,
+        _cluster_name_func=lambda: common.JOB_CONTROLLER_NAME,
+        _cluster_name_from_server=None,
         in_progress_hint=lambda _:
         ('* {job_info}To see all managed jobs: '
         f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
@@ -133,7 +153,8 @@ class Controllers(enum.Enum):
     SKY_SERVE_CONTROLLER = _ControllerSpec(
         controller_type='serve',
         name='serve controller',
-        cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
+        _cluster_name_func=lambda: common.SKY_SERVE_CONTROLLER_NAME,
+        _cluster_name_from_server=None,
         in_progress_hint=(
             lambda pool:
             (f'* To see detailed pool status: {colorama.Style.BRIGHT}'
@@ -166,7 +187,9 @@ class Controllers(enum.Enum):
             default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
 
     @classmethod
-    def from_name(cls, name: Optional[str]) -> Optional['Controllers']:
+    def from_name(cls,
+                  name: Optional[str],
+                  expect_exact_match: bool = True) -> Optional['Controllers']:
         """Check if the cluster name is a controller name.
 
         Returns:
@@ -187,7 +210,11 @@ class Controllers(enum.Enum):
         elif name.startswith(common.JOB_CONTROLLER_PREFIX):
             controller = cls.JOBS_CONTROLLER
             prefix = common.JOB_CONTROLLER_PREFIX
-        if controller is not None and name != controller.value.cluster_name:
+
+        if controller is not None and expect_exact_match:
+            assert name == controller.value.cluster_name, (
+                name, controller.value.cluster_name)
+        elif controller is not None and name != controller.value.cluster_name:
             # The client-side cluster_name is not accurate. Assume that `name`
             # is the actual cluster name, so need to set the controller's
             # cluster name to the input name.
@@ -201,7 +228,7 @@ class Controllers(enum.Enum):
                                     prefix)
 
             # Update the cluster name.
-            controller.value.cluster_name = name
+            controller.value.set_cluster_name_from_server(name)
         return controller
 
     @classmethod
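The `_ControllerSpec` change above swaps a static `cluster_name` field for a lazily-computed name plus an optional server-provided override, and `from_name` now patches the spec via `set_cluster_name_from_server` instead of assigning the attribute directly. A minimal sketch of the same fallback pattern, with illustrative names (`Spec`, `_name_func`, and the sample controller names) that are not taken from the package:

```python
from dataclasses import dataclass
from typing import Callable, Optional


@dataclass
class Spec:
    # Factory for the locally-derived name (may be inaccurate client-side).
    _name_func: Callable[[], str]
    # Authoritative name, once learned from the server.
    _name_from_server: Optional[str] = None

    @property
    def name(self) -> str:
        # Prefer the server-provided name; fall back to the local guess.
        if self._name_from_server is not None:
            return self._name_from_server
        return self._name_func()

    def set_name_from_server(self, name: str) -> None:
        self._name_from_server = name


spec = Spec(_name_func=lambda: 'sky-jobs-controller-local-guess')
assert spec.name == 'sky-jobs-controller-local-guess'
spec.set_name_from_server('sky-jobs-controller-abcd1234')
assert spec.name == 'sky-jobs-controller-abcd1234'
```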
@@ -228,10 +255,21 @@ def get_controller_for_pool(pool: bool) -> Controllers:
 def high_availability_specified(cluster_name: Optional[str]) -> bool:
     """Check if the controller high availability is specified in user config.
     """
-    controller = Controllers.from_name(cluster_name)
+    controller = Controllers.from_name(cluster_name, expect_exact_match=False)
     if controller is None:
         return False
 
+    if controller.value.controller_type == 'jobs':
+        # pylint: disable-next=import-outside-toplevel
+        from sky.jobs import utils as managed_job_utils
+        if managed_job_utils.is_consolidation_mode():
+            return True
+    elif controller.value.controller_type == 'serve':
+        # pylint: disable-next=import-outside-toplevel
+        from sky.serve import serve_utils
+        if serve_utils.is_consolidation_mode():
+            return True
+
     if skypilot_config.loaded():
         return skypilot_config.get_nested((controller.value.controller_type,
                                            'controller', 'high_availability'),
@@ -400,7 +438,7 @@ def check_cluster_name_not_controller(
     Returns:
       None, if the cluster name is not a controller name.
     """
-    controller = Controllers.from_name(cluster_name)
+    controller = Controllers.from_name(cluster_name, expect_exact_match=False)
     if controller is not None:
         msg = controller.value.check_cluster_name_hint
         if operation_str is not None:
@@ -495,6 +533,9 @@ def shared_controller_vars_to_fill(
         # before popping allowed_contexts. If it is not on Kubernetes,
         # we may be able to use allowed_contexts.
         local_user_config.pop('allowed_contexts', None)
+        # Remove api_server config so that the controller does not try to use
+        # a remote API server.
+        local_user_config.pop('api_server', None)
         with tempfile.NamedTemporaryFile(
                 delete=False,
                 suffix=_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX) as temp_file:
@@ -528,7 +569,15 @@
         # with a remote API server.
         constants.USING_REMOTE_API_SERVER_ENV_VAR: str(
             common_utils.get_using_remote_api_server()),
+        constants.IS_SKYPILOT_SERVE_CONTROLLER:
+            ('true'
+             if controller == Controllers.SKY_SERVE_CONTROLLER else 'false'),
     })
+    override_concurrent_launches = os.environ.get(
+        constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES, None)
+    if override_concurrent_launches is not None:
+        env_vars[constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES] = str(
+            int(override_concurrent_launches))
     if skypilot_config.loaded():
         # Only set the SKYPILOT_CONFIG env var if the user has a config file.
         env_vars[
@@ -609,15 +658,16 @@ def get_controller_resources(
     controller_resources_to_use: resources.Resources = list(
         controller_resources)[0]
 
-    handle = global_user_state.get_handle_from_cluster_name(
+    controller_handle = global_user_state.get_handle_from_cluster_name(
         controller.value.cluster_name)
-    if handle is not None:
+    if controller_handle is not None:
         # Use the existing resources, but override the autostop config with
         # the one currently specified in the config.
-        controller_resources_to_use = handle.launched_resources.copy(
-            autostop=controller_resources_config_copied.get('autostop'))
+        controller_resources_to_use = (
+            controller_handle.launched_resources.copy(
+                autostop=controller_resources_config_copied.get('autostop'))
+        )
 
     # If the controller and replicas are from the same cloud (and region/zone),
     # it should provide better connectivity. We will let the controller choose
@@ -714,6 +764,17 @@ def get_controller_resources(
     return result
 
 
+def get_controller_mem_size_gb() -> float:
+    try:
+        with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
+                  'r',
+                  encoding='utf-8') as f:
+            return float(f.read())
+    except FileNotFoundError:
+        pass
+    return common_utils.get_mem_size_gb()
+
+
 def _setup_proxy_command_on_controller(
         controller_launched_cloud: 'clouds.Cloud',
         user_config: Dict[str, Any]) -> config_utils.Config:
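`get_controller_mem_size_gb` prefers a memory figure written to `constants.CONTROLLER_K8S_MEMORY_FILE` (so a Kubernetes-hosted controller is not sized by the node's total memory) and falls back to the host. A rough equivalent of the behavior, with a hypothetical file path standing in for the constant and psutil assumed as the host-memory source:

```python
import os

import psutil  # assumption: host memory measured via psutil

# Hypothetical stand-in for constants.CONTROLLER_K8S_MEMORY_FILE.
MEMORY_FILE = '~/.sky/controller_memory_gb'


def controller_mem_size_gb() -> float:
    try:
        with open(os.path.expanduser(MEMORY_FILE), encoding='utf-8') as f:
            # The file holds a single float, e.g. "16.0".
            return float(f.read())
    except FileNotFoundError:
        # Fall back to the host's total memory.
        return psutil.virtual_memory().total / (1024 ** 3)
```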
@@ -1174,77 +1235,175 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
 
 # ======================= Resources Management Functions =======================
 
-JOB_MEMORY_MB = 350
-# Monitoring process for service is 1GB. This is based on an old estimation but
-# we keep it here for now.
+# Monitoring process for service is 512MB. This is based on an old
+# estimation but we keep it here for now.
 # TODO(tian): Remeasure this.
-SERVE_MONITORING_MEMORY_MB = 1024
-# The ratio of service memory to job memory.
-SERVE_PROC_RATIO = SERVE_MONITORING_MEMORY_MB / JOB_MEMORY_MB
-# Past 2000 simultaneous jobs, we become unstable.
-# See https://github.com/skypilot-org/skypilot/issues/4649.
-MAX_JOB_LIMIT = 2000
-# Number of ongoing launches launches allowed per CPU, for managed jobs.
-JOB_LAUNCHES_PER_CPU = 4
-# Number of ongoing launches launches allowed per CPU, for services. This is
-# also based on an old estimation, but SKyServe indeed spawn a new process
-# for each launch operation, so it should be slightly more resources demanding
-# than managed jobs.
-SERVE_LAUNCHES_PER_CPU = 2
-# The ratio of service launch to job launch. This is inverted as the parallelism
-# is determined by 1 / LAUNCHES_PER_CPU.
-SERVE_LAUNCH_RATIO = JOB_LAUNCHES_PER_CPU / SERVE_LAUNCHES_PER_CPU
+SERVE_MONITORING_MEMORY_MB = 512
+# The resource consumption ratio of service launch to serve down.
+SERVE_LAUNCH_RATIO = 2.0
 
 # The _RESOURCES_LOCK should be held whenever we are checking the parallelism
 # control or updating the schedule_state of any job or service. Any code that
 # takes this lock must conclude by calling maybe_schedule_next_jobs.
 _RESOURCES_LOCK = '~/.sky/locks/controller_resources.lock'
 
+# keep 2GB reserved after the controllers
+MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
+
+# NOTE: In the current implementation, we only consider the memory.
+# The ratio of resources consumption for managed jobs and pool/serve.
+# This measures pool_resources / jobs_resources. If 2 GB memory is allocated to
+# jobs, then 2 * POOL_JOBS_RESOURCES_RATIO GB memory is allocated to pool/serve.
+POOL_JOBS_RESOURCES_RATIO = 1
+# Number of ongoing launches allowed per worker. Can probably be
+# increased a bit to around 16 but keeping it lower just to be safe.
+LAUNCHES_PER_WORKER = 8
+# Number of ongoing launches allowed per service. Can probably be increased
+# a bit as well.
+LAUNCHES_PER_SERVICE = 4
+
+# Based on testing, each worker takes around 200-300MB memory. Keeping it
+# higher to be safe.
+JOB_WORKER_MEMORY_MB = 400
+# This can probably be increased to around 300-400 but keeping it lower just
+# to be safe.
+MAX_JOBS_PER_WORKER = 200
+# Maximum number of controllers that can be running. Hard to handle more than
+# 512 launches at once.
+MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER
+# Limit the number of jobs that can be running at once on the entire jobs
+# controller cluster. It's hard to handle cancellation of more than 2000 jobs at
+# once.
+# TODO(cooperc): Once we eliminate static bottlenecks (e.g. sqlite), remove this
+# hardcoded max limit.
+MAX_TOTAL_RUNNING_JOBS = 2000
+
+
+def compute_memory_reserved_for_controllers(
+        reserve_for_controllers: bool, reserve_extra_for_pool: bool) -> float:
+    reserved_memory_mb = 0.0
+    if reserve_for_controllers:
+        reserved_memory_mb = float(MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB)
+    if reserve_extra_for_pool:
+        reserved_memory_mb *= (1. + POOL_JOBS_RESOURCES_RATIO)
+    return reserved_memory_mb
+
+
+def _get_total_usable_memory_mb(pool: bool, consolidation_mode: bool) -> float:
+    controller_reserved = compute_memory_reserved_for_controllers(
+        reserve_for_controllers=True, reserve_extra_for_pool=pool)
+    total_memory_mb = (common_utils.get_mem_size_gb() * 1024 -
+                       controller_reserved)
+    if not consolidation_mode:
+        return total_memory_mb
+    config = server_config.compute_server_config(
+        deploy=True, quiet=True, reserved_memory_mb=controller_reserved)
+    used = 0.0
+    used += ((config.long_worker_config.garanteed_parallelism +
+              config.long_worker_config.burstable_parallelism) *
+             server_config.LONG_WORKER_MEM_GB * 1024)
+    used += ((config.short_worker_config.garanteed_parallelism +
+              config.short_worker_config.burstable_parallelism) *
+             server_config.SHORT_WORKER_MEM_GB * 1024)
+    return total_memory_mb - used
+
+
+def _is_consolidation_mode(pool: bool) -> bool:
+    return skypilot_config.get_nested(
+        ('jobs' if pool else 'serve', 'controller', 'consolidation_mode'),
+        default_value=False)
 
 
 @annotations.lru_cache(scope='request')
-def _get_job_parallelism() -> int:
-    job_memory = JOB_MEMORY_MB * 1024 * 1024
-    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
-    return max(job_limit, 1)
-
-
-@annotations.lru_cache(scope='request')
-def _get_launch_parallelism() -> int:
-    cpus = os.cpu_count()
-    return cpus * JOB_LAUNCHES_PER_CPU if cpus is not None else 1
-
-
-def can_start_new_process() -> bool:
-    num_procs = (serve_state.get_num_services() * SERVE_PROC_RATIO +
-                 managed_job_state.get_num_alive_jobs())
-    return num_procs < _get_job_parallelism()
+def _get_parallelism(pool: bool, raw_resource_per_unit: float) -> int:
+    """Returns the number of jobs controllers / services that should be running.
+
+    This is the number of controllers / services that should be running
+    to maximize resource utilization.
+
+    In consolidation mode, we use the existing API server so our resource
+    requirements are just for the job controllers / services. We try taking
+    up as much memory as possible left over from the API server.
+
+    In non-consolidation mode, we have to take into account the memory of the
+    API server workers. We limit to only 8 launches per worker, so our logic is
+    each controller will take CONTROLLER_MEMORY_MB + 8 * WORKER_MEMORY_MB. We
+    leave some leftover room for ssh codegen and ray status overhead.
+    """
+    consolidation_mode = _is_consolidation_mode(pool)
+
+    total_memory_mb = _get_total_usable_memory_mb(pool, consolidation_mode)
+
+    # In consolidation mode, we assume the API server is running in deployment
+    # mode, hence resource management (i.e. how many requests are allowed) is
+    # done by the API server.
+    resource_per_unit_worker = 0.
+    # Otherwise, it runs a local API server on the jobs/serve controller.
+    # We need to do the resource management ourselves.
+    if not consolidation_mode:
+        launches_per_worker = (LAUNCHES_PER_WORKER
+                               if pool else LAUNCHES_PER_SERVICE)
+        resource_per_unit_worker = (launches_per_worker *
+                                    server_config.LONG_WORKER_MEM_GB * 1024)
+
+    # If running pool on jobs controller, we need to account for the resources
+    # consumed by the jobs.
+    ratio = (1. + POOL_JOBS_RESOURCES_RATIO) if pool else 1.
+    resource_per_unit = ratio * (raw_resource_per_unit +
+                                 resource_per_unit_worker)
+
+    return max(int(total_memory_mb / resource_per_unit), 1)
+
+
+def get_number_of_jobs_controllers() -> int:
+    return min(
+        MAX_CONTROLLERS,
+        _get_parallelism(pool=True, raw_resource_per_unit=JOB_WORKER_MEMORY_MB))
+
+
+@annotations.lru_cache(scope='global', maxsize=1)
+def get_resources_lock_path() -> str:
+    path = os.path.expanduser(_RESOURCES_LOCK)
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    return path
+
+
+def _get_number_of_services(pool: bool) -> int:
+    return _get_parallelism(pool=pool,
+                            raw_resource_per_unit=SERVE_MONITORING_MEMORY_MB *
+                            POOL_JOBS_RESOURCES_RATIO)
+
+
+@annotations.lru_cache(scope='request')
+def _get_request_parallelism(pool: bool) -> int:
+    # NOTE(dev): One smoke test depends on this value.
+    # tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update
+    # assumes 4 concurrent launches.
+    override_concurrent_launches = os.environ.get(
+        constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES, None)
+    if override_concurrent_launches is not None and not pool:
+        return int(override_concurrent_launches)
+    # Limitation per service x number of services
+    launches_per_worker = (LAUNCHES_PER_WORKER
+                           if pool else LAUNCHES_PER_SERVICE)
+    return (launches_per_worker * POOL_JOBS_RESOURCES_RATIO *
+            _get_number_of_services(pool))
+
+
+def can_provision(pool: bool) -> bool:
+    # TODO(tian): probe API server to see if there is any pending provision
+    # requests.
+    return can_terminate(pool)
+
+
+def can_start_new_process(pool: bool) -> bool:
+    return serve_state.get_num_services() < _get_number_of_services(pool)
 
 
+def can_terminate(pool: bool) -> bool:
+    # TODO(tian): probe API server to see if there is any pending terminate
+    # requests.
     num_terminating = (
-        serve_state.total_number_provisioning_replicas() * SERVE_LAUNCH_RATIO +
-        managed_job_state.get_num_launching_jobs())
-    return num_terminating < _get_launch_parallelism()
+        serve_state.total_number_provisioning_replicas() +
+        serve_state.total_number_terminating_replicas() / SERVE_LAUNCH_RATIO)
+    return num_terminating < _get_request_parallelism(pool)
sky/utils/db/db_utils.py
CHANGED

@@ -7,15 +7,17 @@ import pathlib
 import sqlite3
 import threading
 import typing
-from typing import Any, Callable, Dict, Iterable, Optional
+from typing import Any, Callable, Dict, Iterable, Literal, Optional, Union
 
 import aiosqlite
 import aiosqlite.context
 import sqlalchemy
 from sqlalchemy import exc as sqlalchemy_exc
+from sqlalchemy.ext import asyncio as sqlalchemy_async
 
 from sky import sky_logging
 from sky.skylet import constants
+from sky.skylet import runtime_utils
 
 logger = sky_logging.init_logger(__name__)
 if typing.TYPE_CHECKING:
@@ -184,7 +186,7 @@ def add_column_to_table_sqlalchemy(
             pass
         else:
             raise
-    #
+    # postgresql
     except sqlalchemy_exc.ProgrammingError as e:
         if 'already exists' in str(e):
             pass
@@ -200,6 +202,7 @@ def add_column_to_table_alembic(
         server_default: Optional[str] = None,
         copy_from: Optional[str] = None,
         value_to_replace_existing_entries: Optional[Any] = None,
+        index: Optional[bool] = None,
 ):
     """Add a column to a table using Alembic operations.
 
@@ -214,6 +217,8 @@ def add_column_to_table_alembic(
         copy_from: Column name to copy values from (for existing rows)
         value_to_replace_existing_entries: Default value for existing NULL
             entries
+        index: If True, create an index on this column. If None, no index
+            is created.
     """
     from alembic import op  # pylint: disable=import-outside-toplevel
 
@@ -221,7 +226,8 @@ def add_column_to_table_alembic(
     # Create the column with server_default if provided
     column = sqlalchemy.Column(column_name,
                                column_type,
-                               server_default=server_default)
+                               server_default=server_default,
+                               index=index)
     op.add_column(table_name, column)
 
     # Handle data migration
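The new `index` argument is forwarded straight into `sqlalchemy.Column(..., index=index)`, so a schema migration can add a column and its index in one call. A hypothetical migration sketch; the table and column names are illustrative, and the positional `(table_name, column_name, column_type)` ordering is assumed from the documented parameters:

```python
"""Hypothetical Alembic migration sketch: add an indexed column."""
import sqlalchemy as sa

from sky.utils.db import db_utils


def upgrade():
    # Adds clusters.last_activity and creates an index on it in one call;
    # index=True is passed through to sqlalchemy.Column(...).
    db_utils.add_column_to_table_alembic('clusters',
                                         'last_activity',
                                         sa.Integer(),
                                         index=True)
```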
@@ -353,6 +359,27 @@ class SQLiteConn(threading.local):
         conn = await self._get_async_conn()
         return await conn.execute_fetchall(sql, parameters)
 
+    async def execute_get_returning_value_async(
+            self,
+            sql: str,
+            parameters: Optional[Iterable[Any]] = None
+    ) -> Optional[sqlite3.Row]:
+        conn = await self._get_async_conn()
+
+        if parameters is None:
+            parameters = []
+
+        def exec_and_get_returning_value(sql: str,
+                                         parameters: Optional[Iterable[Any]]):
+            # pylint: disable=protected-access
+            row = conn._conn.execute(sql, parameters).fetchone()
+            conn._conn.commit()
+            return row
+
+        # pylint: disable=protected-access
+        return await conn._execute(exec_and_get_returning_value, sql,
+                                   parameters)
+
     async def close(self):
         if self._async_conn is not None:
             await self._async_conn.close()
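`execute_get_returning_value_async` drops to the wrapped synchronous connection so that executing a statement, fetching its first row, and committing happen as one unit, which is what a write-plus-read statement such as SQLite's `RETURNING` clause (SQLite >= 3.35) needs. A hedged usage sketch; the `events` table and its columns are made up:

```python
from sky.utils.db import db_utils


async def insert_event(conn: db_utils.SQLiteConn) -> int:
    # Insert a row and read back its autoincremented id in one statement,
    # with no second round-trip between the write and the read.
    row = await conn.execute_get_returning_value_async(
        'INSERT INTO events (payload) VALUES (?) RETURNING id', ('hello',))
    assert row is not None
    return row[0]
```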
@@ -375,32 +402,82 @@ def get_max_connections():
     return _max_connections
 
 
+@typing.overload
+def get_engine(
+        db_name: Optional[str],
+        async_engine: Literal[False] = False) -> sqlalchemy.engine.Engine:
+    ...
+
+
+@typing.overload
+def get_engine(db_name: Optional[str],
+               async_engine: Literal[True]) -> sqlalchemy_async.AsyncEngine:
+    ...
+
+
+def get_engine(
+    db_name: Optional[str],
+    async_engine: bool = False
+) -> Union[sqlalchemy.engine.Engine, sqlalchemy_async.AsyncEngine]:
+    """Get the engine for the given database name.
+
+    Args:
+        db_name: The name of the database. ONLY used for SQLite. On Postgres,
+            we use a single database, which we get from the connection string.
+        async_engine: Whether to return an async engine.
+    """
     conn_string = None
     if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
         conn_string = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
     if conn_string:
+        if async_engine:
+            conn_string = conn_string.replace('postgresql://',
+                                              'postgresql+asyncpg://')
         with _db_creation_lock:
+            # We use the same cache for both sync and async engines
+            # because we change the conn_string in the async case,
+            # so they would not overlap.
             if conn_string not in _postgres_engine_cache:
+                engine_type = 'sync' if not async_engine else 'async'
+                logger.debug(
+                    f'Creating a new postgres {engine_type} engine with '
+                    f'maximum {_max_connections} connections')
                 if _max_connections == 0:
+                    kw_args = {'poolclass': sqlalchemy.NullPool}
+                    if async_engine:
+                        _postgres_engine_cache[conn_string] = (
+                            sqlalchemy_async.create_async_engine(
+                                conn_string, **kw_args))
+                    else:
+                        _postgres_engine_cache[conn_string] = (
+                            sqlalchemy.create_engine(conn_string, **kw_args))
                 else:
+                    kw_args = {
+                        'pool_size': _max_connections,
+                        'max_overflow': max(0, 5 - _max_connections),
+                        'pool_pre_ping': True,
+                        'pool_recycle': 1800
+                    }
+                    if async_engine:
+                        kw_args[
+                            'poolclass'] = sqlalchemy.pool.AsyncAdaptedQueuePool
+                        _postgres_engine_cache[conn_string] = (
+                            sqlalchemy_async.create_async_engine(
+                                conn_string, **kw_args))
+                    else:
+                        kw_args['poolclass'] = sqlalchemy.pool.QueuePool
+                        _postgres_engine_cache[conn_string] = (
+                            sqlalchemy.create_engine(conn_string, **kw_args))
             engine = _postgres_engine_cache[conn_string]
     else:
-        db_path = os.path.expanduser(f'~/.sky/{db_name}.db')
+        assert db_name is not None, 'db_name must be provided for SQLite'
+        db_path = runtime_utils.get_runtime_dir_path(f'.sky/{db_name}.db')
         pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
+        if async_engine:
+            # This is an AsyncEngine, instead of a (normal, synchronous) Engine,
+            # so we should not put it in the cache. Instead, just return.
+            return sqlalchemy_async.create_async_engine(
+                'sqlite+aiosqlite:///' + db_path, connect_args={'timeout': 30})
         if db_path not in _sqlite_engine_cache:
            _sqlite_engine_cache[db_path] = sqlalchemy.create_engine(
                'sqlite:///' + db_path)
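The `@typing.overload` pair means callers get a precisely-typed engine without casts: `async_engine=True` narrows the return type to `AsyncEngine`, while the default returns a synchronous `Engine`. A minimal sketch of the async path; the `spot_jobs` database and `spot` table names are illustrative:

```python
import sqlalchemy

from sky.utils.db import db_utils


async def count_rows() -> int:
    # The Literal[True] overload types this as AsyncEngine, so type
    # checkers accept engine.connect() as an async context manager.
    engine = db_utils.get_engine('spot_jobs', async_engine=True)
    async with engine.connect() as conn:
        result = await conn.execute(
            sqlalchemy.text('SELECT COUNT(*) FROM spot'))
        return result.scalar_one()
```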
|