skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/lock_events.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Lock events."""
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
import os
|
|
5
|
+
from typing import Optional, Union
|
|
6
|
+
|
|
7
|
+
import filelock
|
|
8
|
+
|
|
9
|
+
from sky.utils import locks
|
|
10
|
+
from sky.utils import timeline
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DistributedLockEvent:
    """Distributed lock that also records timeline events.

    Wraps the lock object returned by ``locks.get_lock`` and emits two
    kinds of ``timeline.Event``: one spanning every ``acquire()`` call and
    one spanning the period during which the lock is held.

    Instances can be used as a context manager or as a function decorator.
    """

    def __init__(self, lock_id: str, timeout: Optional[float] = None):
        """Create the underlying lock and the "hold" timeline event.

        Args:
            lock_id: Identifier passed to ``locks.get_lock``; it is also
                embedded in the names of the emitted timeline events.
            timeout: Passed through unchanged to ``locks.get_lock``.
        """
        self._lock_id = lock_id
        self._lock = locks.get_lock(lock_id, timeout)
        self._hold_lock_event = timeline.Event(
            f'[DistributedLock.hold]:{lock_id}')

    def _is_locked(self) -> bool:
        """Return whether the underlying lock currently reports as locked.

        ``DistributedLock.is_locked`` is declared as a regular method, so it
        has to be *called*: testing the bound method itself is always truthy,
        which previously meant the hold event was never started nor ended.
        A non-callable (property-style) ``is_locked`` is accepted as well.
        """
        state = self._lock.is_locked
        if callable(state):
            state = state()
        return bool(state)

    def acquire(self):
        """Acquire the lock, timing the acquisition.

        The hold event begins only when this call moves the lock from
        "not locked" to "locked".
        """
        was_locked = self._is_locked()
        with timeline.Event(f'[DistributedLock.acquire]:{self._lock_id}'):
            self._lock.acquire()
        if not was_locked and self._is_locked():
            # start holding the lock after initial acquiring
            self._hold_lock_event.begin()

    def release(self):
        """Release the lock.

        The hold event ends only when this call moves the lock from
        "locked" to "not locked".
        """
        was_locked = self._is_locked()
        self._lock.release()
        if was_locked and not self._is_locked():
            # stop holding the lock after initial releasing
            self._hold_lock_event.end()

    def __enter__(self):
        """Acquire the lock and return this object."""
        self.acquire()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release the lock; exceptions from the body are not suppressed."""
        self.release()

    def __call__(self, f):
        """Decorate ``f`` so that every call runs while holding the lock."""

        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            with self:
                return f(*args, **kwargs)

        return wrapper
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class FileLockEvent:
    """File lock that also records timeline events.

    Wraps a ``filelock.FileLock`` and emits a ``timeline.Event`` around each
    acquisition plus one covering the time the lock is held. Usable as a
    context manager or as a decorator.
    """

    def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
        """Prepare the lock file's directory, the lock and the hold event.

        Args:
            lockfile: Path of the lock file; its parent directory is created
                if it does not exist yet.
            timeout: Passed through to ``filelock.FileLock``.
        """
        self._lockfile = lockfile
        parent_dir = os.path.dirname(os.path.abspath(self._lockfile))
        os.makedirs(parent_dir, exist_ok=True)
        self._lock = filelock.FileLock(self._lockfile, timeout)
        self._hold_lock_event = timeline.Event(
            f'[FileLock.hold]:{self._lockfile}')

    def acquire(self):
        """Acquire the file lock, timing the acquisition itself."""
        held_before = self._lock.is_locked
        with timeline.Event(f'[FileLock.acquire]:{self._lockfile}'):
            self._lock.acquire()
        if held_before or not self._lock.is_locked:
            # Either a nested acquire or the lock is not held: nothing to do.
            return
        # start holding the lock after initial acquiring
        self._hold_lock_event.begin()

    def release(self):
        """Release the file lock and close the hold event when it frees."""
        held_before = self._lock.is_locked
        self._lock.release()
        if not held_before or self._lock.is_locked:
            # Lock was not held, or is still held after this release.
            return
        # stop holding the lock after initial releasing
        self._hold_lock_event.end()

    def __enter__(self):
        """Acquire the lock and hand back this object."""
        self.acquire()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release the lock when leaving the ``with`` body."""
        self.release()

    def __call__(self, f):
        """Return ``f`` wrapped so each call runs inside ``with self``."""

        # Make this class callable as a decorator.
        @functools.wraps(f)
        def locked_call(*args, **kwargs):
            with self:
                return f(*args, **kwargs)

        return locked_call
|
sky/utils/locks.py
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
"""Lock for SkyPilot.
|
|
2
|
+
|
|
3
|
+
This module provides an abstraction for locking that can use
|
|
4
|
+
either local file locks or database-based distributed locks.
|
|
5
|
+
"""
|
|
6
|
+
import abc
|
|
7
|
+
import hashlib
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import time
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
import filelock
|
|
14
|
+
import psycopg2
|
|
15
|
+
import sqlalchemy
|
|
16
|
+
|
|
17
|
+
from sky import global_user_state
|
|
18
|
+
from sky.skylet import runtime_utils
|
|
19
|
+
from sky.utils import common_utils
|
|
20
|
+
from sky.utils.db import db_utils
|
|
21
|
+
|
|
22
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
# The directory for file locks.
|
|
25
|
+
# NOTE(review): the value is produced by runtime_utils.get_runtime_dir_path
# from '.sky/locks'; how that helper resolves the base directory is not
# visible here - confirm before relying on a specific location.
SKY_LOCKS_DIR = runtime_utils.get_runtime_dir_path('.sky/locks')
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class LockTimeout(RuntimeError):
    """Error signalling that acquiring a lock timed out."""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class AcquireReturnProxy:
    """Context manager wrapper around an already-acquired lock.

    ``acquire()`` hands back one of these so the result can be used directly
    in a ``with`` statement: entering yields the lock, leaving releases it.
    """

    def __init__(self, lock: 'DistributedLock') -> None:
        # The lock whose release() is invoked on exit.
        self.lock = lock

    def __enter__(self) -> 'DistributedLock':
        """Yield the wrapped lock; no acquisition happens here."""
        wrapped = self.lock
        return wrapped

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Release the wrapped lock; exceptions are not suppressed."""
        self.lock.release()
        return None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class DistributedLock(abc.ABC):
    """Interface every SkyPilot lock implementation has to provide.

    Subclasses supply the actual locking primitive; this base class stores
    the common settings and offers ``with`` support on top of ``acquire()``
    and ``release()``. Implementations are meant to work across multiple
    processes and potentially multiple machines.
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 0.1):
        """Store the lock settings.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
                ``None`` means wait indefinitely.
            poll_interval: Interval in seconds to poll for lock acquisition.
        """
        self.lock_id = lock_id
        self.timeout = timeout
        self.poll_interval = poll_interval

    @abc.abstractmethod
    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the lock.

        Args:
            blocking: When True, block until the lock is acquired or the
                timeout elapses. When False, return immediately.

        Returns:
            An AcquireReturnProxy usable as a context manager.

        Raises:
            LockTimeout: If the lock cannot be acquired.
        """

    @abc.abstractmethod
    def release(self) -> None:
        """Release the lock."""

    @abc.abstractmethod
    def force_unlock(self) -> None:
        """Force unlock the lock if it is acquired."""

    @abc.abstractmethod
    def is_locked(self) -> bool:
        """Check if the lock is acquired."""

    def __enter__(self) -> 'DistributedLock':
        """Acquire the lock with default arguments and return this object."""
        self.acquire()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Release the lock when the ``with`` body finishes."""
        self.release()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class FileLock(DistributedLock):
    """A wrapper around filelock.FileLock.

    This implements a distributed lock that works across multiple processes
    when they share the same filesystem.
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 0.1):
        """Initialize the file lock.

        Args:
            lock_id: Unique identifier for the lock. It is embedded in the
                lock file name under SKY_LOCKS_DIR.
            timeout: Maximum time to wait for lock acquisition.
                If None, wait indefinitely.
            poll_interval: Interval in seconds to poll for lock acquisition.
        """
        super().__init__(lock_id, timeout, poll_interval)
        os.makedirs(SKY_LOCKS_DIR, exist_ok=True)
        self.lock_path = os.path.join(SKY_LOCKS_DIR, f'.{lock_id}.lock')
        # filelock uses a negative timeout to mean "no timeout".
        filelock_timeout = -1 if timeout is None else timeout
        self._filelock: filelock.FileLock = filelock.FileLock(
            self.lock_path, timeout=filelock_timeout)

    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the file lock.

        Args:
            blocking: If True, wait (up to the configured timeout) for the
                lock. If False, fail immediately when the lock is taken.

        Returns:
            AcquireReturnProxy that releases this lock on exit.

        Raises:
            LockTimeout: If the lock cannot be acquired.
        """
        try:
            # Forward the configured poll interval; otherwise filelock
            # silently falls back to its own default and the constructor
            # argument has no effect.
            acquired = self._filelock.acquire(
                poll_interval=self.poll_interval, blocking=blocking)
        except filelock.Timeout as e:
            raise LockTimeout(
                f'Failed to acquire file lock {self.lock_id}') from e
        if not acquired:
            raise LockTimeout(f'Failed to acquire file lock {self.lock_id}')
        return AcquireReturnProxy(self)

    def release(self) -> None:
        """Release the file lock."""
        self._filelock.release()

    def force_unlock(self) -> None:
        """Force unlock the file lock by removing the lock file."""
        common_utils.remove_file_if_exists(self.lock_path)

    def is_locked(self) -> bool:
        """Check if the file lock is held by this lock object."""
        return self._filelock.is_locked
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class PostgresLock(DistributedLock):
    """PostgreSQL advisory lock implementation.

    Uses PostgreSQL advisory locks to implement distributed locking
    that works across multiple machines sharing the same database.
    Supports both exclusive and shared lock modes.

    The lock is taken with the session-level advisory lock functions on a
    dedicated connection, which is kept in ``self._connection`` from a
    successful acquire() until release().

    References:
    # pylint: disable=line-too-long
    - https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS
    - https://www.postgresql.org/docs/current/functions-admin.html#FUNCTIONS-ADVISORY-LOCKS
    # TODO(cooperc): re-enable pylint line-too-long
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 1,
                 shared_lock: bool = False):
        """Initialize the postgres lock.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
                If None, wait indefinitely.
            poll_interval: Interval in seconds to poll for lock acquisition,
                default to 1 second to avoid storming the database.
            shared_lock: Whether to use shared advisory lock or exclusive
                advisory lock (default).
        """
        super().__init__(lock_id, timeout, poll_interval)
        # Convert string lock_id to integer for postgres advisory locks
        self._lock_key = self._string_to_lock_key(lock_id)
        self._shared_lock = shared_lock
        # True only while this object holds the lock on self._connection.
        self._acquired = False
        self._connection: Optional[sqlalchemy.pool.PoolProxiedConnection] = None

    def _string_to_lock_key(self, s: str) -> int:
        """Convert string to a 64-bit integer for advisory lock key."""
        hash_digest = hashlib.sha256(s.encode('utf-8')).digest()
        # Take first 8 bytes and convert to int, ensure positive 64-bit
        return int.from_bytes(hash_digest[:8], 'big') & ((1 << 63) - 1)

    def _get_connection(self) -> sqlalchemy.pool.PoolProxiedConnection:
        """Get database connection.

        Raises:
            ValueError: If the configured database is not PostgreSQL.
        """
        engine = global_user_state.initialize_and_get_db()
        if engine.dialect.name != db_utils.SQLAlchemyDialect.POSTGRESQL.value:
            raise ValueError('PostgresLock requires PostgreSQL database. '
                             f'Current dialect: {engine.dialect.name}')
        # Borrow a dedicated connection from the pool.
        return engine.raw_connection()

    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the postgres advisory lock.

        Polls the non-blocking ``pg_try_advisory_lock*`` function every
        ``poll_interval`` seconds until it succeeds or the timeout expires.

        Args:
            blocking: If True, keep polling until the lock is acquired or
                the timeout expires. If False, try exactly once.

        Returns:
            AcquireReturnProxy that releases this lock on exit.

        Raises:
            LockTimeout: If the lock cannot be acquired.
        """
        if self._acquired:
            return AcquireReturnProxy(self)

        self._connection = self._get_connection()

        if self._shared_lock:
            lock_func = 'pg_try_advisory_lock_shared'
        else:
            lock_func = 'pg_try_advisory_lock'
        mode_str = ('shared' if self._shared_lock else 'exclusive')

        # Use a monotonic clock so that wall-clock adjustments cannot
        # shorten or extend the timeout.
        start_time = time.monotonic()

        try:
            # Create the cursor inside the try block so that the borrowed
            # connection is given back if cursor creation fails.
            cursor = self._connection.cursor()
            while True:
                cursor.execute(f'SELECT {lock_func}(%s)', (self._lock_key,))
                result = cursor.fetchone()[0]

                if result:
                    self._acquired = True
                    return AcquireReturnProxy(self)

                if not blocking:
                    raise LockTimeout(
                        f'Failed to immediately acquire {mode_str} '
                        f'postgres lock {self.lock_id}')

                if (self.timeout is not None and
                        time.monotonic() - start_time > self.timeout):
                    raise LockTimeout(
                        f'Failed to acquire {mode_str} postgres lock '
                        f'{self.lock_id} within {self.timeout} '
                        f'seconds')

                time.sleep(self.poll_interval)

        except Exception:
            # The lock was not obtained: give the connection back.
            self._close_connection()
            raise

    def release(self) -> None:
        """Release the postgres advisory lock.

        No-op if this object does not hold the lock. A lost database
        connection is logged and swallowed; other errors propagate. In
        every case the object ends up in the "not acquired" state with no
        connection attached.
        """
        if not self._acquired or not self._connection:
            return

        # Assume failure until the unlock is committed, so that every error
        # path discards the connection instead of returning it to the pool.
        unlock_failed = True
        try:
            cursor = self._connection.cursor()
            if self._shared_lock:
                unlock_func = 'pg_advisory_unlock_shared'
            else:
                unlock_func = 'pg_advisory_unlock'
            cursor.execute(f'SELECT {unlock_func}(%s)', (self._lock_key,))
            self._connection.commit()
            unlock_failed = False
        except psycopg2.OperationalError as e:
            # Lost connection to the database, likely the lock is force unlocked
            # by other routines.
            logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
        finally:
            # The connection is dropped below, so this object can no longer
            # be holding the lock. Without this reset, a failed release
            # would leave is_locked() True and make the next acquire()
            # return without actually taking the lock.
            self._acquired = False
            # Invalidate after a failure: this avoids SQLAlchemy trying to
            # reset a dead connection, and closing the session also drops
            # any advisory lock it may still hold.
            self._close_connection(invalidate=unlock_failed)

    def force_unlock(self) -> None:
        """Force unlock the postgres advisory lock.

        If this object holds the lock it is released normally. Otherwise
        every backend that pg_locks reports for this lock key is
        terminated.

        Raises:
            RuntimeError: If any step of the force unlock fails.
        """
        try:
            # The lock is held by current routine, gracefully unlock it
            if self._acquired:
                self.release()
                return

            # The lock is held by another routine, force unlock it.
            if self._connection is None:
                self._connection = self._get_connection()
            cursor = self._connection.cursor()
            if self._shared_lock:
                unlock_func = 'pg_advisory_unlock_shared'
            else:
                unlock_func = 'pg_advisory_unlock'

            cursor.execute(f'SELECT {unlock_func}(%s)', (self._lock_key,))
            result = cursor.fetchone()[0]
            if result:
                # The lock is held by current routine and unlock succeed
                self._connection.commit()
                self._acquired = False
                return
            # Single-bigint-key advisory locks are reported with the high
            # half of the key in classid, the low half in objid and
            # objsubid = 1. Filtering on objsubid avoids matching (and
            # terminating the holders of) two-int-key locks that happen to
            # combine to the same value.
            cursor.execute(
                ('SELECT pid FROM pg_locks WHERE locktype = \'advisory\' '
                 'AND ((classid::bigint << 32) | objid::bigint) = %s '
                 'AND objsubid = 1'),
                (self._lock_key,))
            rows = cursor.fetchall()
            if rows:
                # There can be multiple PIDs holding the lock, it is not enough
                # to only kill some of them. For example, if pid 1 is holding a
                # shared lock, and pid 2 is waiting to grab an exclusive lock,
                # killing pid 1 will transfer the lock to pid 2, so the lock
                # will still not be released.
                for row in rows:
                    cursor.execute('SELECT pg_terminate_backend(%s)', (row[0],))
                self._connection.commit()
                return
        except Exception as e:
            raise RuntimeError(
                f'Failed to force unlock postgres lock {self.lock_id}: {e}'
            ) from e
        finally:
            self._close_connection()

    def _close_connection(self, invalidate: bool = False) -> None:
        """Close the postgres connection.

        Errors while closing are logged at debug level and never raised.

        Args:
            invalidate: If True, invalidate connection instead of closing it.
                Use this when the connection might be broken (e.g., after
                pg_terminate_backend) to prevent SQLAlchemy from trying to
                reset it (which would result in an error being logged).
        """
        if self._connection:
            try:
                if invalidate:
                    self._connection.invalidate()
                else:
                    self._connection.close()
            except Exception as e:  # pylint: disable=broad-except
                if invalidate:
                    logger.debug(
                        f'Failed to invalidate postgres connection: {e}')
                else:
                    logger.debug(f'Failed to close postgres connection: {e}')
            self._connection = None

    def is_locked(self) -> bool:
        """Check if the postgres advisory lock is acquired."""
        return self._acquired
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def get_lock(lock_id: str,
             timeout: Optional[float] = None,
             lock_type: Optional[str] = None,
             poll_interval: Optional[float] = None,
             shared_lock: bool = False) -> DistributedLock:
    """Build the lock object matching the requested (or detected) backend.

    Args:
        lock_id: Unique identifier for the lock.
        timeout: Maximum time in seconds to wait for lock acquisition,
            None means wait indefinitely.
        lock_type: Type of lock to create ('filelock' or 'postgres').
            If None, auto-detect based on database configuration.
        poll_interval: Interval in seconds to poll for lock acquisition.
            If None, the chosen lock class uses its own default.
        shared_lock: Whether to use shared lock or exclusive lock (default).
            NOTE: Only applicable for PostgresLock.

    Returns:
        DistributedLock instance.

    Raises:
        ValueError: If lock_type is not a known lock type.
    """
    if lock_type is None:
        lock_type = _detect_lock_type()

    # Only forward poll_interval when the caller supplied one, so that each
    # lock class keeps its own default otherwise.
    poll_kwargs = {}
    if poll_interval is not None:
        poll_kwargs['poll_interval'] = poll_interval

    if lock_type == 'postgres':
        return PostgresLock(lock_id,
                            timeout,
                            shared_lock=shared_lock,
                            **poll_kwargs)

    if lock_type == 'filelock':
        # shared_lock is intentionally ignored here: the filelock library
        # does not support shared locks. It explicitly uses fcntl.LOCK_EX
        # on Unix systems, whereas fcntl.LOCK_SH is needed for shared locks.
        #
        # This should not introduce correctness issues; it only reduces
        # concurrency (and so performance), because read-only operations
        # cannot run at the same time and each has to hold the lock
        # exclusively.
        #
        # Given that Postgres is recommended for production, the impact
        # should mostly be limited to the local API server.
        return FileLock(lock_id, timeout, **poll_kwargs)

    raise ValueError(f'Unknown lock type: {lock_type}')
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _detect_lock_type() -> str:
    """Auto-detect the appropriate lock type based on configuration.

    Returns:
        'postgres' if the configured database engine uses the PostgreSQL
        dialect, otherwise 'filelock'. 'filelock' is also returned when
        the database cannot be inspected.
    """
    try:
        engine = global_user_state.initialize_and_get_db()
        if engine.dialect.name == db_utils.SQLAlchemyDialect.POSTGRESQL.value:
            return 'postgres'
    except Exception as e:  # pylint: disable=broad-except
        # Fall back to filelock if database detection fails. This stays
        # best-effort, but the failure is logged: a file lock only
        # coordinates processes sharing a filesystem, so a silent fallback
        # would be hard to diagnose.
        logger.debug('Failed to detect database dialect, falling back to '
                     f'filelock: {e}')

    return 'filelock'
|