skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,94 @@
1
+ """Lock events."""
2
+
3
+ import functools
4
+ import os
5
+ from typing import Optional, Union
6
+
7
+ import filelock
8
+
9
+ from sky.utils import locks
10
+ from sky.utils import timeline
11
+
12
+
13
class DistributedLockEvent:
    """Serve both as a distributed lock and event for the lock.

    Wraps the lock object returned by ``locks.get_lock`` and pairs it with
    ``timeline.Event`` objects: every ``acquire()`` call is bracketed by an
    acquire event, and a separate hold event is begun when the lock goes from
    not-held to held and ended when it goes from held to not-held.

    Instances can be used as a context manager (``with lock_event:``) or as a
    decorator (``@lock_event``).
    """

    def __init__(self, lock_id: str, timeout: Optional[float] = None):
        """Create the underlying lock and its hold event.

        Args:
            lock_id: Identifier passed to ``locks.get_lock`` and embedded in
                the timeline event names.
            timeout: Passed through to ``locks.get_lock`` unchanged.
        """
        self._lock_id = lock_id
        self._lock = locks.get_lock(lock_id, timeout)
        self._hold_lock_event = timeline.Event(
            f'[DistributedLock.hold]:{lock_id}')

    def _is_locked(self) -> bool:
        """Return whether the underlying lock currently reports as held.

        ``is_locked`` may be exposed as a method or as a property/attribute.
        A bound method is always truthy, so testing it without calling it
        would make the hold event below never begin or end. Call it when it
        is callable; otherwise use the value directly.
        """
        state = self._lock.is_locked
        return bool(state() if callable(state) else state)

    def acquire(self):
        """Acquire the lock, beginning the hold event on first acquisition."""
        was_locked = self._is_locked()
        with timeline.Event(f'[DistributedLock.acquire]:{self._lock_id}'):
            self._lock.acquire()
        if not was_locked and self._is_locked():
            # start holding the lock after initial acquiring
            self._hold_lock_event.begin()

    def release(self):
        """Release the lock, ending the hold event once it is fully released."""
        was_locked = self._is_locked()
        self._lock.release()
        if was_locked and not self._is_locked():
            # stop holding the lock after initial releasing
            self._hold_lock_event.end()

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.release()

    def __call__(self, f):
        # Make this class callable as a decorator: the wrapped function runs
        # with the lock held.
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            with self:
                return f(*args, **kwargs)

        return wrapper
52
+
53
+
54
class FileLockEvent:
    """Serve both as a file lock and event for the lock.

    Combines a ``filelock.FileLock`` with ``timeline.Event`` objects: each
    ``acquire()`` call is bracketed by an acquire event, and a hold event is
    begun/ended when the lock transitions into/out of the held state.
    Usable as a context manager or as a decorator.
    """

    def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
        """Ensure the lock file's parent directory exists and build the lock.

        Args:
            lockfile: Path of the lock file.
            timeout: Passed to ``filelock.FileLock`` unchanged.
        """
        self._lockfile = lockfile
        parent_dir = os.path.dirname(os.path.abspath(self._lockfile))
        os.makedirs(parent_dir, exist_ok=True)
        self._lock = filelock.FileLock(self._lockfile, timeout)
        hold_event_name = f'[FileLock.hold]:{self._lockfile}'
        self._hold_lock_event = timeline.Event(hold_event_name)

    def acquire(self):
        """Acquire the file lock; begin the hold event on first acquisition."""
        held_before = self._lock.is_locked
        with timeline.Event(f'[FileLock.acquire]:{self._lockfile}'):
            self._lock.acquire()
        if held_before:
            # Already held before this call: the hold event is already open.
            return
        if self._lock.is_locked:
            # start holding the lock after initial acquiring
            self._hold_lock_event.begin()

    def release(self):
        """Release the file lock; end the hold event once no longer held."""
        held_before = self._lock.is_locked
        self._lock.release()
        if not held_before:
            # Nothing was held, so there is no hold event to close.
            return
        if not self._lock.is_locked:
            # stop holding the lock after initial releasing
            self._hold_lock_event.end()

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.release()

    def __call__(self, f):
        # Make this class callable as a decorator.
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            with self:
                result = f(*args, **kwargs)
            return result

        return wrapper
sky/utils/locks.py ADDED
@@ -0,0 +1,416 @@
1
+ """Lock for SkyPilot.
2
+
3
+ This module provides an abstraction for locking that can use
4
+ either local file locks or database-based distributed locks.
5
+ """
6
+ import abc
7
+ import hashlib
8
+ import logging
9
+ import os
10
+ import time
11
+ from typing import Any, Optional
12
+
13
+ import filelock
14
+ import psycopg2
15
+ import sqlalchemy
16
+
17
+ from sky import global_user_state
18
+ from sky.skylet import runtime_utils
19
+ from sky.utils import common_utils
20
+ from sky.utils.db import db_utils
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # The directory for file locks.
25
+ SKY_LOCKS_DIR = runtime_utils.get_runtime_dir_path('.sky/locks')
26
+
27
+
28
class LockTimeout(RuntimeError):
    """Raised when a lock could not be acquired.

    The lock implementations in this module raise it both when a blocking
    acquisition times out and when a non-blocking attempt fails.
    """
31
+
32
+
33
class AcquireReturnProxy:
    """Context manager handed back by ``acquire()``.

    Entering the proxy yields the lock it wraps; leaving it releases that
    lock. This allows ``with lock.acquire(): ...`` for a lock that was
    already acquired by the ``acquire()`` call itself.
    """

    def __init__(self, lock: 'DistributedLock') -> None:
        # The lock to hand out on enter and to release on exit.
        self.lock = lock

    def __enter__(self) -> 'DistributedLock':
        """Return the wrapped lock without acquiring it again."""
        return self.lock

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Release the wrapped lock; exceptions are not suppressed."""
        self.lock.release()
48
+
49
+
50
class DistributedLock(abc.ABC):
    """Abstract base class for a distributed lock.

    Subclasses implement ``acquire``, ``release``, ``force_unlock`` and
    ``is_locked``. The base class stores the shared settings and provides
    the context manager protocol on top of ``acquire``/``release``, for
    locks that can work across multiple processes and potentially multiple
    machines.
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 0.1):
        """Store the settings shared by all lock implementations.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
                If None, wait indefinitely.
            poll_interval: Interval in seconds to poll for lock acquisition.
        """
        self.lock_id, self.timeout = lock_id, timeout
        self.poll_interval = poll_interval

    @abc.abstractmethod
    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the lock.

        Args:
            blocking: If True, block until lock is acquired or timeout.
                If False, return immediately.

        Returns:
            AcquireReturnProxy that can be used as a context manager.

        Raises:
            LockTimeout: If lock cannot be acquired.
        """

    @abc.abstractmethod
    def release(self) -> None:
        """Release the lock."""

    @abc.abstractmethod
    def force_unlock(self) -> None:
        """Force unlock the lock if it is acquired."""

    @abc.abstractmethod
    def is_locked(self) -> bool:
        """Check if the lock is acquired."""

    def __enter__(self) -> 'DistributedLock':
        """Acquire the lock on entering a ``with`` block; return self."""
        self.acquire()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Release the lock on leaving a ``with`` block."""
        self.release()
112
+
113
+
114
class FileLock(DistributedLock):
    """A wrapper around filelock.FileLock.

    This implements a distributed lock that works across multiple processes
    when they share the same filesystem.
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 0.1):
        """Initialize the file lock.

        The lock file is ``.<lock_id>.lock`` under ``SKY_LOCKS_DIR``, which
        is created if missing.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
                None is mapped to -1 for ``filelock.FileLock``.
            poll_interval: Interval in seconds to poll for lock acquisition.
        """
        super().__init__(lock_id, timeout, poll_interval)
        os.makedirs(SKY_LOCKS_DIR, exist_ok=True)
        self.lock_path = os.path.join(SKY_LOCKS_DIR, f'.{lock_id}.lock')
        if timeout is None:
            timeout = -1
        self._filelock: filelock.FileLock = filelock.FileLock(self.lock_path,
                                                              timeout=timeout)

    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the file lock.

        Args:
            blocking: Forwarded to ``filelock.FileLock.acquire``.

        Returns:
            AcquireReturnProxy that releases this lock when exited.

        Raises:
            LockTimeout: If the underlying lock raises ``filelock.Timeout``
                or reports that it was not acquired.
        """
        try:
            # Forward the configured poll interval; otherwise the value
            # accepted by __init__ would be silently ignored and filelock
            # would poll at its own default rate.
            acquired = self._filelock.acquire(
                poll_interval=self.poll_interval, blocking=blocking)
            if not acquired:
                raise LockTimeout(f'Failed to acquire file lock {self.lock_id}')
            return AcquireReturnProxy(self)
        except filelock.Timeout as e:
            raise LockTimeout(
                f'Failed to acquire file lock {self.lock_id}') from e

    def release(self) -> None:
        """Release the file lock."""
        self._filelock.release()

    def force_unlock(self) -> None:
        """Force unlock the file lock by removing the lock file."""
        common_utils.remove_file_if_exists(self.lock_path)

    def is_locked(self) -> bool:
        """Return whether the underlying filelock object holds the lock."""
        return self._filelock.is_locked
161
+
162
+
163
class PostgresLock(DistributedLock):
    """PostgreSQL advisory lock implementation.

    Uses PostgreSQL advisory locks to implement distributed locking
    that works across multiple machines sharing the same database.
    Supports both exclusive and shared lock modes.

    The lock is taken on a dedicated connection borrowed from the engine's
    pool in ``acquire()``; that connection is kept on the instance until
    ``release()`` or ``force_unlock()`` gives it up.

    References:
    # pylint: disable=line-too-long
    - https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS
    - https://www.postgresql.org/docs/current/functions-admin.html#FUNCTIONS-ADVISORY-LOCKS
    # TODO(cooperc): re-enable pylint line-too-long
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 1,
                 shared_lock: bool = False):
        """Initialize the postgres lock.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
            poll_interval: Interval in seconds to poll for lock acquisition,
                default to 1 second to avoid storming the database.
            shared_lock: Whether to use shared advisory lock or exclusive
                advisory lock (default).
        """
        super().__init__(lock_id, timeout, poll_interval)
        # Convert string lock_id to integer for postgres advisory locks
        self._lock_key = self._string_to_lock_key(lock_id)
        self._shared_lock = shared_lock
        # Local view of whether this instance holds the lock.
        self._acquired = False
        # Connection on which the advisory lock was taken, if any.
        self._connection: Optional[sqlalchemy.pool.PoolProxiedConnection] = None

    def _string_to_lock_key(self, s: str) -> int:
        """Convert string to a 64-bit integer for advisory lock key."""
        hash_digest = hashlib.sha256(s.encode('utf-8')).digest()
        # Take first 8 bytes and convert to int, ensure positive 64-bit
        return int.from_bytes(hash_digest[:8], 'big') & ((1 << 63) - 1)

    def _get_connection(self) -> sqlalchemy.pool.PoolProxiedConnection:
        """Get database connection.

        Raises:
            ValueError: If the configured database is not PostgreSQL.
        """
        engine = global_user_state.initialize_and_get_db()
        if engine.dialect.name != db_utils.SQLAlchemyDialect.POSTGRESQL.value:
            raise ValueError('PostgresLock requires PostgreSQL database. '
                             f'Current dialect: {engine.dialect.name}')
        # Borrow a dedicated connection from the pool.
        return engine.raw_connection()

    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the postgres advisory lock.

        Polls the non-blocking ``pg_try_advisory_lock*`` function every
        ``poll_interval`` seconds until it succeeds or ``timeout`` elapses.
        If this instance already holds the lock, returns immediately.

        Args:
            blocking: If False, make a single attempt and fail right away
                when the lock is not available.

        Returns:
            AcquireReturnProxy that releases this lock when exited.

        Raises:
            LockTimeout: If the lock is unavailable (non-blocking) or was
                not obtained within ``timeout`` seconds.
        """
        if self._acquired:
            return AcquireReturnProxy(self)

        if self._shared_lock:
            lock_func = 'pg_try_advisory_lock_shared'
            mode_str = 'shared'
        else:
            lock_func = 'pg_try_advisory_lock'
            mode_str = 'exclusive'

        self._connection = self._get_connection()
        # Use a monotonic clock so that wall-clock adjustments can neither
        # shorten nor extend the timeout.
        start_time = time.monotonic()

        try:
            # Created inside the try block so that a failure here still
            # hands the borrowed connection back.
            cursor = self._connection.cursor()
            while True:
                cursor.execute(f'SELECT {lock_func}(%s)', (self._lock_key,))
                result = cursor.fetchone()[0]

                if result:
                    self._acquired = True
                    return AcquireReturnProxy(self)

                if not blocking:
                    raise LockTimeout(
                        f'Failed to immediately acquire {mode_str} '
                        f'postgres lock {self.lock_id}')

                if (self.timeout is not None and
                        time.monotonic() - start_time > self.timeout):
                    raise LockTimeout(
                        f'Failed to acquire {mode_str} postgres lock '
                        f'{self.lock_id} within {self.timeout} '
                        f'seconds')

                time.sleep(self.poll_interval)

        except Exception:
            self._close_connection()
            raise

    def release(self) -> None:
        """Release the postgres advisory lock.

        No-op when this instance does not hold the lock. After this call
        the instance never reports the lock as held: the connection is
        either returned to the pool after a successful unlock or
        invalidated, which ends the database session.
        """
        if not self._acquired or not self._connection:
            return

        invalidate = False
        try:
            cursor = self._connection.cursor()
            if self._shared_lock:
                unlock_func = 'pg_advisory_unlock_shared'
            else:
                unlock_func = 'pg_advisory_unlock'
            cursor.execute(f'SELECT {unlock_func}(%s)', (self._lock_key,))
            self._connection.commit()
        except psycopg2.OperationalError as e:
            # Lost connection to the database, likely the lock is force unlocked
            # by other routines.
            logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
            invalidate = True
        except Exception:
            # The unlock did not complete. Advisory locks taken this way are
            # session-level, so returning the connection to the pool would
            # keep the lock held by a pooled session; drop the session
            # instead and let the error propagate.
            invalidate = True
            raise
        finally:
            # The connection is given up on every path, so the lock can no
            # longer be held through this instance. Resetting the flag
            # keeps a later acquire() from returning without locking.
            self._acquired = False
            # Invalidate if connection was lost to prevent SQLAlchemy from
            # trying to reset a dead connection
            self._close_connection(invalidate=invalidate)

    def force_unlock(self) -> None:
        """Force unlock the postgres advisory lock.

        Releases gracefully when this instance holds the lock. Otherwise
        tries an unlock on a connection and, if that does not succeed,
        terminates every backend listed in ``pg_locks`` for this key.

        Raises:
            RuntimeError: If any step of the force unlock fails.
        """
        try:
            # The lock is held by current routine, gracefully unlock it
            if self._acquired:
                self.release()
                return

            # The lock is held by another routine, force unlock it.
            if self._connection is None:
                self._connection = self._get_connection()
            cursor = self._connection.cursor()
            if self._shared_lock:
                unlock_func = 'pg_advisory_unlock_shared'
            else:
                unlock_func = 'pg_advisory_unlock'

            cursor.execute(f'SELECT {unlock_func}(%s)', (self._lock_key,))
            result = cursor.fetchone()[0]
            if result:
                # The lock is held by current routine and unlock succeed
                self._connection.commit()
                self._acquired = False
                return
            cursor.execute(
                ('SELECT pid FROM pg_locks WHERE locktype = \'advisory\' '
                 'AND ((classid::bigint << 32) | objid::bigint) = %s'),
                (self._lock_key,))
            rows = cursor.fetchall()
            if rows:
                # There can be multiple PIDs holding the lock, it is not enough
                # to only kill some of them. For example, if pid 1 is holding a
                # shared lock, and pid 2 is waiting to grab an exclusive lock,
                # killing pid 1 will transfer the lock to pid 2, so the lock
                # will still not be released.
                for row in rows:
                    cursor.execute('SELECT pg_terminate_backend(%s)', (row[0],))
                self._connection.commit()
                return
        except Exception as e:
            raise RuntimeError(
                f'Failed to force unlock postgres lock {self.lock_id}: {e}'
            ) from e
        finally:
            self._close_connection()

    def _close_connection(self, invalidate: bool = False) -> None:
        """Close the postgres connection.

        Errors while closing or invalidating are logged at debug level and
        otherwise ignored; the stored connection is always cleared.

        Args:
            invalidate: If True, invalidate connection instead of closing it.
                Use this when the connection might be broken (e.g., after
                pg_terminate_backend) to prevent SQLAlchemy from trying to
                reset it (which would result in an error being logged).
        """
        if self._connection:
            try:
                if invalidate:
                    self._connection.invalidate()
                else:
                    self._connection.close()
            except Exception as e:  # pylint: disable=broad-except
                if invalidate:
                    logger.debug(
                        f'Failed to invalidate postgres connection: {e}')
                else:
                    logger.debug(f'Failed to close postgres connection: {e}')
            self._connection = None

    def is_locked(self) -> bool:
        """Check if the postgres advisory lock is acquired.

        Reports this instance's local flag; the database is not queried.
        """
        return self._acquired
354
+
355
+
356
def get_lock(lock_id: str,
             timeout: Optional[float] = None,
             lock_type: Optional[str] = None,
             poll_interval: Optional[float] = None,
             shared_lock: bool = False) -> DistributedLock:
    """Build the lock object for ``lock_id``.

    Args:
        lock_id: Unique identifier for the lock.
        timeout: Maximum time in seconds to wait for lock acquisition,
            None means wait indefinitely.
        lock_type: Type of lock to create ('filelock' or 'postgres').
            If None, auto-detect based on database configuration.
        poll_interval: Interval in seconds to poll for lock acquisition.
            If None, the chosen lock class uses its own default.
        shared_lock: Whether to use shared lock or exclusive lock (default).
            NOTE: Only applicable for PostgresLock.

    Returns:
        DistributedLock instance.

    Raises:
        ValueError: If the lock type is neither 'postgres' nor 'filelock'.
    """
    resolved_type = lock_type if lock_type is not None else _detect_lock_type()

    # Forward poll_interval only when the caller supplied one, so that each
    # lock class keeps its own default otherwise.
    interval_kwargs = {}
    if poll_interval is not None:
        interval_kwargs['poll_interval'] = poll_interval

    if resolved_type == 'postgres':
        return PostgresLock(lock_id,
                            timeout,
                            shared_lock=shared_lock,
                            **interval_kwargs)

    if resolved_type == 'filelock':
        # shared_lock is intentionally not used here.
        # The filelock library we use does not support shared locks.
        # It explicitly uses fcntl.LOCK_EX on Unix systems,
        # whereas fcntl.LOCK_SH is needed for shared locks.

        # This should be fine as it should not introduce correctness issues,
        # just that concurrency is reduced and so is performance, because
        # read-only operations can't run at the same time, each of them need
        # to wait to exclusively hold the lock.

        # But given that we recommend users to use Postgres in production,
        # the impact of this should be limited to local API server mostly.
        return FileLock(lock_id, timeout, **interval_kwargs)

    raise ValueError(f'Unknown lock type: {resolved_type}')
404
+
405
+
406
def _detect_lock_type() -> str:
    """Auto-detect the appropriate lock type based on configuration.

    Returns:
        'postgres' when the configured database engine uses the PostgreSQL
        dialect, otherwise 'filelock'. Any error while inspecting the
        database also results in 'filelock'.
    """
    try:
        engine = global_user_state.initialize_and_get_db()
        if engine.dialect.name == db_utils.SQLAlchemyDialect.POSTGRESQL.value:
            return 'postgres'
    except Exception as e:  # pylint: disable=broad-except
        # Fall back to filelock if database detection fails, but leave a
        # trace: silently choosing a local file lock would make a
        # misconfigured database hard to diagnose.
        logger.debug('Failed to detect lock type from database, '
                     f'falling back to filelock: {e}')

    return 'filelock'