skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,94 @@
1
+ """Lock events."""
2
+
3
+ import functools
4
+ import os
5
+ from typing import Optional, Union
6
+
7
+ import filelock
8
+
9
+ from sky.utils import locks
10
+ from sky.utils import timeline
11
+
12
+
13
class DistributedLockEvent:
    """Distributed lock that also records timeline events.

    Wraps the lock object returned by ``locks.get_lock`` and surrounds its
    use with ``timeline.Event`` instances: one around each acquire call and
    one spanning the period the lock is held. Instances work as a context
    manager and as a function decorator.
    """

    def __init__(self, lock_id: str, timeout: Optional[float] = None):
        """Create the underlying lock and the event tracking its hold time.

        Args:
            lock_id: Identifier passed to ``locks.get_lock`` and embedded in
                the timeline event names.
            timeout: Forwarded unchanged to ``locks.get_lock``.
        """
        self._lock_id = lock_id
        self._lock = locks.get_lock(lock_id, timeout)
        hold_event_name = f'[DistributedLock.hold]:{lock_id}'
        self._hold_lock_event = timeline.Event(hold_event_name)

    def acquire(self):
        """Acquire the lock inside an acquire event; start the hold event."""
        held_before = self._lock.is_locked
        acquire_event = timeline.Event(
            f'[DistributedLock.acquire]:{self._lock_id}')
        with acquire_event:
            self._lock.acquire()
        if held_before:
            # Already held before this call: the hold event is running.
            return
        if self._lock.is_locked:
            # Start holding the lock after the initial acquisition.
            self._hold_lock_event.begin()

    def release(self):
        """Release the lock; end the hold event once it is no longer held."""
        held_before = self._lock.is_locked
        self._lock.release()
        if not held_before:
            return
        if not self._lock.is_locked:
            # Stop holding the lock after the final release.
            self._hold_lock_event.end()

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.release()

    def __call__(self, f):
        """Decorate ``f`` so that every call runs while holding this lock."""

        @functools.wraps(f)
        def locked_call(*args, **kwargs):
            with self:
                result = f(*args, **kwargs)
            return result

        return locked_call
52
+
53
+
54
class FileLockEvent:
    """File lock that also records timeline events.

    Wraps a ``filelock.FileLock`` and surrounds its use with
    ``timeline.Event`` instances: one around each acquire call and one
    spanning the period the lock is held. Instances work as a context
    manager and as a function decorator.
    """

    def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
        """Ensure the lock file's directory exists and build the lock.

        Args:
            lockfile: Path of the lock file.
            timeout: Forwarded unchanged to ``filelock.FileLock``.
        """
        self._lockfile = lockfile
        # Create the parent directory up front so the lock file can be made.
        parent_dir = os.path.dirname(os.path.abspath(self._lockfile))
        os.makedirs(parent_dir, exist_ok=True)
        self._lock = filelock.FileLock(self._lockfile, timeout)
        hold_event_name = f'[FileLock.hold]:{self._lockfile}'
        self._hold_lock_event = timeline.Event(hold_event_name)

    def acquire(self):
        """Acquire the lock inside an acquire event; start the hold event."""
        held_before = self._lock.is_locked
        acquire_event = timeline.Event(f'[FileLock.acquire]:{self._lockfile}')
        with acquire_event:
            self._lock.acquire()
        if held_before:
            # Already held before this call: the hold event is running.
            return
        if self._lock.is_locked:
            # Start holding the lock after the initial acquisition.
            self._hold_lock_event.begin()

    def release(self):
        """Release the lock; end the hold event once it is no longer held."""
        held_before = self._lock.is_locked
        self._lock.release()
        if not held_before:
            return
        if not self._lock.is_locked:
            # Stop holding the lock after the final release.
            self._hold_lock_event.end()

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.release()

    def __call__(self, f):
        """Decorate ``f`` so that every call runs while holding this lock."""

        # Makes instances of this class usable as a decorator.
        @functools.wraps(f)
        def locked_call(*args, **kwargs):
            with self:
                result = f(*args, **kwargs)
            return result

        return locked_call
sky/utils/locks.py ADDED
@@ -0,0 +1,368 @@
1
+ """Lock for SkyPilot.
2
+
3
+ This module provides an abstraction for locking that can use
4
+ either local file locks or database-based distributed locks.
5
+ """
6
+ import abc
7
+ import hashlib
8
+ import logging
9
+ import os
10
+ import time
11
+ from typing import Any, Optional
12
+
13
+ import filelock
14
+ import psycopg2
15
+ import sqlalchemy
16
+
17
+ from sky import global_user_state
18
+ from sky.skylet import constants
19
+ from sky.utils import common_utils
20
+ from sky.utils.db import db_utils
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class LockTimeout(RuntimeError):
    """Error signalling that a lock could not be acquired in time."""
28
+
29
+
30
class AcquireReturnProxy:
    """Handle returned by ``acquire()`` for use in a ``with`` statement.

    Entering the proxy does not acquire anything: it just hands back the
    wrapped lock. Leaving it releases that lock.
    """

    def __init__(self, lock: 'DistributedLock') -> None:
        """Remember the lock to release on exit."""
        self.lock = lock

    def __enter__(self) -> 'DistributedLock':
        """Return the wrapped lock."""
        return self.lock

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Release the wrapped lock; exceptions keep propagating."""
        self.lock.release()
45
+
46
+
47
class DistributedLock(abc.ABC):
    """Common interface for SkyPilot lock implementations.

    Subclasses provide ``acquire``, ``release``, ``force_unlock`` and
    ``is_locked``. The base class stores the shared settings and adds
    ``with``-statement support on top of ``acquire``/``release``.
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 0.1):
        """Store the lock settings.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
                If None, wait indefinitely.
            poll_interval: Interval in seconds to poll for lock acquisition.
        """
        self.lock_id = lock_id
        self.timeout, self.poll_interval = timeout, poll_interval

    @abc.abstractmethod
    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the lock.

        Args:
            blocking: If True, block until lock is acquired or timeout.
                If False, return immediately.

        Returns:
            AcquireReturnProxy that can be used as a context manager.

        Raises:
            LockTimeout: If lock cannot be acquired.
        """

    @abc.abstractmethod
    def release(self) -> None:
        """Release the lock."""

    @abc.abstractmethod
    def force_unlock(self) -> None:
        """Force unlock the lock if it is acquired."""

    @abc.abstractmethod
    def is_locked(self) -> bool:
        """Check if the lock is acquired."""

    def __enter__(self) -> 'DistributedLock':
        """Acquire the lock and return this object."""
        self.acquire()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Release the lock; exceptions keep propagating."""
        self.release()
109
+
110
+
111
class FileLock(DistributedLock):
    """A wrapper around filelock.FileLock.

    This implements a distributed lock that works across multiple processes
    when they share the same filesystem.
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 0.1):
        """Initialize the file lock.

        The lock file is ``.{lock_id}.lock`` under ``constants.SKY_LOCKS_DIR``,
        which is created if missing.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition. None is
                translated to -1 before being handed to ``filelock``.
            poll_interval: Interval in seconds to poll for lock acquisition.
        """
        super().__init__(lock_id, timeout, poll_interval)
        os.makedirs(constants.SKY_LOCKS_DIR, exist_ok=True)
        self.lock_path = os.path.join(constants.SKY_LOCKS_DIR,
                                      f'.{lock_id}.lock')
        if timeout is None:
            timeout = -1
        self._filelock: filelock.FileLock = filelock.FileLock(self.lock_path,
                                                              timeout=timeout)

    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the file lock.

        Args:
            blocking: Forwarded to ``filelock.FileLock.acquire``.

        Returns:
            AcquireReturnProxy that releases this lock when used in a
            ``with`` statement.

        Raises:
            LockTimeout: If the lock cannot be acquired.
        """
        try:
            acquired = self._filelock.acquire(blocking=blocking)
        except filelock.Timeout as e:
            raise LockTimeout(
                f'Failed to acquire file lock {self.lock_id}') from e
        # Defensive: treat a falsy return value as a failed acquisition too.
        if not acquired:
            raise LockTimeout(f'Failed to acquire file lock {self.lock_id}')
        return AcquireReturnProxy(self)

    def release(self) -> None:
        """Release the file lock."""
        self._filelock.release()

    def force_unlock(self) -> None:
        """Force unlock the file lock by removing the lock file."""
        common_utils.remove_file_if_exists(self.lock_path)

    def is_locked(self) -> bool:
        """Check if the file lock is held through this object."""
        # `filelock`'s `is_locked` is a property, not a method: calling it
        # would raise "TypeError: 'bool' object is not callable".
        return self._filelock.is_locked
159
+
160
+
161
class PostgresLock(DistributedLock):
    """PostgreSQL advisory lock implementation.

    Uses PostgreSQL advisory locks to implement distributed locking
    that works across multiple machines sharing the same database.
    The lock is taken with ``pg_try_advisory_lock`` on a dedicated
    connection that is kept for as long as the lock is held.
    Reference:
    https://www.postgresql.org/docs/current/explicit-locking.html
    #ADVISORY-LOCKS
    """

    def __init__(self,
                 lock_id: str,
                 timeout: Optional[float] = None,
                 poll_interval: float = 1):
        """Initialize the postgres lock.

        Args:
            lock_id: Unique identifier for the lock.
            timeout: Maximum time to wait for lock acquisition.
            poll_interval: Interval in seconds to poll for lock acquisition,
                default to 1 second to avoid storming the database.
        """
        super().__init__(lock_id, timeout, poll_interval)
        # Convert string lock_id to integer for postgres advisory locks
        self._lock_key = self._string_to_lock_key(lock_id)
        self._acquired = False
        self._connection: Optional[sqlalchemy.pool.PoolProxiedConnection] = None

    def _string_to_lock_key(self, s: str) -> int:
        """Convert string to a 64-bit integer for advisory lock key."""
        hash_digest = hashlib.sha256(s.encode('utf-8')).digest()
        # Take first 8 bytes and convert to int, ensure positive 64-bit
        return int.from_bytes(hash_digest[:8], 'big') & ((1 << 63) - 1)

    def _get_connection(self) -> sqlalchemy.pool.PoolProxiedConnection:
        """Get database connection.

        Raises:
            ValueError: If the configured database is not PostgreSQL.
        """
        engine = global_user_state.initialize_and_get_db()
        if engine.dialect.name != db_utils.SQLAlchemyDialect.POSTGRESQL.value:
            raise ValueError('PostgresLock requires PostgreSQL database. '
                             f'Current dialect: {engine.dialect.name}')
        # Borrow a dedicated connection from the pool.
        return engine.raw_connection()

    def acquire(self, blocking: bool = True) -> AcquireReturnProxy:
        """Acquire the postgres advisory lock.

        Polls ``pg_try_advisory_lock`` every ``poll_interval`` seconds until
        it succeeds. If this object already holds the lock, returns
        immediately.

        Args:
            blocking: If False, raise instead of waiting when the first
                attempt fails.

        Returns:
            AcquireReturnProxy that releases this lock when used in a
            ``with`` statement.

        Raises:
            LockTimeout: If the lock is not obtained immediately (non-blocking)
                or within ``timeout`` seconds.
        """
        if self._acquired:
            return AcquireReturnProxy(self)

        self._connection = self._get_connection()
        start_time = time.time()

        try:
            # Created inside the try block so that the borrowed connection is
            # handed back even when opening the cursor fails.
            cursor = self._connection.cursor()
            while True:
                cursor.execute('SELECT pg_try_advisory_lock(%s)',
                               (self._lock_key,))
                result = cursor.fetchone()[0]

                if result:
                    self._acquired = True
                    return AcquireReturnProxy(self)

                if not blocking:
                    raise LockTimeout(
                        f'Failed to immediately acquire postgres lock '
                        f'{self.lock_id}')

                if (self.timeout is not None and
                        time.time() - start_time > self.timeout):
                    raise LockTimeout(
                        f'Failed to acquire postgres lock {self.lock_id} '
                        f'within {self.timeout} seconds')

                time.sleep(self.poll_interval)

        except Exception:
            self._close_connection()
            raise

    def release(self) -> None:
        """Release the postgres advisory lock.

        No-op when this object does not hold the lock. After this call the
        object never reports the lock as held, whether or not the unlock
        statement succeeded, because the connection that owned the lock is
        given up either way.
        """
        if not self._acquired or not self._connection:
            return

        unlocked = False
        try:
            cursor = self._connection.cursor()
            cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
            self._connection.commit()
            unlocked = True
        except psycopg2.OperationalError as e:
            # Lost connection to the database, likely the lock is force unlocked
            # by other routines.
            logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
        finally:
            # The connection is dropped below, so this object can no longer
            # hold the lock. Leaving the flag set would make the next
            # acquire() return immediately without taking the lock.
            self._acquired = False
            # If the unlock did not complete, invalidate instead of closing:
            # the connection may be dead (SQLAlchemy would try to reset it),
            # or it may still hold the advisory lock and must not go back to
            # the pool.
            self._close_connection(invalidate=not unlocked)

    def force_unlock(self) -> None:
        """Force unlock the postgres advisory lock.

        Raises:
            RuntimeError: If any step of the force unlock fails.
        """
        try:
            # The lock is held by current routine, gracefully unlock it
            if self._acquired:
                self.release()
                return

            # The lock is held by another routine, force unlock it.
            if self._connection is None:
                self._connection = self._get_connection()
            cursor = self._connection.cursor()
            cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
            result = cursor.fetchone()[0]
            if result:
                # The lock was held by the current connection and the unlock
                # succeeded.
                self._connection.commit()
                self._acquired = False
                return
            cursor.execute(
                ('SELECT pid FROM pg_locks WHERE locktype = \'advisory\' '
                 'AND ((classid::bigint << 32) | objid::bigint) = %s'),
                (self._lock_key,))
            row = cursor.fetchone()
            if row:
                # The lock is still held by another routine, force unlock it
                # by killing the PG connection of that routine.
                cursor.execute('SELECT pg_terminate_backend(%s)', (row[0],))
                self._connection.commit()
                return
        except Exception as e:
            raise RuntimeError(
                f'Failed to force unlock postgres lock {self.lock_id}: {e}'
            ) from e
        finally:
            self._close_connection()

    def _close_connection(self, invalidate: bool = False) -> None:
        """Close the postgres connection.

        Errors raised while closing are logged at debug level and swallowed.

        Args:
            invalidate: If True, invalidate connection instead of closing it.
                Use this when the connection might be broken (e.g., after
                pg_terminate_backend) to prevent SQLAlchemy from trying to
                reset it (which would result in an error being logged).
        """
        if self._connection:
            try:
                if invalidate:
                    self._connection.invalidate()
                else:
                    self._connection.close()
            except Exception as e:  # pylint: disable=broad-except
                if invalidate:
                    logger.debug(
                        f'Failed to invalidate postgres connection: {e}')
                else:
                    logger.debug(f'Failed to close postgres connection: {e}')
            self._connection = None

    def is_locked(self) -> bool:
        """Check if the postgres advisory lock is acquired."""
        return self._acquired
325
+
326
+
327
def get_lock(lock_id: str,
             timeout: Optional[float] = None,
             lock_type: Optional[str] = None,
             poll_interval: Optional[float] = None) -> DistributedLock:
    """Build the lock object for ``lock_id``.

    Args:
        lock_id: Unique identifier for the lock.
        timeout: Maximum time in seconds to wait for lock acquisition,
            None means wait indefinitely.
        lock_type: Type of lock to create ('filelock' or 'postgres').
            If None, auto-detect based on database configuration.
        poll_interval: Interval in seconds to poll for lock acquisition.
            If None, the default of the chosen lock class is used.

    Returns:
        DistributedLock instance.

    Raises:
        ValueError: If ``lock_type`` is not a known lock type.
    """
    if lock_type is None:
        lock_type = _detect_lock_type()

    # Pick the implementation first, then share one construction path.
    if lock_type == 'postgres':
        lock_cls = PostgresLock
    elif lock_type == 'filelock':
        lock_cls = FileLock
    else:
        raise ValueError(f'Unknown lock type: {lock_type}')

    if poll_interval is None:
        # Omit the argument so the class-specific default applies.
        return lock_cls(lock_id, timeout)
    return lock_cls(lock_id, timeout, poll_interval)
356
+
357
+
358
def _detect_lock_type() -> str:
    """Auto-detect the appropriate lock type based on configuration.

    Returns:
        'postgres' when the database engine uses the PostgreSQL dialect,
        otherwise 'filelock'. Any error during detection also yields
        'filelock'.
    """
    try:
        engine = global_user_state.initialize_and_get_db()
        if engine.dialect.name == db_utils.SQLAlchemyDialect.POSTGRESQL.value:
            return 'postgres'
    except Exception as e:  # pylint: disable=broad-except
        # Fall back to filelock if database detection fails. Leave a trace,
        # since a file lock only coordinates processes that share the
        # filesystem.
        logger.debug('Failed to detect the database dialect, falling back to '
                     f'file lock: {e}')

    return 'filelock'