skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -6,22 +6,40 @@ Concepts:
6
6
  - Cluster handle: (non-user facing) an opaque backend handle for us to
7
7
  interact with a cluster.
8
8
  """
9
+ import asyncio
10
+ import enum
11
+ import functools
9
12
  import json
10
13
  import os
11
- import pathlib
12
14
  import pickle
13
- import sqlite3
15
+ import re
16
+ import threading
14
17
  import time
15
18
  import typing
16
19
  from typing import Any, Dict, List, Optional, Set, Tuple
17
20
  import uuid
18
21
 
22
+ import sqlalchemy
23
+ from sqlalchemy import exc as sqlalchemy_exc
24
+ from sqlalchemy import orm
25
+ from sqlalchemy.dialects import postgresql
26
+ from sqlalchemy.dialects import sqlite
27
+ from sqlalchemy.ext import asyncio as sql_async
28
+ from sqlalchemy.ext import declarative
29
+
19
30
  from sky import models
20
31
  from sky import sky_logging
32
+ from sky import skypilot_config
33
+ from sky.metrics import utils as metrics_lib
34
+ from sky.skylet import constants
35
+ from sky.utils import annotations
21
36
  from sky.utils import common_utils
22
- from sky.utils import db_utils
37
+ from sky.utils import context_utils
23
38
  from sky.utils import registry
24
39
  from sky.utils import status_lib
40
+ from sky.utils import yaml_utils
41
+ from sky.utils.db import db_utils
42
+ from sky.utils.db import migration_utils
25
43
 
26
44
  if typing.TYPE_CHECKING:
27
45
  from sky import backends
@@ -32,171 +50,593 @@ if typing.TYPE_CHECKING:
32
50
  logger = sky_logging.init_logger(__name__)
33
51
 
34
52
  _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
35
-
36
- _DB_PATH = os.path.expanduser('~/.sky/state.db')
37
- pathlib.Path(_DB_PATH).parents[0].mkdir(parents=True, exist_ok=True)
38
-
39
-
40
- def create_table(cursor, conn):
53
+ _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
54
+
55
+ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
56
+ _SQLALCHEMY_ENGINE_ASYNC: Optional[sql_async.AsyncEngine] = None
57
+ _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
58
+
59
+ DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
60
+ DEBUG_CLUSTER_EVENT_RETENTION_HOURS = 30 * 24.0
61
+ MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600
62
+
63
+ _UNIQUE_CONSTRAINT_FAILED_ERROR_MSGS = [
64
+ # sqlite
65
+ 'UNIQUE constraint failed',
66
+ # postgres
67
+ 'duplicate key value violates unique constraint',
68
+ ]
69
+
70
+ Base = declarative.declarative_base()
71
+
72
+ config_table = sqlalchemy.Table(
73
+ 'config',
74
+ Base.metadata,
75
+ sqlalchemy.Column('key', sqlalchemy.Text, primary_key=True),
76
+ sqlalchemy.Column('value', sqlalchemy.Text),
77
+ )
78
+
79
+ user_table = sqlalchemy.Table(
80
+ 'users',
81
+ Base.metadata,
82
+ sqlalchemy.Column('id', sqlalchemy.Text, primary_key=True),
83
+ sqlalchemy.Column('name', sqlalchemy.Text),
84
+ sqlalchemy.Column('password', sqlalchemy.Text),
85
+ sqlalchemy.Column('created_at', sqlalchemy.Integer),
86
+ )
87
+
88
+ cluster_table = sqlalchemy.Table(
89
+ 'clusters',
90
+ Base.metadata,
91
+ sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
92
+ sqlalchemy.Column('launched_at', sqlalchemy.Integer),
93
+ sqlalchemy.Column('handle', sqlalchemy.LargeBinary),
94
+ sqlalchemy.Column('last_use', sqlalchemy.Text),
95
+ sqlalchemy.Column('status', sqlalchemy.Text),
96
+ sqlalchemy.Column('autostop', sqlalchemy.Integer, server_default='-1'),
97
+ sqlalchemy.Column('to_down', sqlalchemy.Integer, server_default='0'),
98
+ sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
99
+ sqlalchemy.Column('owner', sqlalchemy.Text, server_default=None),
100
+ sqlalchemy.Column('cluster_hash', sqlalchemy.Text, server_default=None),
101
+ sqlalchemy.Column('storage_mounts_metadata',
102
+ sqlalchemy.LargeBinary,
103
+ server_default=None),
104
+ sqlalchemy.Column('cluster_ever_up', sqlalchemy.Integer,
105
+ server_default='0'),
106
+ sqlalchemy.Column('status_updated_at',
107
+ sqlalchemy.Integer,
108
+ server_default=None),
109
+ sqlalchemy.Column('config_hash', sqlalchemy.Text, server_default=None),
110
+ sqlalchemy.Column('user_hash', sqlalchemy.Text, server_default=None),
111
+ sqlalchemy.Column('workspace',
112
+ sqlalchemy.Text,
113
+ server_default=constants.SKYPILOT_DEFAULT_WORKSPACE),
114
+ sqlalchemy.Column('last_creation_yaml',
115
+ sqlalchemy.Text,
116
+ server_default=None),
117
+ sqlalchemy.Column('last_creation_command',
118
+ sqlalchemy.Text,
119
+ server_default=None),
120
+ sqlalchemy.Column('is_managed', sqlalchemy.Integer, server_default='0'),
121
+ sqlalchemy.Column('provision_log_path',
122
+ sqlalchemy.Text,
123
+ server_default=None),
124
+ sqlalchemy.Column('skylet_ssh_tunnel_metadata',
125
+ sqlalchemy.LargeBinary,
126
+ server_default=None),
127
+ )
128
+
129
+ storage_table = sqlalchemy.Table(
130
+ 'storage',
131
+ Base.metadata,
132
+ sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
133
+ sqlalchemy.Column('launched_at', sqlalchemy.Integer),
134
+ sqlalchemy.Column('handle', sqlalchemy.LargeBinary),
135
+ sqlalchemy.Column('last_use', sqlalchemy.Text),
136
+ sqlalchemy.Column('status', sqlalchemy.Text),
137
+ )
138
+
139
+ volume_table = sqlalchemy.Table(
140
+ 'volumes',
141
+ Base.metadata,
142
+ sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
143
+ sqlalchemy.Column('launched_at', sqlalchemy.Integer),
144
+ sqlalchemy.Column('handle', sqlalchemy.LargeBinary),
145
+ sqlalchemy.Column('user_hash', sqlalchemy.Text, server_default=None),
146
+ sqlalchemy.Column('workspace',
147
+ sqlalchemy.Text,
148
+ server_default=constants.SKYPILOT_DEFAULT_WORKSPACE),
149
+ sqlalchemy.Column('last_attached_at',
150
+ sqlalchemy.Integer,
151
+ server_default=None),
152
+ sqlalchemy.Column('last_use', sqlalchemy.Text),
153
+ sqlalchemy.Column('status', sqlalchemy.Text),
154
+ )
155
+
156
+ # Table for Cluster History
157
+ # usage_intervals: List[Tuple[int, int]]
158
+ # Specifies start and end timestamps of cluster.
159
+ # When the last end time is None, the cluster is still UP.
160
+ # Example: [(start1, end1), (start2, end2), (start3, None)]
161
+
162
+ # requested_resources: Set[resource_lib.Resource]
163
+ # Requested resources fetched from task that user specifies.
164
+
165
+ # launched_resources: Optional[resources_lib.Resources]
166
+ # Actual launched resources fetched from handle for cluster.
167
+
168
+ # num_nodes: Optional[int] number of nodes launched.
169
+ cluster_history_table = sqlalchemy.Table(
170
+ 'cluster_history',
171
+ Base.metadata,
172
+ sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
173
+ sqlalchemy.Column('name', sqlalchemy.Text),
174
+ sqlalchemy.Column('num_nodes', sqlalchemy.Integer),
175
+ sqlalchemy.Column('requested_resources', sqlalchemy.LargeBinary),
176
+ sqlalchemy.Column('launched_resources', sqlalchemy.LargeBinary),
177
+ sqlalchemy.Column('usage_intervals', sqlalchemy.LargeBinary),
178
+ sqlalchemy.Column('user_hash', sqlalchemy.Text),
179
+ sqlalchemy.Column('last_creation_yaml',
180
+ sqlalchemy.Text,
181
+ server_default=None),
182
+ sqlalchemy.Column('last_creation_command',
183
+ sqlalchemy.Text,
184
+ server_default=None),
185
+ sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
186
+ sqlalchemy.Column('provision_log_path',
187
+ sqlalchemy.Text,
188
+ server_default=None),
189
+ sqlalchemy.Column('last_activity_time',
190
+ sqlalchemy.Integer,
191
+ server_default=None,
192
+ index=True),
193
+ sqlalchemy.Column('launched_at',
194
+ sqlalchemy.Integer,
195
+ server_default=None,
196
+ index=True),
197
+ )
198
+
199
+
200
+ class ClusterEventType(enum.Enum):
201
+ """Type of cluster event."""
202
+ DEBUG = 'DEBUG'
203
+ """Used to denote events that are not related to cluster status."""
204
+
205
+ STATUS_CHANGE = 'STATUS_CHANGE'
206
+ """Used to denote events that modify cluster status."""
207
+
208
+
209
+ # Table for cluster status change events.
210
+ # starting_status: Status of the cluster at the start of the event.
211
+ # ending_status: Status of the cluster at the end of the event.
212
+ # reason: Reason for the transition.
213
+ # transitioned_at: Timestamp of the transition.
214
+ cluster_event_table = sqlalchemy.Table(
215
+ 'cluster_events',
216
+ Base.metadata,
217
+ sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
218
+ sqlalchemy.Column('name', sqlalchemy.Text),
219
+ sqlalchemy.Column('starting_status', sqlalchemy.Text),
220
+ sqlalchemy.Column('ending_status', sqlalchemy.Text),
221
+ sqlalchemy.Column('reason', sqlalchemy.Text, primary_key=True),
222
+ sqlalchemy.Column('transitioned_at', sqlalchemy.Integer, primary_key=True),
223
+ sqlalchemy.Column('type', sqlalchemy.Text),
224
+ sqlalchemy.Column('request_id', sqlalchemy.Text, server_default=None),
225
+ )
226
+
227
+ ssh_key_table = sqlalchemy.Table(
228
+ 'ssh_key',
229
+ Base.metadata,
230
+ sqlalchemy.Column('user_hash', sqlalchemy.Text, primary_key=True),
231
+ sqlalchemy.Column('ssh_public_key', sqlalchemy.Text),
232
+ sqlalchemy.Column('ssh_private_key', sqlalchemy.Text),
233
+ )
234
+
235
+ service_account_token_table = sqlalchemy.Table(
236
+ 'service_account_tokens',
237
+ Base.metadata,
238
+ sqlalchemy.Column('token_id', sqlalchemy.Text, primary_key=True),
239
+ sqlalchemy.Column('token_name', sqlalchemy.Text),
240
+ sqlalchemy.Column('token_hash', sqlalchemy.Text),
241
+ sqlalchemy.Column('created_at', sqlalchemy.Integer),
242
+ sqlalchemy.Column('last_used_at', sqlalchemy.Integer, server_default=None),
243
+ sqlalchemy.Column('expires_at', sqlalchemy.Integer, server_default=None),
244
+ sqlalchemy.Column('creator_user_hash',
245
+ sqlalchemy.Text), # Who created this token
246
+ sqlalchemy.Column('service_account_user_id',
247
+ sqlalchemy.Text), # Service account's own user ID
248
+ )
249
+
250
+ cluster_yaml_table = sqlalchemy.Table(
251
+ 'cluster_yaml',
252
+ Base.metadata,
253
+ sqlalchemy.Column('cluster_name', sqlalchemy.Text, primary_key=True),
254
+ sqlalchemy.Column('yaml', sqlalchemy.Text),
255
+ )
256
+
257
+ system_config_table = sqlalchemy.Table(
258
+ 'system_config',
259
+ Base.metadata,
260
+ sqlalchemy.Column('config_key', sqlalchemy.Text, primary_key=True),
261
+ sqlalchemy.Column('config_value', sqlalchemy.Text),
262
+ sqlalchemy.Column('created_at', sqlalchemy.Integer),
263
+ sqlalchemy.Column('updated_at', sqlalchemy.Integer),
264
+ )
265
+
266
+
267
+ def _glob_to_similar(glob_pattern):
268
+ """Converts a glob pattern to a PostgreSQL LIKE pattern."""
269
+
270
+ # Escape special LIKE characters that are not special in glob
271
+ glob_pattern = glob_pattern.replace('%', '\\%').replace('_', '\\_')
272
+
273
+ # Convert glob wildcards to LIKE wildcards
274
+ like_pattern = glob_pattern.replace('*', '%').replace('?', '_')
275
+
276
+ # Handle character classes, including negation
277
+ def replace_char_class(match):
278
+ group = match.group(0)
279
+ if group.startswith('[!'):
280
+ return '[^' + group[2:-1] + ']'
281
+ return group
282
+
283
+ like_pattern = re.sub(r'\[(!)?.*?\]', replace_char_class, like_pattern)
284
+ return like_pattern
285
+
286
+
287
+ def create_table(engine: sqlalchemy.engine.Engine):
41
288
  # Enable WAL mode to avoid locking issues.
42
289
  # See: issue #1441 and PR #1509
43
290
  # https://github.com/microsoft/WSL/issues/2395
44
291
  # TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
45
292
  # This may cause the database locked problem from WSL issue #1441.
46
- if not common_utils.is_wsl():
293
+ if (engine.dialect.name == db_utils.SQLAlchemyDialect.SQLITE.value and
294
+ not common_utils.is_wsl()):
47
295
  try:
48
- cursor.execute('PRAGMA journal_mode=WAL')
49
- except sqlite3.OperationalError as e:
296
+ with orm.Session(engine) as session:
297
+ session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
298
+ session.commit()
299
+ except sqlalchemy_exc.OperationalError as e:
50
300
  if 'database is locked' not in str(e):
51
301
  raise
52
302
  # If the database is locked, it is OK to continue, as the WAL mode
53
303
  # is not critical and is likely to be enabled by other processes.
54
304
 
55
- # Table for Clusters
56
- cursor.execute("""\
57
- CREATE TABLE IF NOT EXISTS clusters (
58
- name TEXT PRIMARY KEY,
59
- launched_at INTEGER,
60
- handle BLOB,
61
- last_use TEXT,
62
- status TEXT,
63
- autostop INTEGER DEFAULT -1,
64
- metadata TEXT DEFAULT '{}',
65
- to_down INTEGER DEFAULT 0,
66
- owner TEXT DEFAULT null,
67
- cluster_hash TEXT DEFAULT null,
68
- storage_mounts_metadata BLOB DEFAULT null,
69
- cluster_ever_up INTEGER DEFAULT 0,
70
- status_updated_at INTEGER DEFAULT null,
71
- config_hash TEXT DEFAULT null,
72
- user_hash TEXT DEFAULT null)""")
73
-
74
- # Table for Cluster History
75
- # usage_intervals: List[Tuple[int, int]]
76
- # Specifies start and end timestamps of cluster.
77
- # When the last end time is None, the cluster is still UP.
78
- # Example: [(start1, end1), (start2, end2), (start3, None)]
79
-
80
- # requested_resources: Set[resource_lib.Resource]
81
- # Requested resources fetched from task that user specifies.
82
-
83
- # launched_resources: Optional[resources_lib.Resources]
84
- # Actual launched resources fetched from handle for cluster.
85
-
86
- # num_nodes: Optional[int] number of nodes launched.
87
-
88
- cursor.execute("""\
89
- CREATE TABLE IF NOT EXISTS cluster_history (
90
- cluster_hash TEXT PRIMARY KEY,
91
- name TEXT,
92
- num_nodes int,
93
- requested_resources BLOB,
94
- launched_resources BLOB,
95
- usage_intervals BLOB,
96
- user_hash TEXT)""")
97
- # Table for configs (e.g. enabled clouds)
98
- cursor.execute("""\
99
- CREATE TABLE IF NOT EXISTS config (
100
- key TEXT PRIMARY KEY, value TEXT)""")
101
- # Table for Storage
102
- cursor.execute("""\
103
- CREATE TABLE IF NOT EXISTS storage (
104
- name TEXT PRIMARY KEY,
105
- launched_at INTEGER,
106
- handle BLOB,
107
- last_use TEXT,
108
- status TEXT)""")
109
- # Table for User
110
- cursor.execute("""\
111
- CREATE TABLE IF NOT EXISTS users (
112
- id TEXT PRIMARY KEY,
113
- name TEXT)""")
114
- # For backward compatibility.
115
- # TODO(zhwu): Remove this function after all users have migrated to
116
- # the latest version of SkyPilot.
117
- # Add autostop column to clusters table
118
- db_utils.add_column_to_table(cursor, conn, 'clusters', 'autostop',
119
- 'INTEGER DEFAULT -1')
120
-
121
- db_utils.add_column_to_table(cursor, conn, 'clusters', 'metadata',
122
- 'TEXT DEFAULT \'{}\'')
123
-
124
- db_utils.add_column_to_table(cursor, conn, 'clusters', 'to_down',
125
- 'INTEGER DEFAULT 0')
126
-
127
- # The cloud identity that created the cluster.
128
- db_utils.add_column_to_table(cursor, conn, 'clusters', 'owner', 'TEXT')
129
-
130
- db_utils.add_column_to_table(cursor, conn, 'clusters', 'cluster_hash',
131
- 'TEXT DEFAULT null')
132
-
133
- db_utils.add_column_to_table(cursor, conn, 'clusters',
134
- 'storage_mounts_metadata', 'BLOB DEFAULT null')
135
- db_utils.add_column_to_table(
136
- cursor,
137
- conn,
138
- 'clusters',
139
- 'cluster_ever_up',
140
- 'INTEGER DEFAULT 0',
141
- # Set the value to 1 so that all the existing clusters before #2977
142
- # are considered as ever up, i.e:
143
- # existing cluster's default (null) -> 1;
144
- # new cluster's default -> 0;
145
- # This is conservative for the existing clusters: even if some INIT
146
- # clusters were never really UP, setting it to 1 means they won't be
147
- # auto-deleted during any failover.
148
- value_to_replace_existing_entries=1)
149
- db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
150
- 'INTEGER DEFAULT null')
151
- db_utils.add_column_to_table(
152
- cursor,
153
- conn,
154
- 'clusters',
155
- 'user_hash',
156
- 'TEXT DEFAULT null',
157
- value_to_replace_existing_entries=common_utils.get_user_hash())
158
- db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
159
- 'TEXT DEFAULT null')
160
-
161
- db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
162
- 'TEXT DEFAULT null')
163
-
164
- db_utils.add_column_to_table(cursor, conn, 'cluster_history', 'user_hash',
165
- 'TEXT DEFAULT null')
166
- conn.commit()
167
-
168
-
169
- _DB = db_utils.SQLiteConn(_DB_PATH, create_table)
170
-
171
-
172
- def add_or_update_user(user: models.User):
173
- """Store the mapping from user hash to user name for display purposes."""
174
- if user.name is None:
175
- return
176
- _DB.cursor.execute('INSERT OR REPLACE INTO users (id, name) VALUES (?, ?)',
177
- (user.id, user.name))
178
- _DB.conn.commit()
305
+ migration_utils.safe_alembic_upgrade(
306
+ engine, migration_utils.GLOBAL_USER_STATE_DB_NAME,
307
+ migration_utils.GLOBAL_USER_STATE_VERSION)
179
308
 
180
309
 
181
- def get_user(user_id: str) -> models.User:
182
- row = _DB.cursor.execute('SELECT id, name FROM users WHERE id=?',
183
- (user_id,)).fetchone()
184
- if row is None:
185
- return models.User(id=user_id)
186
- return models.User(id=row[0], name=row[1])
310
+ def initialize_and_get_db_async() -> sql_async.AsyncEngine:
311
+ global _SQLALCHEMY_ENGINE_ASYNC
312
+ if _SQLALCHEMY_ENGINE_ASYNC is not None:
313
+ return _SQLALCHEMY_ENGINE_ASYNC
314
+ with _SQLALCHEMY_ENGINE_LOCK:
315
+ if _SQLALCHEMY_ENGINE_ASYNC is not None:
316
+ return _SQLALCHEMY_ENGINE_ASYNC
187
317
 
318
+ _SQLALCHEMY_ENGINE_ASYNC = db_utils.get_engine('state',
319
+ async_engine=True)
320
+ initialize_and_get_db()
321
+ return _SQLALCHEMY_ENGINE_ASYNC
188
322
 
189
- def get_all_users() -> List[models.User]:
190
- rows = _DB.cursor.execute('SELECT id, name FROM users').fetchall()
191
- return [models.User(id=row[0], name=row[1]) for row in rows]
192
323
 
324
+ # We wrap the sqlalchemy engine initialization in a thread
325
+ # lock to ensure that multiple threads do not initialize the
326
+ # engine which could result in a rare race condition where
327
+ # a session has already been created with _SQLALCHEMY_ENGINE = e1,
328
+ # and then another thread overwrites _SQLALCHEMY_ENGINE = e2
329
+ # which could result in e1 being garbage collected unexpectedly.
330
+ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
331
+ global _SQLALCHEMY_ENGINE
332
+
333
+ if _SQLALCHEMY_ENGINE is not None:
334
+ return _SQLALCHEMY_ENGINE
335
+ with _SQLALCHEMY_ENGINE_LOCK:
336
+ if _SQLALCHEMY_ENGINE is not None:
337
+ return _SQLALCHEMY_ENGINE
338
+ # get an engine to the db
339
+ engine = db_utils.get_engine('state')
340
+
341
+ # run migrations if needed
342
+ create_table(engine)
343
+
344
+ # return engine
345
+ _SQLALCHEMY_ENGINE = engine
346
+ # Cache the result of _sqlite_supports_returning()
347
+ # ahead of time, as it won't change throughout
348
+ # the lifetime of the engine.
349
+ _sqlite_supports_returning()
350
+ return _SQLALCHEMY_ENGINE
351
+
352
+
353
+ def _init_db_async(func):
354
+ """Initialize the async database."""
355
+
356
+ @functools.wraps(func)
357
+ async def wrapper(*args, **kwargs):
358
+ if _SQLALCHEMY_ENGINE_ASYNC is None:
359
+ # this may happen multiple times since there is no locking
360
+ # here but thats fine, this is just a short circuit for the
361
+ # common case.
362
+ await context_utils.to_thread(initialize_and_get_db_async)
193
363
 
364
+ return await func(*args, **kwargs)
365
+
366
+ return wrapper
367
+
368
+
369
+ def _init_db(func):
370
+ """Initialize the database."""
371
+
372
+ @functools.wraps(func)
373
+ def wrapper(*args, **kwargs):
374
+ initialize_and_get_db()
375
+ return func(*args, **kwargs)
376
+
377
+ return wrapper
378
+
379
+
380
+ @annotations.lru_cache(scope='global', maxsize=1)
381
+ def _sqlite_supports_returning() -> bool:
382
+ """Check if SQLite (3.35.0+) and SQLAlchemy (2.0+) support RETURNING.
383
+
384
+ See https://sqlite.org/lang_returning.html and
385
+ https://docs.sqlalchemy.org/en/20/dialects/sqlite.html#insert-update-delete-returning # pylint: disable=line-too-long
386
+ """
387
+ sqlalchemy_version_parts = sqlalchemy.__version__.split('.')
388
+ assert len(sqlalchemy_version_parts) >= 1, \
389
+ f'Invalid SQLAlchemy version: {sqlalchemy.__version__}'
390
+ sqlalchemy_major = int(sqlalchemy_version_parts[0])
391
+ if sqlalchemy_major < 2:
392
+ return False
393
+
394
+ assert _SQLALCHEMY_ENGINE is not None
395
+ if (_SQLALCHEMY_ENGINE.dialect.name !=
396
+ db_utils.SQLAlchemyDialect.SQLITE.value):
397
+ return False
398
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
399
+ result = session.execute(sqlalchemy.text('SELECT sqlite_version()'))
400
+ version_str = result.scalar()
401
+ version_parts = version_str.split('.')
402
+ assert len(version_parts) >= 2, \
403
+ f'Invalid version string: {version_str}'
404
+ major, minor = int(version_parts[0]), int(version_parts[1])
405
+ return (major > 3) or (major == 3 and minor >= 35)
406
+
407
+
408
+ @_init_db
409
+ @metrics_lib.time_me
410
+ def add_or_update_user(
411
+ user: models.User,
412
+ allow_duplicate_name: bool = True,
413
+ return_user: bool = False
414
+ ) -> typing.Union[bool, typing.Tuple[bool, models.User]]:
415
+ """Store the mapping from user hash to user name for display purposes.
416
+
417
+ Returns:
418
+ If return_user=False: bool (whether the user is newly added)
419
+ If return_user=True: Tuple[bool, models.User]
420
+ """
421
+ assert _SQLALCHEMY_ENGINE is not None
422
+
423
+ if user.name is None:
424
+ return (False, user) if return_user else False
425
+
426
+ # Set created_at if not already set
427
+ created_at = user.created_at
428
+ if created_at is None:
429
+ created_at = int(time.time())
430
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
431
+ # Check for duplicate names if not allowed (within the same transaction)
432
+ if not allow_duplicate_name:
433
+ existing_user = session.query(user_table).filter(
434
+ user_table.c.name == user.name).first()
435
+ if existing_user is not None:
436
+ return (False, user) if return_user else False
437
+
438
+ if (_SQLALCHEMY_ENGINE.dialect.name ==
439
+ db_utils.SQLAlchemyDialect.SQLITE.value):
440
+ # For SQLite, use INSERT OR IGNORE followed by UPDATE to detect new
441
+ # vs existing
442
+ insert_func = sqlite.insert
443
+
444
+ # First try INSERT OR IGNORE - this won't fail if user exists
445
+ insert_stmnt = insert_func(user_table).prefix_with(
446
+ 'OR IGNORE').values(id=user.id,
447
+ name=user.name,
448
+ password=user.password,
449
+ created_at=created_at)
450
+ use_returning = return_user and _sqlite_supports_returning()
451
+ if use_returning:
452
+ insert_stmnt = insert_stmnt.returning(
453
+ user_table.c.id,
454
+ user_table.c.name,
455
+ user_table.c.password,
456
+ user_table.c.created_at,
457
+ )
458
+ result = session.execute(insert_stmnt)
459
+
460
+ row = None
461
+ if use_returning:
462
+ # With RETURNING, check if we got a row back.
463
+ row = result.fetchone()
464
+ was_inserted = row is not None
465
+ else:
466
+ # Without RETURNING, use rowcount.
467
+ was_inserted = result.rowcount > 0
468
+
469
+ if not was_inserted:
470
+ # User existed, so update it (but don't update created_at)
471
+ update_values = {user_table.c.name: user.name}
472
+ if user.password:
473
+ update_values[user_table.c.password] = user.password
474
+
475
+ update_stmnt = sqlalchemy.update(user_table).where(
476
+ user_table.c.id == user.id).values(update_values)
477
+ if use_returning:
478
+ update_stmnt = update_stmnt.returning(
479
+ user_table.c.id, user_table.c.name,
480
+ user_table.c.password, user_table.c.created_at)
481
+
482
+ result = session.execute(update_stmnt)
483
+ if use_returning:
484
+ row = result.fetchone()
485
+
486
+ session.commit()
487
+
488
+ if return_user:
489
+ if row is None:
490
+ # row=None means the sqlite used has no RETURNING support,
491
+ # so we need to do a separate query
492
+ row = session.query(user_table).filter_by(
493
+ id=user.id).first()
494
+ updated_user = models.User(id=row.id,
495
+ name=row.name,
496
+ password=row.password,
497
+ created_at=row.created_at)
498
+ return was_inserted, updated_user
499
+ else:
500
+ return was_inserted
501
+
502
+ elif (_SQLALCHEMY_ENGINE.dialect.name ==
503
+ db_utils.SQLAlchemyDialect.POSTGRESQL.value):
504
+ # For PostgreSQL, use INSERT ... ON CONFLICT with RETURNING to
505
+ # detect insert vs update
506
+ insert_func = postgresql.insert
507
+
508
+ insert_stmnt = insert_func(user_table).values(
509
+ id=user.id,
510
+ name=user.name,
511
+ password=user.password,
512
+ created_at=created_at)
513
+
514
+ # Use a sentinel in the RETURNING clause to detect insert vs update
515
+ if user.password:
516
+ set_ = {
517
+ user_table.c.name: user.name,
518
+ user_table.c.password: user.password
519
+ }
520
+ else:
521
+ set_ = {user_table.c.name: user.name}
522
+ upsert_stmnt = insert_stmnt.on_conflict_do_update(
523
+ index_elements=[user_table.c.id], set_=set_).returning(
524
+ user_table.c.id,
525
+ user_table.c.name,
526
+ user_table.c.password,
527
+ user_table.c.created_at,
528
+ # This will be True for INSERT, False for UPDATE
529
+ sqlalchemy.literal_column('(xmax = 0)').label('was_inserted'
530
+ ))
531
+
532
+ result = session.execute(upsert_stmnt)
533
+ row = result.fetchone()
534
+
535
+ was_inserted = bool(row.was_inserted) if row else False
536
+ session.commit()
537
+
538
+ if return_user:
539
+ updated_user = models.User(id=row.id,
540
+ name=row.name,
541
+ password=row.password,
542
+ created_at=row.created_at)
543
+ return was_inserted, updated_user
544
+ else:
545
+ return was_inserted
546
+ else:
547
+ raise ValueError('Unsupported database dialect')
548
+
549
+
550
+ @_init_db
551
+ @metrics_lib.time_me
552
+ def get_user(user_id: str) -> Optional[models.User]:
553
+ assert _SQLALCHEMY_ENGINE is not None
554
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
555
+ row = session.query(user_table).filter_by(id=user_id).first()
556
+ if row is None:
557
+ return None
558
+ return models.User(id=row.id,
559
+ name=row.name,
560
+ password=row.password,
561
+ created_at=row.created_at)
562
+
563
+
564
+ @_init_db
565
+ @metrics_lib.time_me
566
+ def get_users(user_ids: Set[str]) -> Dict[str, models.User]:
567
+ assert _SQLALCHEMY_ENGINE is not None
568
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
569
+ rows = session.query(user_table).filter(
570
+ user_table.c.id.in_(user_ids)).all()
571
+ return {
572
+ row.id: models.User(id=row.id,
573
+ name=row.name,
574
+ password=row.password,
575
+ created_at=row.created_at) for row in rows
576
+ }
577
+
578
+
579
+ @_init_db
580
+ @metrics_lib.time_me
581
+ def get_user_by_name(username: str) -> List[models.User]:
582
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
583
+ rows = session.query(user_table).filter_by(name=username).all()
584
+ if len(rows) == 0:
585
+ return []
586
+ return [
587
+ models.User(id=row.id,
588
+ name=row.name,
589
+ password=row.password,
590
+ created_at=row.created_at) for row in rows
591
+ ]
592
+
593
+
594
+ @_init_db
595
+ @metrics_lib.time_me
596
+ def get_user_by_name_match(username_match: str) -> List[models.User]:
597
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
598
+ rows = session.query(user_table).filter(
599
+ user_table.c.name.like(f'%{username_match}%')).all()
600
+ return [
601
+ models.User(id=row.id, name=row.name, created_at=row.created_at)
602
+ for row in rows
603
+ ]
604
+
605
+
606
+ @_init_db
607
+ @metrics_lib.time_me
608
+ def delete_user(user_id: str) -> None:
609
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
610
+ session.query(user_table).filter_by(id=user_id).delete()
611
+ session.commit()
612
+
613
+
614
+ @_init_db
615
+ @metrics_lib.time_me
616
+ def get_all_users() -> List[models.User]:
617
+ assert _SQLALCHEMY_ENGINE is not None
618
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
619
+ rows = session.query(user_table).all()
620
+ return [
621
+ models.User(id=row.id,
622
+ name=row.name,
623
+ password=row.password,
624
+ created_at=row.created_at) for row in rows
625
+ ]
626
+
627
+
+@_init_db
+@metrics_lib.time_me
 def add_or_update_cluster(cluster_name: str,
                           cluster_handle: 'backends.ResourceHandle',
                           requested_resources: Optional[Set[Any]],
                           ready: bool,
                           is_launch: bool = True,
-                          config_hash: Optional[str] = None):
+                          config_hash: Optional[str] = None,
+                          task_config: Optional[Dict[str, Any]] = None,
+                          is_managed: bool = False,
+                          provision_log_path: Optional[str] = None,
+                          existing_cluster_hash: Optional[str] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.
 
     Args:
@@ -207,7 +647,17 @@ def add_or_update_cluster(cluster_name: str,
            be marked as INIT, otherwise it will be marked as UP.
        is_launch: if the cluster is firstly launched. If True, the launched_at
            and last_use will be updated. Otherwise, use the old value.
+       config_hash: Configuration hash for the cluster.
+       task_config: The config of the task being launched.
+       is_managed: Whether the cluster is launched by the controller.
+       provision_log_path: Absolute path to provision.log, if available.
+       existing_cluster_hash: If specified, the cluster will be updated
+           only if the cluster_hash matches. If a cluster does not exist,
+           it will not be inserted and an error will be raised.
     """
+    assert _SQLALCHEMY_ENGINE is not None
+
     # FIXME: launched_at will be changed when `sky launch -c` is called.
     handle = pickle.dumps(cluster_handle)
     cluster_launched_at = int(time.time()) if is_launch else None
@@ -240,143 +690,362 @@ def add_or_update_cluster(cluster_name: str,
         cluster_launched_at = int(time.time())
     usage_intervals.append((cluster_launched_at, None))
 
-    user_hash = common_utils.get_user_hash()
-
-    _DB.cursor.execute(
-        'INSERT or REPLACE INTO clusters'
-        # All the fields need to exist here, even if they don't need
-        # to be changed, as the INSERT OR REPLACE statement will replace
-        # the field of the existing row with the default value if not
-        # specified.
-        '(name, launched_at, handle, last_use, status, '
-        'autostop, to_down, metadata, owner, cluster_hash, '
-        'storage_mounts_metadata, cluster_ever_up, status_updated_at, '
-        'config_hash, user_hash) '
-        'VALUES ('
-        # name
-        '?, '
-        # launched_at
-        'COALESCE('
-        '?, (SELECT launched_at FROM clusters WHERE name=?)), '
-        # handle
-        '?, '
-        # last_use
-        'COALESCE('
-        '?, (SELECT last_use FROM clusters WHERE name=?)), '
-        # status
-        '?, '
-        # autostop
-        # Keep the old autostop value if it exists, otherwise set it to
-        # default -1.
-        'COALESCE('
-        '(SELECT autostop FROM clusters WHERE name=? AND status!=?), -1), '
-        # Keep the old to_down value if it exists, otherwise set it to
-        # default 0.
-        'COALESCE('
-        '(SELECT to_down FROM clusters WHERE name=? AND status!=?), 0),'
-        # Keep the old metadata value if it exists, otherwise set it to
-        # default {}.
-        'COALESCE('
-        '(SELECT metadata FROM clusters WHERE name=?), \'{}\'),'
-        # Keep the old owner value if it exists, otherwise set it to
-        # default null.
-        'COALESCE('
-        '(SELECT owner FROM clusters WHERE name=?), null),'
-        # cluster_hash
-        '?,'
-        # storage_mounts_metadata
-        'COALESCE('
-        '(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null), '
-        # cluster_ever_up
-        '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?), '
-        # status_updated_at
-        '?,'
-        # config_hash
-        'COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?)),'
-        # user_hash: keep original user_hash if it exists
-        'COALESCE('
-        '(SELECT user_hash FROM clusters WHERE name=?), ?)'
-        ')',
-        (
-            # name
-            cluster_name,
-            # launched_at
-            cluster_launched_at,
-            cluster_name,
-            # handle
-            handle,
-            # last_use
-            last_use,
-            cluster_name,
-            # status
-            status.value,
-            # autostop
-            cluster_name,
-            status_lib.ClusterStatus.STOPPED.value,
-            # to_down
-            cluster_name,
-            status_lib.ClusterStatus.STOPPED.value,
-            # metadata
-            cluster_name,
-            # owner
-            cluster_name,
-            # cluster_hash
-            cluster_hash,
-            # storage_mounts_metadata
-            cluster_name,
-            # cluster_ever_up
-            cluster_name,
-            int(ready),
-            # status_updated_at
-            status_updated_at,
-            # config_hash
-            config_hash,
-            cluster_name,
-            # user_hash
-            cluster_name,
-            user_hash,
-        ))
-
-    launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
-    launched_resources = getattr(cluster_handle, 'launched_resources', None)
-    _DB.cursor.execute(
-        'INSERT or REPLACE INTO cluster_history'
-        '(cluster_hash, name, num_nodes, requested_resources, '
-        'launched_resources, usage_intervals, user_hash) '
-        'VALUES ('
-        # hash
-        '?, '
-        # name
-        '?, '
-        # requested resources
-        '?, '
-        # launched resources
-        '?, '
-        # number of nodes
-        '?, '
-        # usage intervals
-        '?, '
-        # user_hash
-        '?'
-        ')',
-        (
-            # hash
-            cluster_hash,
-            # name
-            cluster_name,
-            # number of nodes
-            launched_nodes,
-            # requested resources
-            pickle.dumps(requested_resources),
-            # launched resources
-            pickle.dumps(launched_resources),
-            # usage intervals
-            pickle.dumps(usage_intervals),
-            # user_hash
-            user_hash,
-        ))
-
-    _DB.conn.commit()
+    user_hash = common_utils.get_current_user().id
+    active_workspace = skypilot_config.get_active_workspace()
+    history_workspace = active_workspace
+    history_hash = user_hash
+
+    conditional_values = {}
+    if is_launch:
+        conditional_values.update({
+            'launched_at': cluster_launched_at,
+            'last_use': last_use
+        })
+
+    if int(ready) == 1:
+        conditional_values.update({
+            'cluster_ever_up': 1,
+        })
+
+    if config_hash is not None:
+        conditional_values.update({
+            'config_hash': config_hash,
+        })
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # with_for_update() locks the row until commit() or rollback()
+        # is called, or until the code escapes the with block.
+        cluster_row = session.query(cluster_table).filter_by(
+            name=cluster_name).with_for_update().first()
+        if (not cluster_row or
+                cluster_row.status == status_lib.ClusterStatus.STOPPED.value):
+            conditional_values.update({
+                'autostop': -1,
+                'to_down': 0,
+            })
+        if not cluster_row or not cluster_row.user_hash:
+            conditional_values.update({
+                'user_hash': user_hash,
+            })
+        if not cluster_row or not cluster_row.workspace:
+            conditional_values.update({
+                'workspace': active_workspace,
+            })
+        if is_launch and (not cluster_row or cluster_row.status !=
+                          status_lib.ClusterStatus.UP.value):
+            conditional_values.update({
+                'last_creation_yaml': yaml_utils.dump_yaml_str(task_config)
+                                      if task_config else None,
+                'last_creation_command': last_use,
+            })
+        if provision_log_path is not None:
+            conditional_values.update({
+                'provision_log_path': provision_log_path,
+            })
+
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+
+        if existing_cluster_hash is not None:
+            count = session.query(cluster_table).filter_by(
+                name=cluster_name, cluster_hash=existing_cluster_hash).update({
+                    **conditional_values, cluster_table.c.handle: handle,
+                    cluster_table.c.status: status.value,
+                    cluster_table.c.status_updated_at: status_updated_at
+                })
+            assert count <= 1
+            if count == 0:
+                raise ValueError(f'Cluster {cluster_name} with hash '
+                                 f'{existing_cluster_hash} not found.')
+        else:
+            insert_stmnt = insert_func(cluster_table).values(
+                name=cluster_name,
+                **conditional_values,
+                handle=handle,
+                status=status.value,
+                # set metadata to server default ('{}')
+                # set owner to server default (null)
+                cluster_hash=cluster_hash,
+                # set storage_mounts_metadata to server default (null)
+                status_updated_at=status_updated_at,
+                is_managed=int(is_managed),
+            )
+            insert_or_update_stmt = insert_stmnt.on_conflict_do_update(
+                index_elements=[cluster_table.c.name],
+                set_={
+                    **conditional_values,
+                    cluster_table.c.handle: handle,
+                    cluster_table.c.status: status.value,
+                    # do not update metadata value
+                    # do not update owner value
+                    cluster_table.c.cluster_hash: cluster_hash,
+                    # do not update storage_mounts_metadata
+                    cluster_table.c.status_updated_at: status_updated_at,
+                    # do not update user_hash
+                })
+            session.execute(insert_or_update_stmt)
+
+        # Modify cluster history table
+        launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
+        launched_resources = getattr(cluster_handle, 'launched_resources', None)
+        if cluster_row and cluster_row.workspace:
+            history_workspace = cluster_row.workspace
+        if cluster_row and cluster_row.user_hash:
+            history_hash = cluster_row.user_hash
+        creation_info = {}
+        if conditional_values.get('last_creation_yaml') is not None:
+            creation_info = {
+                'last_creation_yaml':
+                    conditional_values.get('last_creation_yaml'),
+                'last_creation_command':
+                    conditional_values.get('last_creation_command'),
+            }
+
+        # Calculate last_activity_time and launched_at from usage_intervals
+        last_activity_time = _get_cluster_last_activity_time(usage_intervals)
+        launched_at = _get_cluster_launch_time(usage_intervals)
+
+        insert_stmnt = insert_func(cluster_history_table).values(
+            cluster_hash=cluster_hash,
+            name=cluster_name,
+            num_nodes=launched_nodes,
+            requested_resources=pickle.dumps(requested_resources),
+            launched_resources=pickle.dumps(launched_resources),
+            usage_intervals=pickle.dumps(usage_intervals),
+            user_hash=user_hash,
+            workspace=history_workspace,
+            provision_log_path=provision_log_path,
+            last_activity_time=last_activity_time,
+            launched_at=launched_at,
+            **creation_info,
+        )
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[cluster_history_table.c.cluster_hash],
+            set_={
+                cluster_history_table.c.name: cluster_name,
+                cluster_history_table.c.num_nodes: launched_nodes,
+                cluster_history_table.c.requested_resources:
+                    pickle.dumps(requested_resources),
+                cluster_history_table.c.launched_resources:
+                    pickle.dumps(launched_resources),
+                cluster_history_table.c.usage_intervals:
+                    pickle.dumps(usage_intervals),
+                cluster_history_table.c.user_hash: history_hash,
+                cluster_history_table.c.workspace: history_workspace,
+                cluster_history_table.c.provision_log_path: provision_log_path,
+                cluster_history_table.c.last_activity_time: last_activity_time,
+                cluster_history_table.c.launched_at: launched_at,
+                **creation_info,
+            })
+        session.execute(do_update_stmt)
+
+        session.commit()
+
+
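The write path above leans on SQLAlchemy's dialect-specific upsert
(on_conflict_do_update, available on the sqlite and postgresql dialects in
SQLAlchemy 1.4+). A self-contained sketch of the same pattern, with an
illustrative table rather than the real schema:

    import sqlalchemy
    from sqlalchemy.dialects import sqlite

    metadata = sqlalchemy.MetaData()
    demo = sqlalchemy.Table(
        'demo', metadata,
        sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('status', sqlalchemy.Text))

    engine = sqlalchemy.create_engine('sqlite://')
    metadata.create_all(engine)

    stmt = sqlite.insert(demo).values(name='c1', status='INIT')
    # On a primary-key collision, update only the listed columns.
    stmt = stmt.on_conflict_do_update(index_elements=[demo.c.name],
                                      set_={demo.c.status: 'UP'})
    with engine.begin() as conn:
        conn.execute(stmt)
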
+@_init_db
+@metrics_lib.time_me
+def add_cluster_event(cluster_name: str,
+                      new_status: Optional[status_lib.ClusterStatus],
+                      reason: str,
+                      event_type: ClusterEventType,
+                      nop_if_duplicate: bool = False,
+                      duplicate_regex: Optional[str] = None,
+                      expose_duplicate_error: bool = False,
+                      transitioned_at: Optional[int] = None) -> None:
+    """Add a cluster event.
+
+    Args:
+        cluster_name: Name of the cluster.
+        new_status: New status of the cluster.
+        reason: Reason for the event.
+        event_type: Type of the event.
+        nop_if_duplicate: If True, do not add the event if it is a duplicate.
+        duplicate_regex: If provided, do not add the event if it matches the
+            regex. Only used if nop_if_duplicate is True.
+        expose_duplicate_error: If True, raise an error if the event is a
+            duplicate. Only used if nop_if_duplicate is True.
+        transitioned_at: If provided, use this timestamp for the event.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+    if cluster_hash is None:
+        logger.debug(f'Hash for cluster {cluster_name} not found. '
+                     'Skipping event.')
+        return
+    if transitioned_at is None:
+        transitioned_at = int(time.time())
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+
+        cluster_row = session.query(cluster_table).filter_by(
+            name=cluster_name).first()
+        last_status = cluster_row.status if cluster_row is not None else None
+        if nop_if_duplicate:
+            last_event = get_last_cluster_event(cluster_hash,
+                                                event_type=event_type)
+            if duplicate_regex is not None and last_event is not None:
+                if re.search(duplicate_regex, last_event):
+                    return
+            elif last_event == reason:
+                return
+        try:
+            request_id = common_utils.get_current_request_id()
+            session.execute(
+                insert_func(cluster_event_table).values(
+                    cluster_hash=cluster_hash,
+                    name=cluster_name,
+                    starting_status=last_status,
+                    ending_status=new_status.value if new_status else None,
+                    reason=reason,
+                    transitioned_at=transitioned_at,
+                    type=event_type.value,
+                    request_id=request_id,
+                ))
+            session.commit()
+        except sqlalchemy.exc.IntegrityError as e:
+            for msg in _UNIQUE_CONSTRAINT_FAILED_ERROR_MSGS:
+                if msg in str(e):
+                    # This can happen if the cluster event is added twice.
+                    # We can ignore this error unless the caller requests
+                    # to expose the error.
+                    if expose_duplicate_error:
+                        raise db_utils.UniqueConstraintViolationError(
+                            value=reason, message=str(e))
+                    else:
+                        return
+            raise e
+
+
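A sketch of a call site for add_cluster_event; the enum members come from this
diff, while the module paths are assumed rather than shown here:

    from sky import global_user_state
    from sky.utils import status_lib  # assumed location of ClusterStatus

    global_user_state.add_cluster_event(
        cluster_name='my-cluster',
        new_status=status_lib.ClusterStatus.UP,
        reason='Provisioning finished.',
        event_type=global_user_state.ClusterEventType.STATUS_CHANGE,
        # Skip the insert when the latest STATUS_CHANGE event already
        # carries the same reason string.
        nop_if_duplicate=True)
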
+def get_last_cluster_event(cluster_hash: str,
+                           event_type: ClusterEventType) -> Optional[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_event_table).filter_by(
+            cluster_hash=cluster_hash, type=event_type.value).order_by(
+                cluster_event_table.c.transitioned_at.desc()).first()
+        if row is None:
+            return None
+        return row.reason
+
+
+def _get_last_cluster_event_multiple(
+        cluster_hashes: Set[str],
+        event_type: ClusterEventType) -> Dict[str, str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Use a subquery to get the latest event for each cluster_hash
+        latest_events = session.query(
+            cluster_event_table.c.cluster_hash,
+            sqlalchemy.func.max(cluster_event_table.c.transitioned_at).label(
+                'max_time')).filter(
+                    cluster_event_table.c.cluster_hash.in_(cluster_hashes),
+                    cluster_event_table.c.type == event_type.value).group_by(
+                        cluster_event_table.c.cluster_hash).subquery()
+
+        # Join with the original table to get the full event details
+        rows = session.query(cluster_event_table).join(
+            latest_events,
+            sqlalchemy.and_(
+                cluster_event_table.c.cluster_hash ==
+                latest_events.c.cluster_hash,
+                cluster_event_table.c.transitioned_at ==
+                latest_events.c.max_time)).all()
+
+        return {row.cluster_hash: row.reason for row in rows}
+
+
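_get_last_cluster_event_multiple is the usual 'latest row per group' shape: a
grouped subquery finds max(transitioned_at) per cluster_hash, and a join
recovers the matching rows. The same reduction in plain Python, on made-up
tuples:

    from typing import Dict, List, Tuple

    events: List[Tuple[str, int, str]] = [
        ('hash-a', 100, 'INIT -> UP'),
        ('hash-a', 200, 'UP -> STOPPED'),
        ('hash-b', 150, 'INIT -> UP'),
    ]

    latest: Dict[str, Tuple[int, str]] = {}
    for cluster_hash, transitioned_at, reason in events:
        # Keep only the most recent reason per cluster hash.
        if (cluster_hash not in latest or
                transitioned_at > latest[cluster_hash][0]):
            latest[cluster_hash] = (transitioned_at, reason)

    assert {h: r for h, (_, r) in latest.items()} == {
        'hash-a': 'UP -> STOPPED',
        'hash-b': 'INIT -> UP',
    }
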
+def cleanup_cluster_events_with_retention(retention_hours: float,
+                                          event_type: ClusterEventType) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    # Called once per event type (e.g. STATUS_CHANGE, then DEBUG).
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_event_table).filter(
+            cluster_event_table.c.transitioned_at <
+            time.time() - retention_hours * 3600,
+            cluster_event_table.c.type == event_type.value)
+        logger.debug(f'Deleting {query.count()} cluster events.')
+        query.delete()
+        session.commit()
+
+
+async def cluster_event_retention_daemon():
+    """Garbage collect cluster events periodically."""
+    while True:
+        logger.info('Running cluster event retention daemon...')
+        # Use the latest config.
+        skypilot_config.reload_config()
+        retention_hours = skypilot_config.get_nested(
+            ('api_server', 'cluster_event_retention_hours'),
+            DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
+        debug_retention_hours = skypilot_config.get_nested(
+            ('api_server', 'cluster_debug_event_retention_hours'),
+            DEBUG_CLUSTER_EVENT_RETENTION_HOURS)
+        try:
+            if retention_hours >= 0:
+                logger.debug('Cleaning up cluster events with retention '
+                             f'{retention_hours} hours.')
+                cleanup_cluster_events_with_retention(
+                    retention_hours, ClusterEventType.STATUS_CHANGE)
+            if debug_retention_hours >= 0:
+                logger.debug('Cleaning up debug cluster events with retention '
+                             f'{debug_retention_hours} hours.')
+                cleanup_cluster_events_with_retention(debug_retention_hours,
+                                                      ClusterEventType.DEBUG)
+        except asyncio.CancelledError:
+            logger.info('Cluster event retention daemon cancelled')
+            break
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Error running cluster event retention daemon: {e}')
+
+        # Run the daemon at most once per interval to avoid overly frequent
+        # cleanup.
+        sleep_amount = max(
+            min(retention_hours * 3600, debug_retention_hours * 3600),
+            MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
+        await asyncio.sleep(sleep_amount)
+
+
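The retention daemon is an async loop that exits cleanly on cancellation. A
sketch of how a host process might schedule it (module path assumed):

    import asyncio

    from sky import global_user_state

    async def main() -> None:
        # Run the garbage collector in the background.
        task = asyncio.create_task(
            global_user_state.cluster_event_retention_daemon())
        try:
            await asyncio.sleep(3600)  # stand-in for the real serving loop
        finally:
            task.cancel()

    asyncio.run(main())
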
+def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
+                       event_type: ClusterEventType) -> List[str]:
+    """Returns the cluster events for the cluster.
+
+    Args:
+        cluster_name: Name of the cluster. Cannot be specified if cluster_hash
+            is specified.
+        cluster_hash: Hash of the cluster. Cannot be specified if cluster_name
+            is specified.
+        event_type: Type of the event.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+
+    if cluster_name is not None and cluster_hash is not None:
+        raise ValueError('Cannot specify both cluster_name and cluster_hash')
+    if cluster_name is None and cluster_hash is None:
+        raise ValueError('Must specify either cluster_name or cluster_hash')
+    if cluster_name is not None:
+        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+        if cluster_hash is None:
+            raise ValueError(f'Hash for cluster {cluster_name} not found.')
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_event_table).filter_by(
+            cluster_hash=cluster_hash, type=event_type.value).order_by(
+                cluster_event_table.c.transitioned_at.asc()).all()
+        return [row.reason for row in rows]
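Callers pass exactly one of cluster_name and cluster_hash; passing both, or
neither, raises ValueError. For example (import path assumed):

    from sky import global_user_state

    # Status-change reasons for one cluster, oldest first.
    reasons = global_user_state.get_cluster_events(
        cluster_name='my-cluster',
        cluster_hash=None,
        event_type=global_user_state.ClusterEventType.STATUS_CHANGE)
    for reason in reasons:
        print(reason)
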
 
 
 def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
@@ -391,186 +1060,402 @@ def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
     return common_utils.get_user_hash()
 
 
+@_init_db
+@metrics_lib.time_me
 def update_cluster_handle(cluster_name: str,
                           cluster_handle: 'backends.ResourceHandle'):
+    assert _SQLALCHEMY_ENGINE is not None
     handle = pickle.dumps(cluster_handle)
-    _DB.cursor.execute('UPDATE clusters SET handle=(?) WHERE name=(?)',
-                       (handle, cluster_name))
-    _DB.conn.commit()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_table).filter_by(name=cluster_name).update(
+            {cluster_table.c.handle: handle})
+        session.commit()
 
 
+@_init_db
+@metrics_lib.time_me
 def update_last_use(cluster_name: str):
     """Updates the last used command for the cluster."""
-    _DB.cursor.execute('UPDATE clusters SET last_use=(?) WHERE name=(?)',
-                       (common_utils.get_current_command(), cluster_name))
-    _DB.conn.commit()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_table).filter_by(name=cluster_name).update(
+            {cluster_table.c.last_use: common_utils.get_current_command()})
+        session.commit()
 
 
+@_init_db
+@metrics_lib.time_me
 def remove_cluster(cluster_name: str, terminate: bool) -> None:
     """Removes cluster_name mapping."""
+    assert _SQLALCHEMY_ENGINE is not None
     cluster_hash = _get_hash_for_existing_cluster(cluster_name)
     usage_intervals = _get_cluster_usage_intervals(cluster_hash)
+    provision_log_path = get_cluster_provision_log_path(cluster_name)
 
-    # usage_intervals is not None and not empty
-    if usage_intervals:
-        assert cluster_hash is not None, cluster_name
-        start_time = usage_intervals.pop()[0]
-        end_time = int(time.time())
-        usage_intervals.append((start_time, end_time))
-        _set_cluster_usage_intervals(cluster_hash, usage_intervals)
-
-    if terminate:
-        _DB.cursor.execute('DELETE FROM clusters WHERE name=(?)',
-                           (cluster_name,))
-    else:
-        handle = get_handle_from_cluster_name(cluster_name)
-        if handle is None:
-            return
-        # Must invalidate IP list to avoid directly trying to ssh into a
-        # stopped VM, which leads to timeout.
-        if hasattr(handle, 'stable_internal_external_ips'):
-            handle = typing.cast('backends.CloudVmRayResourceHandle', handle)
-            handle.stable_internal_external_ips = None
-        current_time = int(time.time())
-        _DB.cursor.execute(
-            'UPDATE clusters SET handle=(?), status=(?), '
-            'status_updated_at=(?) WHERE name=(?)', (
-                pickle.dumps(handle),
-                status_lib.ClusterStatus.STOPPED.value,
-                current_time,
-                cluster_name,
-            ))
-    _DB.conn.commit()
-
-
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # usage_intervals is not None and not empty
+        if usage_intervals:
+            assert cluster_hash is not None, cluster_name
+            start_time = usage_intervals.pop()[0]
+            end_time = int(time.time())
+            usage_intervals.append((start_time, end_time))
+            _set_cluster_usage_intervals(cluster_hash, usage_intervals)
+
+        if provision_log_path:
+            assert cluster_hash is not None, cluster_name
+            session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash
+            ).filter(
+                cluster_history_table.c.provision_log_path.is_(None)
+            ).update({
+                cluster_history_table.c.provision_log_path: provision_log_path
+            })
+
+        if terminate:
+            session.query(cluster_table).filter_by(name=cluster_name).delete()
+        else:
+            handle = get_handle_from_cluster_name(cluster_name)
+            if handle is None:
+                return
+            # Must invalidate IP list to avoid directly trying to ssh into a
+            # stopped VM, which leads to timeout.
+            if hasattr(handle, 'stable_internal_external_ips'):
+                handle = typing.cast('backends.CloudVmRayResourceHandle',
+                                     handle)
+                handle.stable_internal_external_ips = None
+            current_time = int(time.time())
+            session.query(cluster_table).filter_by(name=cluster_name).update({
+                cluster_table.c.handle: pickle.dumps(handle),
+                cluster_table.c.status: status_lib.ClusterStatus.STOPPED.value,
+                cluster_table.c.status_updated_at: current_time
+            })
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
 def get_handle_from_cluster_name(
         cluster_name: str) -> Optional['backends.ResourceHandle']:
+    assert _SQLALCHEMY_ENGINE is not None
     assert cluster_name is not None, 'cluster_name cannot be None'
-    rows = _DB.cursor.execute('SELECT handle FROM clusters WHERE name=(?)',
-                              (cluster_name,))
-    for (handle,) in rows:
-        return pickle.loads(handle)
-    return None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = (session.query(
+            cluster_table.c.handle).filter_by(name=cluster_name).first())
+        if row is None:
+            return None
+        return pickle.loads(row.handle)
+
+
+@_init_db
+@metrics_lib.time_me
+def get_handles_from_cluster_names(
+    cluster_names: Set[str]
+) -> Dict[str, Optional['backends.ResourceHandle']]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_table.c.name,
+                             cluster_table.c.handle).filter(
+                                 cluster_table.c.name.in_(cluster_names)).all()
+        return {
+            row.name:
+                pickle.loads(row.handle) if row.handle is not None else None
+            for row in rows
+        }
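The batch variant issues a single query instead of one per name; names that
are not present in the clusters table are simply absent from the returned
dict. A sketch (import path assumed):

    from sky import global_user_state

    handles = global_user_state.get_handles_from_cluster_names(
        {'train-cluster', 'eval-cluster'})
    for name, handle in handles.items():
        print(name, handle is not None)
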
 
 
-def get_glob_cluster_names(cluster_name: str) -> List[str]:
+@_init_db
+@metrics_lib.time_me
+def get_cluster_name_to_handle_map(
+    is_managed: Optional[bool] = None,
+) -> Dict[str, Optional['backends.ResourceHandle']]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_table.c.name, cluster_table.c.handle)
+        if is_managed is not None:
+            query = query.filter(cluster_table.c.is_managed == int(is_managed))
+        rows = query.all()
+        name_to_handle = {}
+        for row in rows:
+            if row.handle and len(row.handle) > 0:
+                name_to_handle[row.name] = pickle.loads(row.handle)
+            else:
+                name_to_handle[row.name] = None
+        return name_to_handle
+
+
+@_init_db_async
+@metrics_lib.time_me
+async def get_status_from_cluster_name_async(
+        cluster_name: str) -> Optional[status_lib.ClusterStatus]:
+    """Get the status of a cluster."""
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
     assert cluster_name is not None, 'cluster_name cannot be None'
-    rows = _DB.cursor.execute('SELECT name FROM clusters WHERE name GLOB (?)',
-                              (cluster_name,))
-    return [row[0] for row in rows]
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
+            sqlalchemy.select(cluster_table.c.status).where(
+                cluster_table.c.name == cluster_name))
+        row = result.first()
+
+    if row is None:
+        return None
+    return status_lib.ClusterStatus(row[0])
 
 
+@_init_db
+@metrics_lib.time_me
+def get_status_from_cluster_name(
+        cluster_name: str) -> Optional[status_lib.ClusterStatus]:
+    assert _SQLALCHEMY_ENGINE is not None
+    assert cluster_name is not None, 'cluster_name cannot be None'
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.status).filter_by(name=cluster_name).first()
+        if row is None:
+            return None
+        return status_lib.ClusterStatus[row.status]
+
+
+@_init_db
+@metrics_lib.time_me
+def get_glob_cluster_names(
+        cluster_name: str,
+        workspaces_filter: Optional[Set[str]] = None) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    assert cluster_name is not None, 'cluster_name cannot be None'
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            query = session.query(cluster_table.c.name).filter(
+                cluster_table.c.name.op('GLOB')(cluster_name))
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            query = session.query(cluster_table.c.name).filter(
+                cluster_table.c.name.op('SIMILAR TO')(
+                    _glob_to_similar(cluster_name)))
+        else:
+            raise ValueError('Unsupported database dialect')
+        if workspaces_filter is not None:
+            query = query.filter(
+                cluster_table.c.workspace.in_(workspaces_filter))
+        rows = query.all()
+        return [row.name for row in rows]
+
+
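The glob pattern is evaluated natively by SQLite's GLOB operator, while on
PostgreSQL it is first rewritten for SIMILAR TO by _glob_to_similar (defined
elsewhere in this file). A sketch of a call (import path assumed):

    from sky import global_user_state

    # All clusters whose names start with 'dev-', in the 'default' workspace.
    names = global_user_state.get_glob_cluster_names(
        'dev-*', workspaces_filter={'default'})
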
+@_init_db
+@metrics_lib.time_me
 def set_cluster_status(cluster_name: str,
                        status: status_lib.ClusterStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
     current_time = int(time.time())
-    _DB.cursor.execute(
-        'UPDATE clusters SET status=(?), status_updated_at=(?) WHERE name=(?)',
-        (status.value, current_time, cluster_name))
-    count = _DB.cursor.rowcount
-    _DB.conn.commit()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update({
+                cluster_table.c.status: status.value,
+                cluster_table.c.status_updated_at: current_time
+            })
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def set_cluster_autostop_value(cluster_name: str, idle_minutes: int,
                                to_down: bool) -> None:
-    _DB.cursor.execute(
-        'UPDATE clusters SET autostop=(?), to_down=(?) WHERE name=(?)', (
-            idle_minutes,
-            int(to_down),
-            cluster_name,
-        ))
-    count = _DB.cursor.rowcount
-    _DB.conn.commit()
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update({
+                cluster_table.c.autostop: idle_minutes,
+                cluster_table.c.to_down: int(to_down)
+            })
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
-    rows = _DB.cursor.execute('SELECT launched_at FROM clusters WHERE name=(?)',
-                              (cluster_name,))
-    for (launch_time,) in rows:
-        if launch_time is None:
-            return None
-        return int(launch_time)
-    return None
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.launched_at).filter_by(name=cluster_name).first()
+        if row is None or row.launched_at is None:
+            return None
+        return int(row.launched_at)
 
 
+@_init_db
+@metrics_lib.time_me
 def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
-    rows = _DB.cursor.execute('SELECT metadata FROM clusters WHERE name=(?)',
-                              (cluster_name,))
-    for (metadata,) in rows:
-        if metadata is None:
-            return None
-        return json.loads(metadata)
-    return None
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.metadata).filter_by(name=cluster_name).first()
+        if row is None or row.metadata is None:
+            return None
+        return json.loads(row.metadata)
+
+
+@_init_db
+@metrics_lib.time_me
+def get_cluster_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from clusters table, if recorded."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        if row is None:
+            return None
+        return getattr(row, 'provision_log_path', None)
+
+
+@_init_db
+@metrics_lib.time_me
+def get_cluster_history_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from cluster_history for this name.
+
+    If the cluster currently exists, we use its hash. Otherwise, we look up
+    historical rows by name and choose the most recent one based on
+    usage_intervals.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Try current cluster first (fast path)
+        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+        if cluster_hash is not None:
+            row = session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash).first()
+            if row is not None:
+                return getattr(row, 'provision_log_path', None)
+
+        # Fallback: search history by name and pick the latest by
+        # usage_intervals
+        rows = session.query(cluster_history_table).filter_by(
+            name=cluster_name).all()
+        if not rows:
+            return None
+
+        def latest_timestamp(usages_bin) -> int:
+            try:
+                intervals = pickle.loads(usages_bin)
+                # intervals: List[Tuple[int, Optional[int]]]
+                if not intervals:
+                    return -1
+                _, end = intervals[-1]
+                return end if end is not None else int(time.time())
+            except Exception:  # pylint: disable=broad-except
+                return -1
+
+        latest_row = max(rows,
+                         key=lambda r: latest_timestamp(r.usage_intervals))
+        return getattr(latest_row, 'provision_log_path', None)
 
 
+@_init_db
+@metrics_lib.time_me
 def set_cluster_info(cluster_name: str, metadata: Dict[str, Any]) -> None:
-    _DB.cursor.execute('UPDATE clusters SET metadata=(?) WHERE name=(?)', (
-        json.dumps(metadata),
-        cluster_name,
-    ))
-    count = _DB.cursor.rowcount
-    _DB.conn.commit()
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update(
+                {cluster_table.c.metadata: json.dumps(metadata)})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def get_cluster_storage_mounts_metadata(
         cluster_name: str) -> Optional[Dict[str, Any]]:
-    rows = _DB.cursor.execute(
-        'SELECT storage_mounts_metadata FROM clusters WHERE name=(?)',
-        (cluster_name,))
-    for (storage_mounts_metadata,) in rows:
-        if storage_mounts_metadata is None:
-            return None
-        return pickle.loads(storage_mounts_metadata)
-    return None
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = (session.query(
+            cluster_table.c.storage_mounts_metadata).filter_by(
+                name=cluster_name).first())
+        if row is None or row.storage_mounts_metadata is None:
+            return None
+        return pickle.loads(row.storage_mounts_metadata)
 
 
+@_init_db
+@metrics_lib.time_me
 def set_cluster_storage_mounts_metadata(
         cluster_name: str, storage_mounts_metadata: Dict[str, Any]) -> None:
-    _DB.cursor.execute(
-        'UPDATE clusters SET storage_mounts_metadata=(?) WHERE name=(?)', (
-            pickle.dumps(storage_mounts_metadata),
-            cluster_name,
-        ))
-    count = _DB.cursor.rowcount
-    _DB.conn.commit()
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update({
+                cluster_table.c.storage_mounts_metadata:
+                    pickle.dumps(storage_mounts_metadata)
+            })
+        session.commit()
+    assert count <= 1, count
+    if count == 0:
+        raise ValueError(f'Cluster {cluster_name} not found.')
+
+
+@_init_db
+@metrics_lib.time_me
+def get_cluster_skylet_ssh_tunnel_metadata(
+        cluster_name: str) -> Optional[Tuple[int, int]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.skylet_ssh_tunnel_metadata).filter_by(
+                name=cluster_name).first()
+        if row is None or row.skylet_ssh_tunnel_metadata is None:
+            return None
+        return pickle.loads(row.skylet_ssh_tunnel_metadata)
+
+
+@_init_db
+@metrics_lib.time_me
+def set_cluster_skylet_ssh_tunnel_metadata(
+        cluster_name: str,
+        skylet_ssh_tunnel_metadata: Optional[Tuple[int, int]]) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        value = pickle.dumps(
+            skylet_ssh_tunnel_metadata
+        ) if skylet_ssh_tunnel_metadata is not None else None
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update(
+                {cluster_table.c.skylet_ssh_tunnel_metadata: value})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def _get_cluster_usage_intervals(
     cluster_hash: Optional[str]
 ) -> Optional[List[Tuple[int, Optional[int]]]]:
+    assert _SQLALCHEMY_ENGINE is not None
     if cluster_hash is None:
         return None
-    rows = _DB.cursor.execute(
-        'SELECT usage_intervals FROM cluster_history WHERE cluster_hash=(?)',
-        (cluster_hash,))
-    for (usage_intervals,) in rows:
-        if usage_intervals is None:
-            return None
-        return pickle.loads(usage_intervals)
-    return None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_history_table.c.usage_intervals).filter_by(
+            cluster_hash=cluster_hash).first()
+        if row is None or row.usage_intervals is None:
+            return None
+        return pickle.loads(row.usage_intervals)
 
 
-def _get_cluster_launch_time(cluster_hash: str) -> Optional[int]:
-    usage_intervals = _get_cluster_usage_intervals(cluster_hash)
+def _get_cluster_launch_time(
+        usage_intervals: Optional[List[Tuple[int,
+                                             Optional[int]]]]) -> Optional[int]:
     if usage_intervals is None:
         return None
     return usage_intervals[0][0]
 
 
-def _get_cluster_duration(cluster_hash: str) -> int:
+def _get_cluster_duration(
+        usage_intervals: Optional[List[Tuple[int, Optional[int]]]]) -> int:
     total_duration = 0
-    usage_intervals = _get_cluster_usage_intervals(cluster_hash)
 
     if usage_intervals is None:
         return total_duration
@@ -587,60 +1472,89 @@ def _get_cluster_duration(cluster_hash: str) -> int:
     return total_duration
 
 
+def _get_cluster_last_activity_time(
+        usage_intervals: Optional[List[Tuple[int,
+                                             Optional[int]]]]) -> Optional[int]:
+    last_activity_time = None
+    if usage_intervals:
+        last_interval = usage_intervals[-1]
+        last_activity_time = (last_interval[1] if last_interval[1] is not None
+                              else last_interval[0])
+    return last_activity_time
+
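usage_intervals is a list of (start, end) epoch-second pairs, with end=None
while the cluster is still up. A small worked example of the launch-time and
last-activity conventions used by the helpers above; the duration loop is a
sketch, since the body of _get_cluster_duration is elided by the hunk:

    import time

    # Two completed runs and one still-running interval.
    usage_intervals = [(1000, 1600), (2000, 2300), (3000, None)]

    launch_time = usage_intervals[0][0]  # 1000: start of the first interval
    last = usage_intervals[-1]
    last_activity = last[1] if last[1] is not None else last[0]  # 3000

    duration = 0
    for start, end in usage_intervals:
        if end is None:
            end = int(time.time())  # open interval: count up to now
        duration += end - start
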
+
+@_init_db
+@metrics_lib.time_me
 def _set_cluster_usage_intervals(
         cluster_hash: str, usage_intervals: List[Tuple[int,
                                                        Optional[int]]]) -> None:
-    _DB.cursor.execute(
-        'UPDATE cluster_history SET usage_intervals=(?) WHERE cluster_hash=(?)',
-        (
-            pickle.dumps(usage_intervals),
-            cluster_hash,
-        ))
-
-    count = _DB.cursor.rowcount
-    _DB.conn.commit()
+    assert _SQLALCHEMY_ENGINE is not None
+
+    # Calculate last_activity_time from usage_intervals
+    last_activity_time = _get_cluster_last_activity_time(usage_intervals)
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_history_table).filter_by(
+            cluster_hash=cluster_hash).update({
+                cluster_history_table.c.usage_intervals:
+                    pickle.dumps(usage_intervals),
+                cluster_history_table.c.last_activity_time: last_activity_time,
+            })
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster hash {cluster_hash} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def set_owner_identity_for_cluster(cluster_name: str,
                                    owner_identity: Optional[List[str]]) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
     if owner_identity is None:
         return
     owner_identity_str = json.dumps(owner_identity)
-    _DB.cursor.execute('UPDATE clusters SET owner=(?) WHERE name=(?)',
-                       (owner_identity_str, cluster_name))
-
-    count = _DB.cursor.rowcount
-    _DB.conn.commit()
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update(
+                {cluster_table.c.owner: owner_identity_str})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def _get_hash_for_existing_cluster(cluster_name: str) -> Optional[str]:
-    rows = _DB.cursor.execute(
-        'SELECT cluster_hash FROM clusters WHERE name=(?)', (cluster_name,))
-    for (cluster_hash,) in rows:
-        if cluster_hash is None:
-            return None
-        return cluster_hash
-    return None
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = (session.query(
+            cluster_table.c.cluster_hash).filter_by(name=cluster_name).first())
+        if row is None or row.cluster_hash is None:
+            return None
+        return row.cluster_hash
 
 
+@_init_db
+@metrics_lib.time_me
 def get_launched_resources_from_cluster_hash(
         cluster_hash: str) -> Optional[Tuple[int, Any]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_history_table.c.num_nodes,
+            cluster_history_table.c.launched_resources).filter_by(
+                cluster_hash=cluster_hash).first()
+        if row is None:
+            return None
+        num_nodes = row.num_nodes
+        launched_resources = row.launched_resources
 
-    rows = _DB.cursor.execute(
-        'SELECT num_nodes, launched_resources '
-        'FROM cluster_history WHERE cluster_hash=(?)', (cluster_hash,))
-    for (num_nodes, launched_resources) in rows:
-        if num_nodes is None or launched_resources is None:
-            return None
-        launched_resources = pickle.loads(launched_resources)
-        return num_nodes, launched_resources
-    return None
+    if num_nodes is None or launched_resources is None:
+        return None
+    launched_resources = pickle.loads(launched_resources)
+    return num_nodes, launched_resources
 
 
 def _load_owner(record_owner: Optional[str]) -> Optional[List[str]]:
@@ -671,176 +1585,491 @@ def _load_storage_mounts_metadata(
     return pickle.loads(record_storage_mounts_metadata)
 
 
+@_init_db
+@metrics_lib.time_me
+@context_utils.cancellation_guard
 def get_cluster_from_name(
-        cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
-    rows = _DB.cursor.execute(
-        'SELECT name, launched_at, handle, last_use, status, autostop, '
-        'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
-        'cluster_ever_up, status_updated_at, config_hash, user_hash '
-        'FROM clusters WHERE name=(?)', (cluster_name,)).fetchall()
-    for row in rows:
-        # Explicitly specify the number of fields to unpack, so that
-        # we can add new fields to the database in the future without
-        # breaking the previous code.
-        (name, launched_at, handle, last_use, status, autostop, metadata,
-         to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at, config_hash, user_hash) = row
-        user_hash = _get_user_hash_or_current_user(user_hash)
-        # TODO: use namedtuple instead of dict
-        record = {
-            'name': name,
-            'launched_at': launched_at,
-            'handle': pickle.loads(handle),
-            'last_use': last_use,
-            'status': status_lib.ClusterStatus[status],
-            'autostop': autostop,
-            'to_down': bool(to_down),
-            'owner': _load_owner(owner),
-            'metadata': json.loads(metadata),
-            'cluster_hash': cluster_hash,
-            'storage_mounts_metadata':
-                _load_storage_mounts_metadata(storage_mounts_metadata),
-            'cluster_ever_up': bool(cluster_ever_up),
-            'status_updated_at': status_updated_at,
-            'user_hash': user_hash,
-            'user_name': get_user(user_hash).name,
-            'config_hash': config_hash,
-        }
-        return record
-    return None
-
+        cluster_name: Optional[str],
+        *,
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    query_fields = [
+        cluster_table.c.name,
+        cluster_table.c.launched_at,
+        cluster_table.c.handle,
+        cluster_table.c.last_use,
+        cluster_table.c.status,
+        cluster_table.c.autostop,
+        cluster_table.c.to_down,
+        cluster_table.c.owner,
+        cluster_table.c.metadata,
+        cluster_table.c.cluster_hash,
+        cluster_table.c.cluster_ever_up,
+        cluster_table.c.status_updated_at,
+        cluster_table.c.user_hash,
+        cluster_table.c.config_hash,
+        cluster_table.c.workspace,
+        cluster_table.c.is_managed,
+    ]
+    if not summary_response:
+        query_fields.extend([
+            cluster_table.c.last_creation_yaml,
+            cluster_table.c.last_creation_command,
+        ])
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(*query_fields)
+        row = query.filter_by(name=cluster_name).first()
+        if row is None:
+            return None
+        if include_user_info:
+            user_hash = _get_user_hash_or_current_user(row.user_hash)
+            user = get_user(user_hash)
+            user_name = user.name if user is not None else None
+        if not summary_response:
+            last_event = get_last_cluster_event(
+                row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
+        # TODO: use namedtuple instead of dict
+        record = {
+            'name': row.name,
+            'launched_at': row.launched_at,
+            'handle': pickle.loads(row.handle),
+            'last_use': row.last_use,
+            'status': status_lib.ClusterStatus[row.status],
+            'autostop': row.autostop,
+            'to_down': bool(row.to_down),
+            'owner': _load_owner(row.owner),
+            'metadata': json.loads(row.metadata),
+            'cluster_hash': row.cluster_hash,
+            'cluster_ever_up': bool(row.cluster_ever_up),
+            'status_updated_at': row.status_updated_at,
+            'workspace': row.workspace,
+            'is_managed': bool(row.is_managed),
+            'config_hash': row.config_hash,
+        }
+        if not summary_response:
+            record['last_creation_yaml'] = row.last_creation_yaml
+            record['last_creation_command'] = row.last_creation_command
+            record['last_event'] = last_event
+        if include_user_info:
+            record['user_hash'] = user_hash
+            record['user_name'] = user_name
+
+        return record
+
+
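The two keyword-only flags added above trim the returned record:
summary_response drops the creation YAML/command and last event, and
include_user_info controls whether the user fields are resolved. A sketch
(import path assumed):

    from sky import global_user_state

    record = global_user_state.get_cluster_from_name(
        'my-cluster', include_user_info=False, summary_response=True)
    if record is not None:
        print(record['name'], record['status'], record['workspace'])
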
+@_init_db
+@metrics_lib.time_me
+@context_utils.cancellation_guard
+def cluster_with_name_exists(cluster_name: str) -> bool:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.name).filter_by(name=cluster_name).first()
+        if row is None:
+            return False
+        return True
+
+
+@_init_db
+@metrics_lib.time_me
+def get_clusters(
+    *,  # keyword only separator
+    exclude_managed_clusters: bool = False,
+    workspaces_filter: Optional[Dict[str, Any]] = None,
+    user_hashes_filter: Optional[Set[str]] = None,
+    cluster_names: Optional[List[str]] = None,
+    summary_response: bool = False,
+) -> List[Dict[str, Any]]:
+    """Get clusters from the database.
 
-def get_clusters() -> List[Dict[str, Any]]:
-    rows = _DB.cursor.execute(
-        'select name, launched_at, handle, last_use, status, autostop, '
-        'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
-        'cluster_ever_up, status_updated_at, config_hash, user_hash '
-        'from clusters order by launched_at desc').fetchall()
+    Args:
+        exclude_managed_clusters: If True, exclude clusters whose
+            is_managed field is set to True.
+        workspaces_filter: If specified, only include clusters whose
+            workspace field is set to one of the values.
+        user_hashes_filter: If specified, only include clusters whose
+            user_hash field is set to one of the values.
+        cluster_names: If specified, only include clusters whose
+            name field is set to one of the values.
+    """
+    # If a cluster has a null user_hash,
+    # we treat it as belonging to the current user.
+    current_user_hash = common_utils.get_user_hash()
+    assert _SQLALCHEMY_ENGINE is not None
+    query_fields = [
+        cluster_table.c.name,
+        cluster_table.c.launched_at,
+        cluster_table.c.handle,
+        cluster_table.c.status,
+        cluster_table.c.autostop,
+        cluster_table.c.to_down,
+        cluster_table.c.cluster_hash,
+        cluster_table.c.cluster_ever_up,
+        cluster_table.c.user_hash,
+        cluster_table.c.workspace,
+        user_table.c.name.label('user_name'),
+    ]
+    if not summary_response:
+        query_fields.extend([
+            cluster_table.c.last_creation_yaml,
+            cluster_table.c.last_creation_command,
+            cluster_table.c.config_hash,
+            cluster_table.c.owner,
+            cluster_table.c.metadata,
+            cluster_table.c.last_use,
+            cluster_table.c.status_updated_at,
+        ])
+    if not exclude_managed_clusters:
+        query_fields.append(cluster_table.c.is_managed)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(*query_fields).outerjoin(
+            user_table, cluster_table.c.user_hash == user_table.c.id)
+        if exclude_managed_clusters:
+            query = query.filter(cluster_table.c.is_managed == int(False))
+        if workspaces_filter is not None:
+            query = query.filter(
+                cluster_table.c.workspace.in_(workspaces_filter))
+        if user_hashes_filter is not None:
+            if current_user_hash in user_hashes_filter:
+                # Backwards compatibility for old clusters:
+                # if current_user_hash is in user_hashes_filter, we include
+                # clusters that have a null user_hash.
+                query = query.filter(
+                    (cluster_table.c.user_hash.in_(user_hashes_filter) |
+                     cluster_table.c.user_hash.is_(None)))
+            else:
+                query = query.filter(
+                    cluster_table.c.user_hash.in_(user_hashes_filter))
+        if cluster_names is not None:
+            query = query.filter(cluster_table.c.name.in_(cluster_names))
+        query = query.order_by(sqlalchemy.desc(cluster_table.c.launched_at))
+        rows = query.all()
     records = []
+
+    # Check if we need to fetch the current user's name,
+    # for backwards compatibility, if user_hash is None.
+    current_user_name = None
+    needs_current_user = any(row.user_hash is None for row in rows)
+    if needs_current_user:
+        current_user = get_user(current_user_hash)
+        current_user_name = (current_user.name
+                             if current_user is not None else None)
+
+    # Get the last cluster event for each row.
+    if not summary_response:
+        cluster_hashes = {row.cluster_hash for row in rows}
+        last_cluster_event_dict = _get_last_cluster_event_multiple(
+            cluster_hashes, ClusterEventType.STATUS_CHANGE)
+
     for row in rows:
-        (name, launched_at, handle, last_use, status, autostop, metadata,
-         to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at, config_hash, user_hash) = row
-        user_hash = _get_user_hash_or_current_user(user_hash)
         # TODO: use namedtuple instead of dict
         record = {
-            'name': name,
-            'launched_at': launched_at,
-            'handle': pickle.loads(handle),
-            'last_use': last_use,
-            'status': status_lib.ClusterStatus[status],
-            'autostop': autostop,
-            'to_down': bool(to_down),
-            'owner': _load_owner(owner),
-            'metadata': json.loads(metadata),
-            'cluster_hash': cluster_hash,
-            'storage_mounts_metadata':
-                _load_storage_mounts_metadata(storage_mounts_metadata),
-            'cluster_ever_up': bool(cluster_ever_up),
-            'status_updated_at': status_updated_at,
-            'user_hash': user_hash,
-            'user_name': get_user(user_hash).name,
-            'config_hash': config_hash,
+            'name': row.name,
+            'launched_at': row.launched_at,
+            'handle': pickle.loads(row.handle),
+            'status': status_lib.ClusterStatus[row.status],
+            'autostop': row.autostop,
+            'to_down': bool(row.to_down),
+            'cluster_hash': row.cluster_hash,
+            'cluster_ever_up': bool(row.cluster_ever_up),
+            'user_hash': (row.user_hash
+                          if row.user_hash is not None else current_user_hash),
+            'user_name': (row.user_name
+                          if row.user_name is not None else current_user_name),
+            'workspace': row.workspace,
+            'is_managed': False
+                          if exclude_managed_clusters else bool(row.is_managed),
         }
+        if not summary_response:
+            record['last_creation_yaml'] = row.last_creation_yaml
+            record['last_creation_command'] = row.last_creation_command
+            record['last_event'] = last_cluster_event_dict.get(
+                row.cluster_hash, None)
+            record['config_hash'] = row.config_hash
+            record['owner'] = _load_owner(row.owner)
+            record['metadata'] = json.loads(row.metadata)
+            record['last_use'] = row.last_use
+            record['status_updated_at'] = row.status_updated_at
 
         records.append(record)
     return records
1797
 
749
1798
 
750
- def get_clusters_from_history() -> List[Dict[str, Any]]:
751
- rows = _DB.cursor.execute(
752
- 'SELECT ch.cluster_hash, ch.name, ch.num_nodes, '
753
- 'ch.launched_resources, ch.usage_intervals, clusters.status, '
754
- 'ch.user_hash '
755
- 'FROM cluster_history ch '
756
- 'LEFT OUTER JOIN clusters '
757
- 'ON ch.cluster_hash=clusters.cluster_hash ').fetchall()
1799
+ @_init_db
1800
+ @metrics_lib.time_me
1801
+ def get_cluster_names(exclude_managed_clusters: bool = False,) -> List[str]:
1802
+ assert _SQLALCHEMY_ENGINE is not None
1803
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1804
+ query = session.query(cluster_table.c.name)
1805
+ if exclude_managed_clusters:
1806
+ query = query.filter(cluster_table.c.is_managed == int(False))
1807
+ rows = query.all()
1808
+ return [row[0] for row in rows]
758
1809
 
759
- # '(cluster_hash, name, num_nodes, requested_resources, '
760
- # 'launched_resources, usage_intervals) '
761
- records = []
762
1810
 
763
- for row in rows:
764
- # TODO: use namedtuple instead of dict
1811
+ @_init_db
1812
+ @metrics_lib.time_me
1813
+ def get_clusters_from_history(
1814
+ days: Optional[int] = None,
1815
+ abbreviate_response: bool = False,
1816
+ cluster_hashes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
1817
+ """Get cluster reports from history.
765
1818
 
766
- (
767
- cluster_hash,
768
- name,
769
- num_nodes,
770
- launched_resources,
771
- usage_intervals,
772
- status,
773
- user_hash,
774
- ) = row[:7]
775
- user_hash = _get_user_hash_or_current_user(user_hash)
1819
+ Args:
1820
+ days: If specified, only include historical clusters (those not
1821
+ currently active) that were last used within the past 'days'
1822
+ days. Active clusters are always included regardless of this
1823
+ parameter.
776
1824
 
777
- if status is not None:
778
- status = status_lib.ClusterStatus[status]
1825
+ Returns:
1826
+ List of cluster records with history information.
1827
+ """
+    assert _SQLALCHEMY_ENGINE is not None
+
+    current_user_hash = common_utils.get_user_hash()
+
+    # Prepare filtering parameters
+    cutoff_time = 0
+    if days is not None:
+        cutoff_time = int(time.time()) - (days * 24 * 60 * 60)
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Explicitly select columns from both tables to avoid ambiguity
+        if abbreviate_response:
+            query = session.query(
+                cluster_history_table.c.cluster_hash,
+                cluster_history_table.c.name, cluster_history_table.c.num_nodes,
+                cluster_history_table.c.launched_resources,
+                cluster_history_table.c.usage_intervals,
+                cluster_history_table.c.user_hash,
+                cluster_history_table.c.workspace.label('history_workspace'),
+                cluster_history_table.c.last_activity_time,
+                cluster_history_table.c.launched_at, cluster_table.c.status,
+                cluster_table.c.workspace)
+        else:
+            query = session.query(
+                cluster_history_table.c.cluster_hash,
+                cluster_history_table.c.name, cluster_history_table.c.num_nodes,
+                cluster_history_table.c.launched_resources,
+                cluster_history_table.c.usage_intervals,
+                cluster_history_table.c.user_hash,
+                cluster_history_table.c.last_creation_yaml,
+                cluster_history_table.c.last_creation_command,
+                cluster_history_table.c.workspace.label('history_workspace'),
+                cluster_history_table.c.last_activity_time,
+                cluster_history_table.c.launched_at, cluster_table.c.status,
+                cluster_table.c.workspace)
+
+        query = query.select_from(
+            cluster_history_table.join(cluster_table,
+                                       cluster_history_table.c.cluster_hash ==
+                                       cluster_table.c.cluster_hash,
+                                       isouter=True))
+
+        # Only include clusters that are either active (status is not None)
+        # or are within the cutoff time (cutoff_time <= last_activity_time).
+        # If days is not specified, we include all clusters by setting
+        # cutoff_time to 0.
+        query = query.filter(
+            (cluster_table.c.status.isnot(None) |
+             (cluster_history_table.c.last_activity_time >= cutoff_time)))
+
+        # Order by launched_at descending (most recent first)
+        query = query.order_by(
+            sqlalchemy.desc(cluster_history_table.c.launched_at))
+
+        if cluster_hashes is not None:
+            query = query.filter(
+                cluster_history_table.c.cluster_hash.in_(cluster_hashes))
+        rows = query.all()
+
+    usage_intervals_dict = {}
+    row_to_user_hash = {}
+    for row in rows:
+        row_usage_intervals: List[Tuple[int, Optional[int]]] = []
+        if row.usage_intervals:
+            try:
+                row_usage_intervals = pickle.loads(row.usage_intervals)
+            except (pickle.PickleError, AttributeError):
+                pass
+        usage_intervals_dict[row.cluster_hash] = row_usage_intervals
+        user_hash = (row.user_hash
+                     if row.user_hash is not None else current_user_hash)
+        row_to_user_hash[row.cluster_hash] = user_hash
+
+    user_hashes = set(row_to_user_hash.values())
+    user_hash_to_user = get_users(user_hashes)
+    cluster_hashes = set(row_to_user_hash.keys())
+    if not abbreviate_response:
+        last_cluster_event_dict = _get_last_cluster_event_multiple(
+            cluster_hashes, ClusterEventType.STATUS_CHANGE)
+
+    records = []
+    for row in rows:
+        user_hash = row_to_user_hash[row.cluster_hash]
+        user = user_hash_to_user.get(user_hash, None)
+        user_name = user.name if user is not None else None
+        if not abbreviate_response:
+            last_event = last_cluster_event_dict.get(row.cluster_hash, None)
+        launched_at = row.launched_at
+        usage_intervals: Optional[List[Tuple[
+            int,
+            Optional[int]]]] = usage_intervals_dict.get(row.cluster_hash, None)
+        duration = _get_cluster_duration(usage_intervals)
+
+        # Parse status
+        status = None
+        if row.status:
+            status = status_lib.ClusterStatus[row.status]
+
+        # Parse launched resources safely
+        launched_resources = None
+        if row.launched_resources:
+            try:
+                launched_resources = pickle.loads(row.launched_resources)
+            except (pickle.PickleError, AttributeError):
+                launched_resources = None
+
+        workspace = (row.history_workspace
+                     if row.history_workspace else row.workspace)

         record = {
-            'name': name,
-            'launched_at': _get_cluster_launch_time(cluster_hash),
-            'duration': _get_cluster_duration(cluster_hash),
-            'num_nodes': num_nodes,
-            'resources': pickle.loads(launched_resources),
-            'cluster_hash': cluster_hash,
-            'usage_intervals': pickle.loads(usage_intervals),
+            'name': row.name,
+            'launched_at': launched_at,
+            'duration': duration,
+            'num_nodes': row.num_nodes,
+            'resources': launched_resources,
+            'cluster_hash': row.cluster_hash,
+            'usage_intervals': usage_intervals,
             'status': status,
             'user_hash': user_hash,
+            'user_name': user_name,
+            'workspace': workspace,
         }
+        if not abbreviate_response:
+            record['last_creation_yaml'] = row.last_creation_yaml
+            record['last_creation_command'] = row.last_creation_command
+            record['last_event'] = last_event

         records.append(record)

     # sort by launch time, descending in recency
-    records = sorted(records, key=lambda record: -record['launched_at'])
+    records = sorted(records, key=lambda record: -(record['launched_at'] or 0))
     return records
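
The filter in the query above reduces to a simple retention predicate: a row survives if the cluster is still active, or if its last activity falls inside the requested window (with no days given, cutoff_time stays 0 and everything passes). A minimal sketch of the same logic in plain Python, with hypothetical status/last_activity_time values standing in for the queried columns:

    import time
    from typing import Optional

    def keep_in_history(status: Optional[str], last_activity_time: int,
                        days: Optional[int]) -> bool:
        # Active clusters (status is not None) are always included.
        if status is not None:
            return True
        # With no window given, cutoff_time is 0, so every row passes.
        cutoff_time = 0
        if days is not None:
            cutoff_time = int(time.time()) - days * 24 * 60 * 60
        return last_activity_time >= cutoff_time

    # A cluster torn down 10 days ago is kept for days=30 but not days=7.
    ten_days_ago = int(time.time()) - 10 * 24 * 60 * 60
    assert keep_in_history(None, ten_days_ago, days=30)
    assert not keep_in_history(None, ten_days_ago, days=7)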


+@_init_db
+@metrics_lib.time_me
 def get_cluster_names_start_with(starts_with: str) -> List[str]:
-    rows = _DB.cursor.execute('SELECT name FROM clusters WHERE name LIKE (?)',
-                              (f'{starts_with}%',))
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_table.c.name).filter(
+            cluster_table.c.name.like(f'{starts_with}%')).all()
     return [row[0] for row in rows]


-def get_cached_enabled_clouds(
-        cloud_capability: 'cloud.CloudCapability') -> List['clouds.Cloud']:
-
-    rows = _DB.cursor.execute('SELECT value FROM config WHERE key = ?',
-                              (_get_capability_key(cloud_capability),))
+@_init_db
+@metrics_lib.time_me
+def get_cached_enabled_clouds(cloud_capability: 'cloud.CloudCapability',
+                              workspace: str) -> List['clouds.Cloud']:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(config_table).filter_by(
+            key=_get_enabled_clouds_key(cloud_capability, workspace)).first()
     ret = []
-    for (value,) in rows:
-        ret = json.loads(value)
-        break
+    if row:
+        ret = json.loads(row.value)
     enabled_clouds: List['clouds.Cloud'] = []
     for c in ret:
         try:
             cloud = registry.CLOUD_REGISTRY.from_str(c)
         except ValueError:
-            # Handle the case for the clouds whose support has been removed from
-            # SkyPilot, e.g., 'local' was a cloud in the past and may be stored
-            # in the database for users before #3037. We should ignore removed
-            # clouds and continue.
+            # Handle the case for the clouds whose support has been
+            # removed from SkyPilot, e.g., 'local' was a cloud in the past
+            # and may be stored in the database for users before #3037.
+            # We should ignore removed clouds and continue.
            continue
         if cloud is not None:
             enabled_clouds.append(cloud)
     return enabled_clouds
827
1996
 
828
1997
 
1998
+ @_init_db
1999
+ @metrics_lib.time_me
829
2000
  def set_enabled_clouds(enabled_clouds: List[str],
830
- cloud_capability: 'cloud.CloudCapability') -> None:
831
- _DB.cursor.execute(
832
- 'INSERT OR REPLACE INTO config VALUES (?, ?)',
833
- (_get_capability_key(cloud_capability), json.dumps(enabled_clouds)))
834
- _DB.conn.commit()
835
-
836
-
837
- def _get_capability_key(cloud_capability: 'cloud.CloudCapability') -> str:
838
- return _ENABLED_CLOUDS_KEY_PREFIX + cloud_capability.value
839
-
840
-
2001
+ cloud_capability: 'cloud.CloudCapability',
2002
+ workspace: str) -> None:
2003
+ assert _SQLALCHEMY_ENGINE is not None
2004
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
2005
+ if (_SQLALCHEMY_ENGINE.dialect.name ==
2006
+ db_utils.SQLAlchemyDialect.SQLITE.value):
2007
+ insert_func = sqlite.insert
2008
+ elif (_SQLALCHEMY_ENGINE.dialect.name ==
2009
+ db_utils.SQLAlchemyDialect.POSTGRESQL.value):
2010
+ insert_func = postgresql.insert
2011
+ else:
2012
+ raise ValueError('Unsupported database dialect')
2013
+ insert_stmnt = insert_func(config_table).values(
2014
+ key=_get_enabled_clouds_key(cloud_capability, workspace),
2015
+ value=json.dumps(enabled_clouds))
2016
+ do_update_stmt = insert_stmnt.on_conflict_do_update(
2017
+ index_elements=[config_table.c.key],
2018
+ set_={config_table.c.value: json.dumps(enabled_clouds)})
2019
+ session.execute(do_update_stmt)
2020
+ session.commit()
2021
+
2022
+
2023
+ def _get_enabled_clouds_key(cloud_capability: 'cloud.CloudCapability',
2024
+ workspace: str) -> str:
2025
+ return _ENABLED_CLOUDS_KEY_PREFIX + workspace + '_' + cloud_capability.value
2026
+
2027
+
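
The SQLite/PostgreSQL branch above is the upsert idiom this module repeats throughout: pick the dialect-specific insert construct, then attach on_conflict_do_update. A self-contained sketch against an in-memory SQLite engine (the table, columns, and key string are illustrative, not SkyPilot's real schema):

    import sqlalchemy
    from sqlalchemy.dialects import sqlite

    metadata = sqlalchemy.MetaData()
    config = sqlalchemy.Table(
        'config', metadata,
        sqlalchemy.Column('key', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('value', sqlalchemy.Text))
    engine = sqlalchemy.create_engine('sqlite://')
    metadata.create_all(engine)

    def upsert(key: str, value: str) -> None:
        # The real code dispatches between sqlite.insert and
        # postgresql.insert based on engine.dialect.name.
        stmt = sqlite.insert(config).values(key=key, value=value)
        stmt = stmt.on_conflict_do_update(index_elements=[config.c.key],
                                          set_={config.c.value: value})
        with engine.begin() as conn:
            conn.execute(stmt)

    upsert('enabled_clouds_ws1_compute', '["aws", "gcp"]')
    upsert('enabled_clouds_ws1_compute', '["aws"]')  # Overwrites in place.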
+@_init_db
+@metrics_lib.time_me
+def get_allowed_clouds(workspace: str) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(config_table).filter_by(
+            key=_get_allowed_clouds_key(workspace)).first()
+        if row:
+            return json.loads(row.value)
+    return []
+
+
+@_init_db
+@metrics_lib.time_me
+def set_allowed_clouds(allowed_clouds: List[str], workspace: str) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(config_table).values(
+            key=_get_allowed_clouds_key(workspace),
+            value=json.dumps(allowed_clouds))
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[config_table.c.key],
+            set_={config_table.c.value: json.dumps(allowed_clouds)})
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+def _get_allowed_clouds_key(workspace: str) -> str:
+    return _ALLOWED_CLOUDS_KEY_PREFIX + workspace
+
+
+@_init_db
+@metrics_lib.time_me
 def add_or_update_storage(storage_name: str,
                           storage_handle: 'Storage.StorageMetadata',
                           storage_status: status_lib.StorageStatus):
+    assert _SQLALCHEMY_ENGINE is not None
     storage_launched_at = int(time.time())
     handle = pickle.dumps(storage_handle)
     last_use = common_utils.get_current_command()
@@ -851,89 +2080,648 @@ def add_or_update_storage(storage_name: str,
     if not status_check(storage_status):
         raise ValueError(f'Error in updating global state. Storage Status '
                          f'{storage_status} is passed in incorrectly')
-    _DB.cursor.execute('INSERT OR REPLACE INTO storage VALUES (?, ?, ?, ?, ?)',
-                       (storage_name, storage_launched_at, handle, last_use,
-                        storage_status.value))
-    _DB.conn.commit()
-
-
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(storage_table).values(
+            name=storage_name,
+            handle=handle,
+            last_use=last_use,
+            launched_at=storage_launched_at,
+            status=storage_status.value)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[storage_table.c.name],
+            set_={
+                storage_table.c.handle: handle,
+                storage_table.c.last_use: last_use,
+                storage_table.c.launched_at: storage_launched_at,
+                storage_table.c.status: storage_status.value
+            })
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
 def remove_storage(storage_name: str):
     """Removes Storage from Database"""
-    _DB.cursor.execute('DELETE FROM storage WHERE name=(?)', (storage_name,))
-    _DB.conn.commit()
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(storage_table).filter_by(name=storage_name).delete()
+        session.commit()


+@_init_db
+@metrics_lib.time_me
 def set_storage_status(storage_name: str,
                        status: status_lib.StorageStatus) -> None:
-    _DB.cursor.execute('UPDATE storage SET status=(?) WHERE name=(?)', (
-        status.value,
-        storage_name,
-    ))
-    count = _DB.cursor.rowcount
-    _DB.conn.commit()
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(storage_table).filter_by(
+            name=storage_name).update({storage_table.c.status: status.value})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Storage {storage_name} not found.')

+ @_init_db
2135
+ @metrics_lib.time_me
879
2136
  def get_storage_status(storage_name: str) -> Optional[status_lib.StorageStatus]:
2137
+ assert _SQLALCHEMY_ENGINE is not None
880
2138
  assert storage_name is not None, 'storage_name cannot be None'
881
- rows = _DB.cursor.execute('SELECT status FROM storage WHERE name=(?)',
882
- (storage_name,))
883
- for (status,) in rows:
884
- return status_lib.StorageStatus[status]
2139
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
2140
+ row = session.query(storage_table).filter_by(name=storage_name).first()
2141
+ if row:
2142
+ return status_lib.StorageStatus[row.status]
885
2143
  return None
886
2144
 
887
2145
 
2146
+ @_init_db
2147
+ @metrics_lib.time_me
888
2148
  def set_storage_handle(storage_name: str,
889
2149
  handle: 'Storage.StorageMetadata') -> None:
890
- _DB.cursor.execute('UPDATE storage SET handle=(?) WHERE name=(?)', (
891
- pickle.dumps(handle),
892
- storage_name,
893
- ))
894
- count = _DB.cursor.rowcount
895
- _DB.conn.commit()
2150
+ assert _SQLALCHEMY_ENGINE is not None
2151
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
2152
+ count = session.query(storage_table).filter_by(
2153
+ name=storage_name).update(
2154
+ {storage_table.c.handle: pickle.dumps(handle)})
2155
+ session.commit()
896
2156
  assert count <= 1, count
897
2157
  if count == 0:
898
2158
  raise ValueError(f'Storage{storage_name} not found.')


+@_init_db
+@metrics_lib.time_me
 def get_handle_from_storage_name(
         storage_name: Optional[str]) -> Optional['Storage.StorageMetadata']:
+    assert _SQLALCHEMY_ENGINE is not None
     if storage_name is None:
         return None
-    rows = _DB.cursor.execute('SELECT handle FROM storage WHERE name=(?)',
-                              (storage_name,))
-    for (handle,) in rows:
-        if handle is None:
-            return None
-        return pickle.loads(handle)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(storage_table).filter_by(name=storage_name).first()
+        if row:
+            return pickle.loads(row.handle)
     return None


+@_init_db
+@metrics_lib.time_me
 def get_glob_storage_name(storage_name: str) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
     assert storage_name is not None, 'storage_name cannot be None'
-    rows = _DB.cursor.execute('SELECT name FROM storage WHERE name GLOB (?)',
-                              (storage_name,))
-    return [row[0] for row in rows]
-
-
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            rows = session.query(storage_table).filter(
+                storage_table.c.name.op('GLOB')(storage_name)).all()
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            rows = session.query(storage_table).filter(
+                storage_table.c.name.op('SIMILAR TO')(
+                    _glob_to_similar(storage_name))).all()
+        else:
+            raise ValueError('Unsupported database dialect')
+        return [row.name for row in rows]
+
+
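
_glob_to_similar is referenced here but defined elsewhere in the module. A plausible sketch of such a translation, mapping SQLite GLOB wildcards onto PostgreSQL SIMILAR TO syntax (an illustrative guess, not the actual implementation):

    def glob_to_similar(pattern: str) -> str:
        """Translate a GLOB pattern into a SIMILAR TO pattern (sketch)."""
        out = []
        for ch in pattern:
            if ch == '*':
                out.append('%')  # GLOB '*' matches any run of characters.
            elif ch == '?':
                out.append('_')  # GLOB '?' matches exactly one character.
            elif ch in '%_|+(){}[]':
                out.append('\\' + ch)  # Escape SIMILAR TO metacharacters.
            else:
                out.append(ch)
        return ''.join(out)

    assert glob_to_similar('sky-*') == 'sky-%'
    assert glob_to_similar('bucket-?') == 'bucket-_'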
+@_init_db
+@metrics_lib.time_me
 def get_storage_names_start_with(starts_with: str) -> List[str]:
-    rows = _DB.cursor.execute('SELECT name FROM storage WHERE name LIKE (?)',
-                              (f'{starts_with}%',))
-    return [row[0] for row in rows]
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(storage_table).filter(
+            storage_table.c.name.like(f'{starts_with}%')).all()
+        return [row.name for row in rows]


+@_init_db
+@metrics_lib.time_me
 def get_storage() -> List[Dict[str, Any]]:
-    rows = _DB.cursor.execute('SELECT * FROM storage')
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(storage_table).all()
     records = []
-    for name, launched_at, handle, last_use, status in rows:
+    for row in rows:
         # TODO: use namedtuple instead of dict
         records.append({
-            'name': name,
-            'launched_at': launched_at,
-            'handle': pickle.loads(handle),
-            'last_use': last_use,
-            'status': status_lib.StorageStatus[status],
+            'name': row.name,
+            'launched_at': row.launched_at,
+            'handle': pickle.loads(row.handle),
+            'last_use': row.last_use,
+            'status': status_lib.StorageStatus[row.status],
+        })
+    return records
+
+
+@_init_db
+@metrics_lib.time_me
+def get_volume_names_start_with(starts_with: str) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(volume_table).filter(
+            volume_table.c.name.like(f'{starts_with}%')).all()
+        return [row.name for row in rows]
+
+
+@_init_db
+@metrics_lib.time_me
+def get_volumes() -> List[Dict[str, Any]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(volume_table).all()
+    records = []
+    for row in rows:
+        records.append({
+            'name': row.name,
+            'launched_at': row.launched_at,
+            'handle': pickle.loads(row.handle),
+            'user_hash': row.user_hash,
+            'workspace': row.workspace,
+            'last_attached_at': row.last_attached_at,
+            'last_use': row.last_use,
+            'status': status_lib.VolumeStatus[row.status],
         })
     return records
+
+
+@_init_db
+@metrics_lib.time_me
+def get_volume_by_name(name: str) -> Optional[Dict[str, Any]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(volume_table).filter_by(name=name).first()
+        if row:
+            return {
+                'name': row.name,
+                'launched_at': row.launched_at,
+                'handle': pickle.loads(row.handle),
+                'user_hash': row.user_hash,
+                'workspace': row.workspace,
+                'last_attached_at': row.last_attached_at,
+                'last_use': row.last_use,
+                'status': status_lib.VolumeStatus[row.status],
+            }
+    return None
+
+
+@_init_db
+@metrics_lib.time_me
+def add_volume(name: str, config: models.VolumeConfig,
+               status: status_lib.VolumeStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    volume_launched_at = int(time.time())
+    handle = pickle.dumps(config)
+    last_use = common_utils.get_current_command()
+    user_hash = common_utils.get_current_user().id
+    active_workspace = skypilot_config.get_active_workspace()
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(volume_table).values(
+            name=name,
+            launched_at=volume_launched_at,
+            handle=handle,
+            user_hash=user_hash,
+            workspace=active_workspace,
+            last_attached_at=None,
+            last_use=last_use,
+            status=status.value,
+        )
+        do_update_stmt = insert_stmnt.on_conflict_do_nothing()
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def update_volume(name: str, last_attached_at: int,
+                  status: status_lib.VolumeStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(volume_table).filter_by(name=name).update({
+            volume_table.c.last_attached_at: last_attached_at,
+            volume_table.c.status: status.value,
+        })
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def update_volume_status(name: str, status: status_lib.VolumeStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(volume_table).filter_by(name=name).update({
+            volume_table.c.status: status.value,
+        })
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def delete_volume(name: str) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(volume_table).filter_by(name=name).delete()
+        session.commit()
+
+
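
Note that add_volume uses on_conflict_do_nothing(), unlike the on_conflict_do_update upserts elsewhere in this file: re-registering an existing volume name is a silent no-op rather than an overwrite. A usage sketch, where config_a and config_b stand for hypothetical models.VolumeConfig objects:

    add_volume('vol-1', config_a, status_lib.VolumeStatus.READY)
    # 'vol-1' already exists, so this insert hits the primary-key conflict
    # and is dropped; the stored handle still unpickles to config_a.
    add_volume('vol-1', config_b, status_lib.VolumeStatus.READY)
    assert get_volume_by_name('vol-1') is not None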
+@_init_db
+@metrics_lib.time_me
+def get_ssh_keys(user_hash: str) -> Tuple[str, str, bool]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(ssh_key_table).filter_by(
+            user_hash=user_hash).first()
+        if row:
+            return row.ssh_public_key, row.ssh_private_key, True
+    return '', '', False
+
+
+@_init_db
+@metrics_lib.time_me
+def set_ssh_keys(user_hash: str, ssh_public_key: str, ssh_private_key: str):
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(ssh_key_table).values(
+            user_hash=user_hash,
+            ssh_public_key=ssh_public_key,
+            ssh_private_key=ssh_private_key)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[ssh_key_table.c.user_hash],
+            set_={
+                ssh_key_table.c.ssh_public_key: ssh_public_key,
+                ssh_key_table.c.ssh_private_key: ssh_private_key
+            })
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def add_service_account_token(token_id: str,
+                              token_name: str,
+                              token_hash: str,
+                              creator_user_hash: str,
+                              service_account_user_id: str,
+                              expires_at: Optional[int] = None) -> None:
+    """Add a service account token to the database."""
+    assert _SQLALCHEMY_ENGINE is not None
+    created_at = int(time.time())
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+
+        insert_stmnt = insert_func(service_account_token_table).values(
+            token_id=token_id,
+            token_name=token_name,
+            token_hash=token_hash,
+            created_at=created_at,
+            expires_at=expires_at,
+            creator_user_hash=creator_user_hash,
+            service_account_user_id=service_account_user_id)
+        session.execute(insert_stmnt)
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def get_service_account_token(token_id: str) -> Optional[Dict[str, Any]]:
+    """Get a service account token by token_id."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(service_account_token_table).filter_by(
+            token_id=token_id).first()
+        if row is None:
+            return None
+        return {
+            'token_id': row.token_id,
+            'token_name': row.token_name,
+            'token_hash': row.token_hash,
+            'created_at': row.created_at,
+            'last_used_at': row.last_used_at,
+            'expires_at': row.expires_at,
+            'creator_user_hash': row.creator_user_hash,
+            'service_account_user_id': row.service_account_user_id,
+        }
+
+
+@_init_db
+@metrics_lib.time_me
+def get_user_service_account_tokens(user_hash: str) -> List[Dict[str, Any]]:
+    """Get all service account tokens for a user (as creator)."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(service_account_token_table).filter_by(
+            creator_user_hash=user_hash).all()
+        return [{
+            'token_id': row.token_id,
+            'token_name': row.token_name,
+            'token_hash': row.token_hash,
+            'created_at': row.created_at,
+            'last_used_at': row.last_used_at,
+            'expires_at': row.expires_at,
+            'creator_user_hash': row.creator_user_hash,
+            'service_account_user_id': row.service_account_user_id,
+        } for row in rows]
+
+
+@_init_db
+@metrics_lib.time_me
+def update_service_account_token_last_used(token_id: str) -> None:
+    """Update the last_used_at timestamp for a service account token."""
+    assert _SQLALCHEMY_ENGINE is not None
+    last_used_at = int(time.time())
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(service_account_token_table).filter_by(
+            token_id=token_id).update(
+                {service_account_token_table.c.last_used_at: last_used_at})
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def delete_service_account_token(token_id: str) -> bool:
+    """Delete a service account token.
+
+    Returns:
+        True if token was found and deleted.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        result = session.query(service_account_token_table).filter_by(
+            token_id=token_id).delete()
+        session.commit()
+        return result > 0
+
+
+@_init_db
+@metrics_lib.time_me
+def rotate_service_account_token(token_id: str,
+                                 new_token_hash: str,
+                                 new_expires_at: Optional[int] = None) -> None:
+    """Rotate a service account token by updating its hash and expiration.
+
+    Args:
+        token_id: The token ID to rotate.
+        new_token_hash: The new hashed token value.
+        new_expires_at: New expiration timestamp, or None for no expiration.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    current_time = int(time.time())
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(service_account_token_table).filter_by(
+            token_id=token_id
+        ).update({
+            service_account_token_table.c.token_hash: new_token_hash,
+            service_account_token_table.c.expires_at: new_expires_at,
+            service_account_token_table.c.last_used_at: None,  # Reset last used
+            # Update creation time
+            service_account_token_table.c.created_at: current_time,
+        })
+        session.commit()
+
+    if count == 0:
+        raise ValueError(f'Service account token {token_id} not found.')
+
+
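
A sketch of how a caller might drive this rotation, assuming tokens are persisted only as hashes; hashlib.sha256 and the 30-day window are illustrative, as the actual hashing scheme is not shown in this diff:

    import hashlib
    import secrets
    import time

    def rotate_token(token_id: str) -> str:
        new_secret = secrets.token_urlsafe(32)
        new_hash = hashlib.sha256(new_secret.encode('utf-8')).hexdigest()
        expires_at = int(time.time()) + 30 * 24 * 60 * 60  # e.g. 30 days
        rotate_service_account_token(token_id, new_hash,
                                     new_expires_at=expires_at)
        # Only the hash is persisted; the plaintext secret is surfaced to
        # the caller exactly once.
        return new_secret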
+@_init_db
+@metrics_lib.time_me
+def get_cluster_yaml_str(cluster_yaml_path: Optional[str]) -> Optional[str]:
+    """Get the cluster yaml from the database or the local file system.
+    If the cluster yaml is not in the database, check if it exists on the
+    local file system and migrate it to the database.
+
+    It is assumed that the cluster yaml file is named as <cluster_name>.yml.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    if cluster_yaml_path is None:
+        raise ValueError('Attempted to read a None YAML.')
+    cluster_file_name = os.path.basename(cluster_yaml_path)
+    cluster_name, _ = os.path.splitext(cluster_file_name)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_yaml_table).filter_by(
+            cluster_name=cluster_name).first()
+        if row is None:
+            return _set_cluster_yaml_from_file(cluster_yaml_path, cluster_name)
+        return row.yaml
+
+
+def get_cluster_yaml_str_multiple(cluster_yaml_paths: List[str]) -> List[str]:
+    """Get the cluster yaml from the database or the local file system."""
+    assert _SQLALCHEMY_ENGINE is not None
+    cluster_names_to_yaml_paths = {}
+    for cluster_yaml_path in cluster_yaml_paths:
+        cluster_name, _ = os.path.splitext(os.path.basename(cluster_yaml_path))
+        cluster_names_to_yaml_paths[cluster_name] = cluster_yaml_path
+
+    cluster_names = list(cluster_names_to_yaml_paths.keys())
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_yaml_table).filter(
+            cluster_yaml_table.c.cluster_name.in_(cluster_names)).all()
+    row_cluster_names_to_yaml = {row.cluster_name: row.yaml for row in rows}
+
+    yaml_strs = []
+    for cluster_name in cluster_names:
+        if cluster_name in row_cluster_names_to_yaml:
+            yaml_strs.append(row_cluster_names_to_yaml[cluster_name])
+        else:
+            yaml_str = _set_cluster_yaml_from_file(
+                cluster_names_to_yaml_paths[cluster_name], cluster_name)
+            yaml_strs.append(yaml_str)
+    return yaml_strs
+
+
+def _set_cluster_yaml_from_file(cluster_yaml_path: str,
+                                cluster_name: str) -> Optional[str]:
+    """Set the cluster yaml in the database from a file."""
+    # If the cluster yaml is not in the database, check if it exists
+    # on the local file system and migrate it to the database.
+    # TODO(syang): remove this check once we have a way to migrate the
+    # cluster from file to database. Remove on v0.12.0.
+    if cluster_yaml_path is not None:
+        # First try the exact path.
+        path_to_read = None
+        if os.path.exists(cluster_yaml_path):
+            path_to_read = cluster_yaml_path
+        # Fallback: try the .debug suffix (used when debug logging was
+        # enabled). Debug logging causes YAML files to be saved with a
+        # .debug suffix, but the path stored in the handle doesn't
+        # include it.
+        debug_path = cluster_yaml_path + '.debug'
+        if os.path.exists(debug_path):
+            path_to_read = debug_path
+        if path_to_read is not None:
+            with open(path_to_read, 'r', encoding='utf-8') as f:
+                yaml_str = f.read()
+            set_cluster_yaml(cluster_name, yaml_str)
+            return yaml_str
+    return None
+
+
+def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
+    """Get the cluster yaml as a dictionary from the database.
+
+    It is assumed that the cluster yaml file is named as <cluster_name>.yml.
+    """
+    yaml_str = get_cluster_yaml_str(cluster_yaml_path)
+    if yaml_str is None:
+        raise ValueError(f'Cluster yaml {cluster_yaml_path} not found.')
+    return yaml_utils.safe_load(yaml_str)
+
+
+def get_cluster_yaml_dict_multiple(
+        cluster_yaml_paths: List[str]) -> List[Dict[str, Any]]:
+    """Get the cluster yaml as a dictionary from the database."""
+    yaml_strs = get_cluster_yaml_str_multiple(cluster_yaml_paths)
+    yaml_dicts = []
+    for idx, yaml_str in enumerate(yaml_strs):
+        if yaml_str is None:
+            raise ValueError(
+                f'Cluster yaml {cluster_yaml_paths[idx]} not found.')
+        yaml_dicts.append(yaml_utils.safe_load(yaml_str))
+    return yaml_dicts
+
+
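
The yaml accessors above implement a read-through migration: consult the database first, and on a miss fall back to the legacy on-disk file, writing its contents back into the database. The same pattern in miniature, with a plain dict standing in for cluster_yaml_table:

    import os
    from typing import Dict, Optional

    _db: Dict[str, str] = {}  # Stand-in for the cluster_yaml table.

    def read_with_migration(path: str) -> Optional[str]:
        key = os.path.splitext(os.path.basename(path))[0]
        if key in _db:  # Fast path: already migrated.
            return _db[key]
        if os.path.exists(path):  # Slow path: migrate from disk.
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
            _db[key] = content
            return content
        return None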
+@_init_db
+@metrics_lib.time_me
+def set_cluster_yaml(cluster_name: str, yaml_str: str) -> None:
+    """Set the cluster yaml in the database."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(cluster_yaml_table).values(
+            cluster_name=cluster_name, yaml=yaml_str)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[cluster_yaml_table.c.cluster_name],
+            set_={cluster_yaml_table.c.yaml: yaml_str})
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def remove_cluster_yaml(cluster_name: str):
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_yaml_table).filter_by(
+            cluster_name=cluster_name).delete()
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def get_all_service_account_tokens() -> List[Dict[str, Any]]:
+    """Get all service account tokens across all users (for admin access)."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(service_account_token_table).all()
+        return [{
+            'token_id': row.token_id,
+            'token_name': row.token_name,
+            'token_hash': row.token_hash,
+            'created_at': row.created_at,
+            'last_used_at': row.last_used_at,
+            'expires_at': row.expires_at,
+            'creator_user_hash': row.creator_user_hash,
+            'service_account_user_id': row.service_account_user_id,
+        } for row in rows]
+
+
+@_init_db
+@metrics_lib.time_me
+def get_system_config(config_key: str) -> Optional[str]:
+    """Get a system configuration value by key."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(system_config_table).filter_by(
+            config_key=config_key).first()
+        if row is None:
+            return None
+        return row.config_value
+
+
+@_init_db
+@metrics_lib.time_me
+def set_system_config(config_key: str, config_value: str) -> None:
+    """Set a system configuration value."""
+    assert _SQLALCHEMY_ENGINE is not None
+    current_time = int(time.time())
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+
+        insert_stmnt = insert_func(system_config_table).values(
+            config_key=config_key,
+            config_value=config_value,
+            created_at=current_time,
+            updated_at=current_time)
+
+        upsert_stmnt = insert_stmnt.on_conflict_do_update(
+            index_elements=[system_config_table.c.config_key],
+            set_={
+                system_config_table.c.config_value: config_value,
+                system_config_table.c.updated_at: current_time,
+            })
+        session.execute(upsert_stmnt)
+        session.commit()
+
+
+@_init_db
+def get_max_db_connections() -> Optional[int]:
+    """Get the maximum number of connections for the engine."""
+    assert _SQLALCHEMY_ENGINE is not None
+    if (_SQLALCHEMY_ENGINE.dialect.name ==
+            db_utils.SQLAlchemyDialect.SQLITE.value):
+        return None
+    with sqlalchemy.orm.Session(_SQLALCHEMY_ENGINE) as session:
+        max_connections = session.execute(
+            sqlalchemy.text('SHOW max_connections')).scalar()
+        if max_connections is None:
+            return None
+        return int(max_connections)
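
One plausible use of this value is bounding a connection pool on PostgreSQL deployments; the headroom factor below is illustrative only:

    max_conns = get_max_db_connections()
    if max_conns is not None:
        # Leave room for other clients sharing the same PostgreSQL server.
        pool_size = max(1, max_conns // 4)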