skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/global_user_state.py CHANGED
@@ -24,13 +24,15 @@ from sqlalchemy import exc as sqlalchemy_exc
 from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
+from sqlalchemy.ext import asyncio as sql_async
 from sqlalchemy.ext import declarative
 
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
-from sky.server import metrics as metrics_lib
+from sky.metrics import utils as metrics_lib
 from sky.skylet import constants
+from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import registry
@@ -51,6 +53,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
 
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_ASYNC: Optional[sql_async.AsyncEngine] = None
 _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
 
 DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
@@ -118,6 +121,9 @@ cluster_table = sqlalchemy.Table(
     sqlalchemy.Column('provision_log_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('skylet_ssh_tunnel_metadata',
+                      sqlalchemy.LargeBinary,
+                      server_default=None),
 )
 
 storage_table = sqlalchemy.Table(
@@ -145,6 +151,7 @@ volume_table = sqlalchemy.Table(
                       server_default=None),
     sqlalchemy.Column('last_use', sqlalchemy.Text),
     sqlalchemy.Column('status', sqlalchemy.Text),
+    sqlalchemy.Column('is_ephemeral', sqlalchemy.Integer, server_default='0'),
 )
 
 # Table for Cluster History
@@ -180,6 +187,14 @@ cluster_history_table = sqlalchemy.Table(
     sqlalchemy.Column('provision_log_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('last_activity_time',
+                      sqlalchemy.Integer,
+                      server_default=None,
+                      index=True),
+    sqlalchemy.Column('launched_at',
+                      sqlalchemy.Integer,
+                      server_default=None,
+                      index=True),
 )
 
 
@@ -293,6 +308,20 @@ def create_table(engine: sqlalchemy.engine.Engine):
                             migration_utils.GLOBAL_USER_STATE_VERSION)
 
 
+def initialize_and_get_db_async() -> sql_async.AsyncEngine:
+    global _SQLALCHEMY_ENGINE_ASYNC
+    if _SQLALCHEMY_ENGINE_ASYNC is not None:
+        return _SQLALCHEMY_ENGINE_ASYNC
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE_ASYNC is not None:
+            return _SQLALCHEMY_ENGINE_ASYNC
+
+        _SQLALCHEMY_ENGINE_ASYNC = db_utils.get_engine('state',
+                                                       async_engine=True)
+        initialize_and_get_db()
+        return _SQLALCHEMY_ENGINE_ASYNC
+
+
 # We wrap the sqlalchemy engine initialization in a thread
 # lock to ensure that multiple threads do not initialize the
 # engine which could result in a rare race condition where
@@ -315,9 +344,29 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
 
         # return engine
         _SQLALCHEMY_ENGINE = engine
+        # Cache the result of _sqlite_supports_returning()
+        # ahead of time, as it won't change throughout
+        # the lifetime of the engine.
+        _sqlite_supports_returning()
        return _SQLALCHEMY_ENGINE
 
 
+def _init_db_async(func):
+    """Initialize the async database."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        if _SQLALCHEMY_ENGINE_ASYNC is None:
+            # this may happen multiple times since there is no locking
+            # here but thats fine, this is just a short circuit for the
+            # common case.
+            await context_utils.to_thread(initialize_and_get_db_async)
+
+        return await func(*args, **kwargs)
+
+    return wrapper
+
+
 def _init_db(func):
     """Initialize the database."""
 
@@ -329,19 +378,51 @@ def _init_db(func):
     return wrapper
 
 
+@annotations.lru_cache(scope='global', maxsize=1)
+def _sqlite_supports_returning() -> bool:
+    """Check if SQLite (3.35.0+) and SQLAlchemy (2.0+) support RETURNING.
+
+    See https://sqlite.org/lang_returning.html and
+    https://docs.sqlalchemy.org/en/20/dialects/sqlite.html#insert-update-delete-returning  # pylint: disable=line-too-long
+    """
+    sqlalchemy_version_parts = sqlalchemy.__version__.split('.')
+    assert len(sqlalchemy_version_parts) >= 1, \
+        f'Invalid SQLAlchemy version: {sqlalchemy.__version__}'
+    sqlalchemy_major = int(sqlalchemy_version_parts[0])
+    if sqlalchemy_major < 2:
+        return False
+
+    assert _SQLALCHEMY_ENGINE is not None
+    if (_SQLALCHEMY_ENGINE.dialect.name !=
+            db_utils.SQLAlchemyDialect.SQLITE.value):
+        return False
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        result = session.execute(sqlalchemy.text('SELECT sqlite_version()'))
+        version_str = result.scalar()
+        version_parts = version_str.split('.')
+        assert len(version_parts) >= 2, \
+            f'Invalid version string: {version_str}'
+        major, minor = int(version_parts[0]), int(version_parts[1])
+        return (major > 3) or (major == 3 and minor >= 35)
+
+
 @_init_db
 @metrics_lib.time_me
-def add_or_update_user(user: models.User,
-                       allow_duplicate_name: bool = True) -> bool:
+def add_or_update_user(
+    user: models.User,
+    allow_duplicate_name: bool = True,
+    return_user: bool = False
+) -> typing.Union[bool, typing.Tuple[bool, models.User]]:
     """Store the mapping from user hash to user name for display purposes.
 
     Returns:
-        Boolean: whether the user is newly added
+        If return_user=False: bool (whether the user is newly added)
+        If return_user=True: Tuple[bool, models.User]
     """
    assert _SQLALCHEMY_ENGINE is not None
 
    if user.name is None:
-        return False
+        return (False, user) if return_user else False
 
    # Set created_at if not already set
    created_at = user.created_at
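Note: the `_sqlite_supports_returning()` gate above is what later lets `add_or_update_user` use SQL RETURNING instead of a rowcount check plus a re-read. The check needs both SQLAlchemy 2.0+ and SQLite 3.35.0+. A minimal standalone sketch of the same version gate, assuming only SQLAlchemy 2.x and a throwaway in-memory SQLite engine (`supports_returning` is an illustrative name, not SkyPilot's API):

import sqlalchemy


def supports_returning(engine: sqlalchemy.engine.Engine) -> bool:
    # RETURNING support landed in SQLAlchemy 2.0 on the library side...
    if int(sqlalchemy.__version__.split('.')[0]) < 2:
        return False
    # ...this gate only concerns the SQLite dialect...
    if engine.dialect.name != 'sqlite':
        return False
    # ...and SQLite gained RETURNING in 3.35.0 on the database side.
    with engine.connect() as conn:
        version = conn.execute(
            sqlalchemy.text('SELECT sqlite_version()')).scalar()
    major, minor = (int(part) for part in version.split('.')[:2])
    return (major, minor) >= (3, 35)


engine = sqlalchemy.create_engine('sqlite:///:memory:')
print(supports_returning(engine))  # True on SQLite >= 3.35 with SQLAlchemy 2.x

Caching the answer once per engine, as the diff does with `annotations.lru_cache`, avoids re-running `SELECT sqlite_version()` on every write.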
@@ -353,7 +434,7 @@ def add_or_update_user(user: models.User,
            existing_user = session.query(user_table).filter(
                user_table.c.name == user.name).first()
            if existing_user is not None:
-                return False
+                return (False, user) if return_user else False
 
    if (_SQLALCHEMY_ENGINE.dialect.name ==
            db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -367,24 +448,57 @@ def add_or_update_user(user: models.User,
                name=user.name,
                password=user.password,
                created_at=created_at)
+            use_returning = return_user and _sqlite_supports_returning()
+            if use_returning:
+                insert_stmnt = insert_stmnt.returning(
+                    user_table.c.id,
+                    user_table.c.name,
+                    user_table.c.password,
+                    user_table.c.created_at,
+                )
            result = session.execute(insert_stmnt)
 
-            # Check if the INSERT actually inserted a row
-            was_inserted = result.rowcount > 0
+            row = None
+            if use_returning:
+                # With RETURNING, check if we got a row back.
+                row = result.fetchone()
+                was_inserted = row is not None
+            else:
+                # Without RETURNING, use rowcount.
+                was_inserted = result.rowcount > 0
 
            if not was_inserted:
                # User existed, so update it (but don't update created_at)
+                update_values = {user_table.c.name: user.name}
                if user.password:
-                    session.query(user_table).filter_by(id=user.id).update({
-                        user_table.c.name: user.name,
-                        user_table.c.password: user.password
-                    })
-                else:
-                    session.query(user_table).filter_by(id=user.id).update(
-                        {user_table.c.name: user.name})
+                    update_values[user_table.c.password] = user.password
+
+                update_stmnt = sqlalchemy.update(user_table).where(
+                    user_table.c.id == user.id).values(update_values)
+                if use_returning:
+                    update_stmnt = update_stmnt.returning(
+                        user_table.c.id, user_table.c.name,
+                        user_table.c.password, user_table.c.created_at)
+
+                result = session.execute(update_stmnt)
+                if use_returning:
+                    row = result.fetchone()
 
            session.commit()
-            return was_inserted
+
+            if return_user:
+                if row is None:
+                    # row=None means the sqlite used has no RETURNING support,
+                    # so we need to do a separate query
+                    row = session.query(user_table).filter_by(
+                        id=user.id).first()
+                updated_user = models.User(id=row.id,
+                                           name=row.name,
+                                           password=row.password,
+                                           created_at=row.created_at)
+                return was_inserted, updated_user
+            else:
+                return was_inserted
 
    elif (_SQLALCHEMY_ENGINE.dialect.name ==
          db_utils.SQLAlchemyDialect.POSTGRESQL.value):
@@ -409,6 +523,9 @@ def add_or_update_user(user: models.User,
            upsert_stmnt = insert_stmnt.on_conflict_do_update(
                index_elements=[user_table.c.id], set_=set_).returning(
                    user_table.c.id,
+                    user_table.c.name,
+                    user_table.c.password,
+                    user_table.c.created_at,
                    # This will be True for INSERT, False for UPDATE
                    sqlalchemy.literal_column('(xmax = 0)').label('was_inserted'
                                                                  ))
@@ -416,10 +533,17 @@ def add_or_update_user(user: models.User,
            result = session.execute(upsert_stmnt)
            row = result.fetchone()
 
-            ret = bool(row.was_inserted) if row else False
+            was_inserted = bool(row.was_inserted) if row else False
            session.commit()
 
-            return ret
+            if return_user:
+                updated_user = models.User(id=row.id,
+                                           name=row.name,
+                                           password=row.password,
+                                           created_at=row.created_at)
+                return was_inserted, updated_user
+            else:
+                return was_inserted
    else:
        raise ValueError('Unsupported database dialect')
 
@@ -440,7 +564,7 @@ def get_user(user_id: str) -> Optional[models.User]:
 
 @_init_db
 @metrics_lib.time_me
-def _get_users(user_ids: Set[str]) -> Dict[str, models.User]:
+def get_users(user_ids: Set[str]) -> Dict[str, models.User]:
    assert _SQLALCHEMY_ENGINE is not None
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        rows = session.query(user_table).filter(
@@ -512,7 +636,8 @@ def add_or_update_cluster(cluster_name: str,
                          config_hash: Optional[str] = None,
                          task_config: Optional[Dict[str, Any]] = None,
                          is_managed: bool = False,
-                          provision_log_path: Optional[str] = None):
+                          provision_log_path: Optional[str] = None,
+                          existing_cluster_hash: Optional[str] = None):
    """Adds or updates cluster_name -> cluster_handle mapping.
 
    Args:
@@ -528,8 +653,12 @@ def add_or_update_cluster(cluster_name: str,
        is_managed: Whether the cluster is launched by the
            controller.
        provision_log_path: Absolute path to provision.log, if available.
+        existing_cluster_hash: If specified, the cluster will be updated
+            only if the cluster_hash matches. If a cluster does not exist,
+            it will not be inserted and an error will be raised.
    """
    assert _SQLALCHEMY_ENGINE is not None
+
    # FIXME: launched_at will be changed when `sky launch -c` is called.
    handle = pickle.dumps(cluster_handle)
    cluster_launched_at = int(time.time()) if is_launch else None
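Note: on the PostgreSQL branch of `add_or_update_user` above, the upsert returns `(xmax = 0)` as `was_inserted`; `xmax` is PostgreSQL's row-version system column and is 0 only for a row freshly created by the statement, so the flag is True on the INSERT path and False when `ON CONFLICT` updated an existing row. A sketch of the same trick against a hypothetical `users` table (the statement is only compiled here, so no live database is needed; this is not SkyPilot code):

import sqlalchemy
from sqlalchemy.dialects import postgresql

metadata = sqlalchemy.MetaData()
users = sqlalchemy.Table(
    'users', metadata,
    sqlalchemy.Column('id', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('name', sqlalchemy.Text))

stmt = postgresql.insert(users).values(id='u1', name='alice')
stmt = stmt.on_conflict_do_update(
    index_elements=[users.c.id],
    set_={users.c.name: 'alice'},
).returning(
    users.c.id,
    # xmax is 0 only for a row version created by this statement, so the
    # label reads True on INSERT and False when the conflict branch
    # updated an existing row.
    sqlalchemy.literal_column('(xmax = 0)').label('was_inserted'))

# Render the SQL without needing a live database; with a real engine:
#   row = conn.execute(stmt).one(); row.was_inserted
print(stmt.compile(dialect=postgresql.dialect()))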
@@ -625,32 +754,44 @@ def add_or_update_cluster(cluster_name: str,
            session.rollback()
            raise ValueError('Unsupported database dialect')
 
-        insert_stmnt = insert_func(cluster_table).values(
-            name=cluster_name,
-            **conditional_values,
-            handle=handle,
-            status=status.value,
-            # set metadata to server default ('{}')
-            # set owner to server default (null)
-            cluster_hash=cluster_hash,
-            # set storage_mounts_metadata to server default (null)
-            status_updated_at=status_updated_at,
-            is_managed=int(is_managed),
-        )
-        do_update_stmt = insert_stmnt.on_conflict_do_update(
-            index_elements=[cluster_table.c.name],
-            set_={
+        if existing_cluster_hash is not None:
+            count = session.query(cluster_table).filter_by(
+                name=cluster_name, cluster_hash=existing_cluster_hash).update({
+                    **conditional_values, cluster_table.c.handle: handle,
+                    cluster_table.c.status: status.value,
+                    cluster_table.c.status_updated_at: status_updated_at
+                })
+            assert count <= 1
+            if count == 0:
+                raise ValueError(f'Cluster {cluster_name} with hash '
+                                 f'{existing_cluster_hash} not found.')
+        else:
+            insert_stmnt = insert_func(cluster_table).values(
+                name=cluster_name,
                **conditional_values,
-                cluster_table.c.handle: handle,
-                cluster_table.c.status: status.value,
-                # do not update metadata value
-                # do not update owner value
-                cluster_table.c.cluster_hash: cluster_hash,
-                # do not update storage_mounts_metadata
-                cluster_table.c.status_updated_at: status_updated_at,
-                # do not update user_hash
-            })
-        session.execute(do_update_stmt)
+                handle=handle,
+                status=status.value,
+                # set metadata to server default ('{}')
+                # set owner to server default (null)
+                cluster_hash=cluster_hash,
+                # set storage_mounts_metadata to server default (null)
+                status_updated_at=status_updated_at,
+                is_managed=int(is_managed),
+            )
+            insert_or_update_stmt = insert_stmnt.on_conflict_do_update(
+                index_elements=[cluster_table.c.name],
+                set_={
+                    **conditional_values,
+                    cluster_table.c.handle: handle,
+                    cluster_table.c.status: status.value,
+                    # do not update metadata value
+                    # do not update owner value
+                    cluster_table.c.cluster_hash: cluster_hash,
+                    # do not update storage_mounts_metadata
+                    cluster_table.c.status_updated_at: status_updated_at,
+                    # do not update user_hash
+                })
+            session.execute(insert_or_update_stmt)
 
        # Modify cluster history table
        launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
@@ -668,6 +809,10 @@ def add_or_update_cluster(cluster_name: str,
                conditional_values.get('last_creation_command'),
        }
 
+        # Calculate last_activity_time and launched_at from usage_intervals
+        last_activity_time = _get_cluster_last_activity_time(usage_intervals)
+        launched_at = _get_cluster_launch_time(usage_intervals)
+
        insert_stmnt = insert_func(cluster_history_table).values(
            cluster_hash=cluster_hash,
            name=cluster_name,
@@ -678,6 +823,8 @@ def add_or_update_cluster(cluster_name: str,
            user_hash=user_hash,
            workspace=history_workspace,
            provision_log_path=provision_log_path,
+            last_activity_time=last_activity_time,
+            launched_at=launched_at,
            **creation_info,
        )
        do_update_stmt = insert_stmnt.on_conflict_do_update(
@@ -694,6 +841,8 @@ def add_or_update_cluster(cluster_name: str,
                cluster_history_table.c.user_hash: history_hash,
                cluster_history_table.c.workspace: history_workspace,
                cluster_history_table.c.provision_log_path: provision_log_path,
+                cluster_history_table.c.last_activity_time: last_activity_time,
+                cluster_history_table.c.launched_at: launched_at,
                **creation_info,
            })
        session.execute(do_update_stmt)
@@ -990,7 +1139,8 @@ def get_handle_from_cluster_name(
    assert _SQLALCHEMY_ENGINE is not None
    assert cluster_name is not None, 'cluster_name cannot be None'
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        row = (session.query(
+            cluster_table.c.handle).filter_by(name=cluster_name).first())
    if row is None:
        return None
    return pickle.loads(row.handle)
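Note: the `existing_cluster_hash` branch of `add_or_update_cluster` above is a compare-and-swap. The UPDATE filters on both the cluster name and the hash read earlier, so a row count of zero means a concurrent writer replaced the cluster and the caller gets an error instead of silently clobbering it. A minimal sketch of that pattern on a hypothetical `clusters` table (in-memory SQLite; all names illustrative, not SkyPilot code):

import sqlalchemy

metadata = sqlalchemy.MetaData()
clusters = sqlalchemy.Table(
    'clusters', metadata,
    sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('cluster_hash', sqlalchemy.Text),
    sqlalchemy.Column('status', sqlalchemy.Text))

engine = sqlalchemy.create_engine('sqlite:///:memory:')
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(clusters.insert().values(
        name='c1', cluster_hash='h-old', status='INIT'))


def update_if_hash_matches(name: str, expected_hash: str, status: str) -> None:
    with engine.begin() as conn:
        result = conn.execute(
            sqlalchemy.update(clusters).where(
                clusters.c.name == name,
                clusters.c.cluster_hash == expected_hash,
            ).values(status=status))
        # rowcount == 0 means the stored hash no longer matches, i.e. a
        # concurrent writer replaced the cluster; surface that to the caller.
        if result.rowcount == 0:
            raise ValueError(f'Cluster {name} with hash {expected_hash} '
                             'not found.')


update_if_hash_matches('c1', 'h-old', 'UP')  # succeeds
try:
    update_if_hash_matches('c1', 'h-stale', 'DOWN')  # stale hash
except ValueError as e:
    print(e)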
@@ -998,21 +1148,95 @@ def get_handle_from_cluster_name(
998
1148
 
999
1149
  @_init_db
1000
1150
  @metrics_lib.time_me
1001
- def get_glob_cluster_names(cluster_name: str) -> List[str]:
1151
+ def get_handles_from_cluster_names(
1152
+ cluster_names: Set[str]
1153
+ ) -> Dict[str, Optional['backends.ResourceHandle']]:
1154
+ assert _SQLALCHEMY_ENGINE is not None
1155
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1156
+ rows = session.query(cluster_table.c.name,
1157
+ cluster_table.c.handle).filter(
1158
+ cluster_table.c.name.in_(cluster_names)).all()
1159
+ return {
1160
+ row.name: pickle.loads(row.handle) if row is not None else None
1161
+ for row in rows
1162
+ }
1163
+
1164
+
1165
+ @_init_db
1166
+ @metrics_lib.time_me
1167
+ def get_cluster_name_to_handle_map(
1168
+ is_managed: Optional[bool] = None,
1169
+ ) -> Dict[str, Optional['backends.ResourceHandle']]:
1170
+ assert _SQLALCHEMY_ENGINE is not None
1171
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1172
+ query = session.query(cluster_table.c.name, cluster_table.c.handle)
1173
+ if is_managed is not None:
1174
+ query = query.filter(cluster_table.c.is_managed == int(is_managed))
1175
+ rows = query.all()
1176
+ name_to_handle = {}
1177
+ for row in rows:
1178
+ if row.handle and len(row.handle) > 0:
1179
+ name_to_handle[row.name] = pickle.loads(row.handle)
1180
+ else:
1181
+ name_to_handle[row.name] = None
1182
+ return name_to_handle
1183
+
1184
+
1185
+ @_init_db_async
1186
+ @metrics_lib.time_me
1187
+ async def get_status_from_cluster_name_async(
1188
+ cluster_name: str) -> Optional[status_lib.ClusterStatus]:
1189
+ """Get the status of a cluster."""
1190
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1191
+ assert cluster_name is not None, 'cluster_name cannot be None'
1192
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1193
+ result = await session.execute(
1194
+ sqlalchemy.select(cluster_table.c.status).where(
1195
+ cluster_table.c.name == cluster_name))
1196
+ row = result.first()
1197
+
1198
+ if row is None:
1199
+ return None
1200
+ return status_lib.ClusterStatus(row[0])
1201
+
1202
+
1203
+ @_init_db
1204
+ @metrics_lib.time_me
1205
+ def get_status_from_cluster_name(
1206
+ cluster_name: str) -> Optional[status_lib.ClusterStatus]:
1207
+ assert _SQLALCHEMY_ENGINE is not None
1208
+ assert cluster_name is not None, 'cluster_name cannot be None'
1209
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1210
+ row = session.query(
1211
+ cluster_table.c.status).filter_by(name=cluster_name).first()
1212
+ if row is None:
1213
+ return None
1214
+ return status_lib.ClusterStatus[row.status]
1215
+
1216
+
1217
+ @_init_db
1218
+ @metrics_lib.time_me
1219
+ def get_glob_cluster_names(
1220
+ cluster_name: str,
1221
+ workspaces_filter: Optional[Set[str]] = None) -> List[str]:
1002
1222
  assert _SQLALCHEMY_ENGINE is not None
1003
1223
  assert cluster_name is not None, 'cluster_name cannot be None'
1004
1224
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1005
1225
  if (_SQLALCHEMY_ENGINE.dialect.name ==
1006
1226
  db_utils.SQLAlchemyDialect.SQLITE.value):
1007
- rows = session.query(cluster_table).filter(
1008
- cluster_table.c.name.op('GLOB')(cluster_name)).all()
1227
+ query = session.query(cluster_table.c.name).filter(
1228
+ cluster_table.c.name.op('GLOB')(cluster_name))
1009
1229
  elif (_SQLALCHEMY_ENGINE.dialect.name ==
1010
1230
  db_utils.SQLAlchemyDialect.POSTGRESQL.value):
1011
- rows = session.query(cluster_table).filter(
1231
+ query = session.query(cluster_table.c.name).filter(
1012
1232
  cluster_table.c.name.op('SIMILAR TO')(
1013
- _glob_to_similar(cluster_name))).all()
1233
+ _glob_to_similar(cluster_name)))
1014
1234
  else:
1015
1235
  raise ValueError('Unsupported database dialect')
1236
+ if workspaces_filter is not None:
1237
+ query = query.filter(
1238
+ cluster_table.c.workspace.in_(workspaces_filter))
1239
+ rows = query.all()
1016
1240
  return [row.name for row in rows]
1017
1241
 
1018
1242
 
@@ -1056,7 +1280,8 @@ def set_cluster_autostop_value(cluster_name: str, idle_minutes: int,
1056
1280
  def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
1057
1281
  assert _SQLALCHEMY_ENGINE is not None
1058
1282
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1059
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1283
+ row = session.query(
1284
+ cluster_table.c.launched_at).filter_by(name=cluster_name).first()
1060
1285
  if row is None or row.launched_at is None:
1061
1286
  return None
1062
1287
  return int(row.launched_at)
@@ -1067,7 +1292,8 @@ def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
1067
1292
  def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
1068
1293
  assert _SQLALCHEMY_ENGINE is not None
1069
1294
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1070
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1295
+ row = session.query(
1296
+ cluster_table.c.metadata).filter_by(name=cluster_name).first()
1071
1297
  if row is None or row.metadata is None:
1072
1298
  return None
1073
1299
  return json.loads(row.metadata)
@@ -1147,7 +1373,8 @@ def get_cluster_storage_mounts_metadata(
1147
1373
  cluster_name: str) -> Optional[Dict[str, Any]]:
1148
1374
  assert _SQLALCHEMY_ENGINE is not None
1149
1375
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1150
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1376
+ row = (session.query(cluster_table.c.storage_mounts_metadata).filter_by(
1377
+ name=cluster_name).first())
1151
1378
  if row is None or row.storage_mounts_metadata is None:
1152
1379
  return None
1153
1380
  return pickle.loads(row.storage_mounts_metadata)
@@ -1170,6 +1397,39 @@ def set_cluster_storage_mounts_metadata(
1170
1397
  raise ValueError(f'Cluster {cluster_name} not found.')
1171
1398
 
1172
1399
 
1400
+ @_init_db
1401
+ @metrics_lib.time_me
1402
+ def get_cluster_skylet_ssh_tunnel_metadata(
1403
+ cluster_name: str) -> Optional[Tuple[int, int]]:
1404
+ assert _SQLALCHEMY_ENGINE is not None
1405
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1406
+ row = session.query(
1407
+ cluster_table.c.skylet_ssh_tunnel_metadata).filter_by(
1408
+ name=cluster_name).first()
1409
+ if row is None or row.skylet_ssh_tunnel_metadata is None:
1410
+ return None
1411
+ return pickle.loads(row.skylet_ssh_tunnel_metadata)
1412
+
1413
+
1414
+ @_init_db
1415
+ @metrics_lib.time_me
1416
+ def set_cluster_skylet_ssh_tunnel_metadata(
1417
+ cluster_name: str,
1418
+ skylet_ssh_tunnel_metadata: Optional[Tuple[int, int]]) -> None:
1419
+ assert _SQLALCHEMY_ENGINE is not None
1420
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1421
+ value = pickle.dumps(
1422
+ skylet_ssh_tunnel_metadata
1423
+ ) if skylet_ssh_tunnel_metadata is not None else None
1424
+ count = session.query(cluster_table).filter_by(
1425
+ name=cluster_name).update(
1426
+ {cluster_table.c.skylet_ssh_tunnel_metadata: value})
1427
+ session.commit()
1428
+ assert count <= 1, count
1429
+ if count == 0:
1430
+ raise ValueError(f'Cluster {cluster_name} not found.')
1431
+
1432
+
1173
1433
  @_init_db
1174
1434
  @metrics_lib.time_me
1175
1435
  def _get_cluster_usage_intervals(
@@ -1179,23 +1439,24 @@ def _get_cluster_usage_intervals(
1179
1439
  if cluster_hash is None:
1180
1440
  return None
1181
1441
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1182
- row = session.query(cluster_history_table).filter_by(
1442
+ row = session.query(cluster_history_table.c.usage_intervals).filter_by(
1183
1443
  cluster_hash=cluster_hash).first()
1184
1444
  if row is None or row.usage_intervals is None:
1185
1445
  return None
1186
1446
  return pickle.loads(row.usage_intervals)
1187
1447
 
1188
1448
 
1189
- def _get_cluster_launch_time(cluster_hash: str) -> Optional[int]:
1190
- usage_intervals = _get_cluster_usage_intervals(cluster_hash)
1449
+ def _get_cluster_launch_time(
1450
+ usage_intervals: Optional[List[Tuple[int,
1451
+ Optional[int]]]]) -> Optional[int]:
1191
1452
  if usage_intervals is None:
1192
1453
  return None
1193
1454
  return usage_intervals[0][0]
1194
1455
 
1195
1456
 
1196
- def _get_cluster_duration(cluster_hash: str) -> int:
1457
+ def _get_cluster_duration(
1458
+ usage_intervals: Optional[List[Tuple[int, Optional[int]]]]) -> int:
1197
1459
  total_duration = 0
1198
- usage_intervals = _get_cluster_usage_intervals(cluster_hash)
1199
1460
 
1200
1461
  if usage_intervals is None:
1201
1462
  return total_duration
@@ -1212,17 +1473,33 @@ def _get_cluster_duration(cluster_hash: str) -> int:
1212
1473
  return total_duration
1213
1474
 
1214
1475
 
1476
+ def _get_cluster_last_activity_time(
1477
+ usage_intervals: Optional[List[Tuple[int,
1478
+ Optional[int]]]]) -> Optional[int]:
1479
+ last_activity_time = None
1480
+ if usage_intervals:
1481
+ last_interval = usage_intervals[-1]
1482
+ last_activity_time = (last_interval[1] if last_interval[1] is not None
1483
+ else last_interval[0])
1484
+ return last_activity_time
1485
+
1486
+
1215
1487
  @_init_db
1216
1488
  @metrics_lib.time_me
1217
1489
  def _set_cluster_usage_intervals(
1218
1490
  cluster_hash: str, usage_intervals: List[Tuple[int,
1219
1491
  Optional[int]]]) -> None:
1220
1492
  assert _SQLALCHEMY_ENGINE is not None
1493
+
1494
+ # Calculate last_activity_time from usage_intervals
1495
+ last_activity_time = _get_cluster_last_activity_time(usage_intervals)
1496
+
1221
1497
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1222
1498
  count = session.query(cluster_history_table).filter_by(
1223
1499
  cluster_hash=cluster_hash).update({
1224
1500
  cluster_history_table.c.usage_intervals:
1225
- pickle.dumps(usage_intervals)
1501
+ pickle.dumps(usage_intervals),
1502
+ cluster_history_table.c.last_activity_time: last_activity_time,
1226
1503
  })
1227
1504
  session.commit()
1228
1505
  assert count <= 1, count
@@ -1253,7 +1530,8 @@ def set_owner_identity_for_cluster(cluster_name: str,
1253
1530
  def _get_hash_for_existing_cluster(cluster_name: str) -> Optional[str]:
1254
1531
  assert _SQLALCHEMY_ENGINE is not None
1255
1532
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1256
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1533
+ row = (session.query(
1534
+ cluster_table.c.cluster_hash).filter_by(name=cluster_name).first())
1257
1535
  if row is None or row.cluster_hash is None:
1258
1536
  return None
1259
1537
  return row.cluster_hash
@@ -1265,8 +1543,10 @@ def get_launched_resources_from_cluster_hash(
1265
1543
  cluster_hash: str) -> Optional[Tuple[int, Any]]:
1266
1544
  assert _SQLALCHEMY_ENGINE is not None
1267
1545
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1268
- row = session.query(cluster_history_table).filter_by(
1269
- cluster_hash=cluster_hash).first()
1546
+ row = session.query(
1547
+ cluster_history_table.c.num_nodes,
1548
+ cluster_history_table.c.launched_resources).filter_by(
1549
+ cluster_hash=cluster_hash).first()
1270
1550
  if row is None:
1271
1551
  return None
1272
1552
  num_nodes = row.num_nodes
@@ -1310,17 +1590,46 @@ def _load_storage_mounts_metadata(
1310
1590
  @metrics_lib.time_me
1311
1591
  @context_utils.cancellation_guard
1312
1592
  def get_cluster_from_name(
1313
- cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
1314
- assert _SQLALCHEMY_ENGINE is not None
1315
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1316
- row = session.query(cluster_table).filter_by(name=cluster_name).first()
1593
+ cluster_name: Optional[str],
1594
+ *,
1595
+ include_user_info: bool = True,
1596
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
1597
+ assert _SQLALCHEMY_ENGINE is not None
1598
+ query_fields = [
1599
+ cluster_table.c.name,
1600
+ cluster_table.c.launched_at,
1601
+ cluster_table.c.handle,
1602
+ cluster_table.c.last_use,
1603
+ cluster_table.c.status,
1604
+ cluster_table.c.autostop,
1605
+ cluster_table.c.to_down,
1606
+ cluster_table.c.owner,
1607
+ cluster_table.c.metadata,
1608
+ cluster_table.c.cluster_hash,
1609
+ cluster_table.c.cluster_ever_up,
1610
+ cluster_table.c.status_updated_at,
1611
+ cluster_table.c.user_hash,
1612
+ cluster_table.c.config_hash,
1613
+ cluster_table.c.workspace,
1614
+ cluster_table.c.is_managed,
1615
+ ]
1616
+ if not summary_response:
1617
+ query_fields.extend([
1618
+ cluster_table.c.last_creation_yaml,
1619
+ cluster_table.c.last_creation_command,
1620
+ ])
1621
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1622
+ query = session.query(*query_fields)
1623
+ row = query.filter_by(name=cluster_name).first()
1317
1624
  if row is None:
1318
1625
  return None
1319
- user_hash = _get_user_hash_or_current_user(row.user_hash)
1320
- user = get_user(user_hash)
1321
- user_name = user.name if user is not None else None
1322
- last_event = get_last_cluster_event(
1323
- row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
1626
+ if include_user_info:
1627
+ user_hash = _get_user_hash_or_current_user(row.user_hash)
1628
+ user = get_user(user_hash)
1629
+ user_name = user.name if user is not None else None
1630
+ if not summary_response:
1631
+ last_event = get_last_cluster_event(
1632
+ row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
1324
1633
  # TODO: use namedtuple instead of dict
1325
1634
  record = {
1326
1635
  'name': row.name,
@@ -1333,30 +1642,45 @@ def get_cluster_from_name(
1333
1642
  'owner': _load_owner(row.owner),
1334
1643
  'metadata': json.loads(row.metadata),
1335
1644
  'cluster_hash': row.cluster_hash,
1336
- 'storage_mounts_metadata': _load_storage_mounts_metadata(
1337
- row.storage_mounts_metadata),
1338
1645
  'cluster_ever_up': bool(row.cluster_ever_up),
1339
1646
  'status_updated_at': row.status_updated_at,
1340
- 'user_hash': user_hash,
1341
- 'user_name': user_name,
1342
- 'config_hash': row.config_hash,
1343
1647
  'workspace': row.workspace,
1344
- 'last_creation_yaml': row.last_creation_yaml,
1345
- 'last_creation_command': row.last_creation_command,
1346
1648
  'is_managed': bool(row.is_managed),
1347
- 'last_event': last_event,
1649
+ 'config_hash': row.config_hash,
1348
1650
  }
1651
+ if not summary_response:
1652
+ record['last_creation_yaml'] = row.last_creation_yaml
1653
+ record['last_creation_command'] = row.last_creation_command
1654
+ record['last_event'] = last_event
1655
+ if include_user_info:
1656
+ record['user_hash'] = user_hash
1657
+ record['user_name'] = user_name
1349
1658
 
1350
1659
  return record
1351
1660
 
1352
1661
 
1662
+ @_init_db
1663
+ @metrics_lib.time_me
1664
+ @context_utils.cancellation_guard
1665
+ def cluster_with_name_exists(cluster_name: str) -> bool:
1666
+ assert _SQLALCHEMY_ENGINE is not None
1667
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1668
+ row = session.query(
1669
+ cluster_table.c.name).filter_by(name=cluster_name).first()
1670
+ if row is None:
1671
+ return False
1672
+ return True
1673
+
1674
+
1353
1675
  @_init_db
1354
1676
  @metrics_lib.time_me
1355
1677
  def get_clusters(
1356
1678
  *, # keyword only separator
1357
1679
  exclude_managed_clusters: bool = False,
1358
- workspaces_filter: Optional[Set[str]] = None,
1680
+ workspaces_filter: Optional[Dict[str, Any]] = None,
1359
1681
  user_hashes_filter: Optional[Set[str]] = None,
1682
+ cluster_names: Optional[List[str]] = None,
1683
+ summary_response: bool = False,
1360
1684
  ) -> List[Dict[str, Any]]:
1361
1685
  """Get clusters from the database.
1362
1686
 
@@ -1367,13 +1691,41 @@ def get_clusters(
1367
1691
  that has workspace field set to one of the values.
1368
1692
  user_hashes_filter: If specified, only include clusters
1369
1693
  that has user_hash field set to one of the values.
1694
+ cluster_names: If specified, only include clusters
1695
+ that has name field set to one of the values.
1370
1696
  """
1371
1697
  # is a cluster has a null user_hash,
1372
1698
  # we treat it as belonging to the current user.
1373
1699
  current_user_hash = common_utils.get_user_hash()
1374
1700
  assert _SQLALCHEMY_ENGINE is not None
1375
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1376
- query = session.query(cluster_table)
1701
+ query_fields = [
1702
+ cluster_table.c.name,
1703
+ cluster_table.c.launched_at,
1704
+ cluster_table.c.handle,
1705
+ cluster_table.c.status,
1706
+ cluster_table.c.autostop,
1707
+ cluster_table.c.to_down,
1708
+ cluster_table.c.cluster_hash,
1709
+ cluster_table.c.cluster_ever_up,
1710
+ cluster_table.c.user_hash,
1711
+ cluster_table.c.workspace,
1712
+ user_table.c.name.label('user_name'),
1713
+ ]
1714
+ if not summary_response:
1715
+ query_fields.extend([
1716
+ cluster_table.c.last_creation_yaml,
1717
+ cluster_table.c.last_creation_command,
1718
+ cluster_table.c.config_hash,
1719
+ cluster_table.c.owner,
1720
+ cluster_table.c.metadata,
1721
+ cluster_table.c.last_use,
1722
+ cluster_table.c.status_updated_at,
1723
+ ])
1724
+ if not exclude_managed_clusters:
1725
+ query_fields.append(cluster_table.c.is_managed)
1726
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1727
+ query = session.query(*query_fields).outerjoin(
1728
+ user_table, cluster_table.c.user_hash == user_table.c.id)
1377
1729
  if exclude_managed_clusters:
1378
1730
  query = query.filter(cluster_table.c.is_managed == int(False))
1379
1731
  if workspaces_filter is not None:
@@ -1385,71 +1737,84 @@ def get_clusters(
1385
1737
  # If current_user_hash is in user_hashes_filter, we include
1386
1738
  # clusters that have a null user_hash.
1387
1739
  query = query.filter(
1388
- cluster_table.c.user_hash.in_(user_hashes_filter) |
1389
- (cluster_table.c.user_hash is None))
1740
+ (cluster_table.c.user_hash.in_(user_hashes_filter) |
1741
+ (cluster_table.c.user_hash is None)))
1390
         else:
             query = query.filter(
                 cluster_table.c.user_hash.in_(user_hashes_filter))
+        if cluster_names is not None:
+            query = query.filter(cluster_table.c.name.in_(cluster_names))
         query = query.order_by(sqlalchemy.desc(cluster_table.c.launched_at))
         rows = query.all()
         records = []

-        # get user hash for each row
-        row_to_user_hash = {}
-        for row in rows:
-            user_hash = (row.user_hash
-                         if row.user_hash is not None else current_user_hash)
-            row_to_user_hash[row.cluster_hash] = user_hash
-
-        # get all users needed for the rows at once
-        user_hashes = set(row_to_user_hash.values())
-        user_hash_to_user = _get_users(user_hashes)
+        # Check if we need to fetch the current user's name,
+        # for backwards compatibility, if user_hash is None.
+        current_user_name = None
+        needs_current_user = any(row.user_hash is None for row in rows)
+        if needs_current_user:
+            current_user = get_user(current_user_hash)
+            current_user_name = (current_user.name
+                                 if current_user is not None else None)

         # get last cluster event for each row
-        cluster_hashes = set(row_to_user_hash.keys())
-        last_cluster_event_dict = _get_last_cluster_event_multiple(
-            cluster_hashes, ClusterEventType.STATUS_CHANGE)
+        if not summary_response:
+            cluster_hashes = {row.cluster_hash for row in rows}
+            last_cluster_event_dict = _get_last_cluster_event_multiple(
+                cluster_hashes, ClusterEventType.STATUS_CHANGE)

-        # get user for each row
         for row in rows:
-            user_hash = row_to_user_hash[row.cluster_hash]
-            user = user_hash_to_user.get(user_hash, None)
-            user_name = user.name if user is not None else None
-            last_event = last_cluster_event_dict.get(row.cluster_hash, None)
             # TODO: use namedtuple instead of dict
             record = {
                 'name': row.name,
                 'launched_at': row.launched_at,
                 'handle': pickle.loads(row.handle),
-                'last_use': row.last_use,
                 'status': status_lib.ClusterStatus[row.status],
                 'autostop': row.autostop,
                 'to_down': bool(row.to_down),
-                'owner': _load_owner(row.owner),
-                'metadata': json.loads(row.metadata),
                 'cluster_hash': row.cluster_hash,
-                'storage_mounts_metadata': _load_storage_mounts_metadata(
-                    row.storage_mounts_metadata),
                 'cluster_ever_up': bool(row.cluster_ever_up),
-                'status_updated_at': row.status_updated_at,
-                'user_hash': user_hash,
-                'user_name': user_name,
-                'config_hash': row.config_hash,
+                'user_hash': (row.user_hash if row.user_hash is not None else
+                              current_user_hash),
+                'user_name': (row.user_name if row.user_name is not None else
+                              current_user_name),
                 'workspace': row.workspace,
-                'last_creation_yaml': row.last_creation_yaml,
-                'last_creation_command': row.last_creation_command,
-                'is_managed': bool(row.is_managed),
-                'last_event': last_event,
+                'is_managed': (False if exclude_managed_clusters else
+                               bool(row.is_managed)),
             }
+            if not summary_response:
+                record['last_creation_yaml'] = row.last_creation_yaml
+                record['last_creation_command'] = row.last_creation_command
+                record['last_event'] = last_cluster_event_dict.get(
+                    row.cluster_hash, None)
+                record['config_hash'] = row.config_hash
+                record['owner'] = _load_owner(row.owner)
+                record['metadata'] = json.loads(row.metadata)
+                record['last_use'] = row.last_use
+                record['status_updated_at'] = row.status_updated_at

             records.append(record)
         return records


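With `summary_response=True`, `get_clusters` now selects only the lightweight columns and skips the per-cluster event lookup entirely; heavy fields (`last_creation_yaml`, `metadata`, `last_use`, `last_event`, ...) are simply absent from the returned dicts. A caller sketch under that assumption; the cluster names below are placeholders:

from sky import global_user_state

# Summary records for two specific clusters. Heavy fields are omitted,
# so index only the columns listed in query_fields above.
records = global_user_state.get_clusters(
    exclude_managed_clusters=True,
    cluster_names=['dev', 'train-cluster'],
    summary_response=True)
for rec in records:
    print(rec['name'], rec['status'], rec['user_name'])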
+@_init_db
+@metrics_lib.time_me
+def get_cluster_names(exclude_managed_clusters: bool = False,) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_table.c.name)
+        if exclude_managed_clusters:
+            query = query.filter(cluster_table.c.is_managed == int(False))
+        rows = query.all()
+        return [row[0] for row in rows]
+
+
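`get_cluster_names` is the cheapest path when only names are needed: one SELECT on a single column, no handle unpickling. Usage sketch (same import assumption as above):

from sky import global_user_state

names = global_user_state.get_cluster_names(exclude_managed_clusters=True)
print(sorted(names))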
 @_init_db
 @metrics_lib.time_me
 def get_clusters_from_history(
-        days: Optional[int] = None) -> List[Dict[str, Any]]:
+        days: Optional[int] = None,
+        abbreviate_response: bool = False,
+        cluster_hashes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
     """Get cluster reports from history.

     Args:
@@ -1462,69 +1827,103 @@ def get_clusters_from_history(
         List of cluster records with history information.
     """
     assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        # Explicitly select columns from both tables to avoid ambiguity
-        query = session.query(
-            cluster_history_table.c.cluster_hash, cluster_history_table.c.name,
-            cluster_history_table.c.num_nodes,
-            cluster_history_table.c.requested_resources,
-            cluster_history_table.c.launched_resources,
-            cluster_history_table.c.usage_intervals,
-            cluster_history_table.c.user_hash,
-            cluster_history_table.c.last_creation_yaml,
-            cluster_history_table.c.last_creation_command,
-            cluster_history_table.c.workspace.label('history_workspace'),
-            cluster_table.c.status, cluster_table.c.workspace,
-            cluster_table.c.status_updated_at).select_from(
-                cluster_history_table.join(cluster_table,
-                                           cluster_history_table.c.cluster_hash
-                                           == cluster_table.c.cluster_hash,
-                                           isouter=True))

-        rows = query.all()
+    current_user_hash = common_utils.get_user_hash()

     # Prepare filtering parameters
-    cutoff_time = None
+    cutoff_time = 0
     if days is not None:
         cutoff_time = int(time.time()) - (days * 24 * 60 * 60)

+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Explicitly select columns from both tables to avoid ambiguity
+        if abbreviate_response:
+            query = session.query(
+                cluster_history_table.c.cluster_hash,
+                cluster_history_table.c.name,
+                cluster_history_table.c.num_nodes,
+                cluster_history_table.c.launched_resources,
+                cluster_history_table.c.usage_intervals,
+                cluster_history_table.c.user_hash,
+                cluster_history_table.c.workspace.label('history_workspace'),
+                cluster_history_table.c.last_activity_time,
+                cluster_history_table.c.launched_at, cluster_table.c.status,
+                cluster_table.c.workspace)
+        else:
+            query = session.query(
+                cluster_history_table.c.cluster_hash,
+                cluster_history_table.c.name,
+                cluster_history_table.c.num_nodes,
+                cluster_history_table.c.launched_resources,
+                cluster_history_table.c.usage_intervals,
+                cluster_history_table.c.user_hash,
+                cluster_history_table.c.last_creation_yaml,
+                cluster_history_table.c.last_creation_command,
+                cluster_history_table.c.workspace.label('history_workspace'),
+                cluster_history_table.c.last_activity_time,
+                cluster_history_table.c.launched_at, cluster_table.c.status,
+                cluster_table.c.workspace)
+
+        query = query.select_from(
+            cluster_history_table.join(cluster_table,
+                                       cluster_history_table.c.cluster_hash ==
+                                       cluster_table.c.cluster_hash,
+                                       isouter=True))
+
+        # Only include clusters that are either active (status is not None)
+        # or are within the cutoff time (cutoff_time <= last_activity_time).
+        # If days is not specified, we include all clusters by setting
+        # cutoff_time to 0.
+        query = query.filter(
+            (cluster_table.c.status.isnot(None) |
+             (cluster_history_table.c.last_activity_time >= cutoff_time)))
+
+        # Order by launched_at descending (most recent first)
+        query = query.order_by(
+            sqlalchemy.desc(cluster_history_table.c.launched_at))
+
+        if cluster_hashes is not None:
+            query = query.filter(
+                cluster_history_table.c.cluster_hash.in_(cluster_hashes))
+        rows = query.all()
+
+    usage_intervals_dict = {}
+    row_to_user_hash = {}
+    for row in rows:
+        row_usage_intervals: List[Tuple[int, Optional[int]]] = []
+        if row.usage_intervals:
+            try:
+                row_usage_intervals = pickle.loads(row.usage_intervals)
+            except (pickle.PickleError, AttributeError):
+                pass
+        usage_intervals_dict[row.cluster_hash] = row_usage_intervals
+        user_hash = (row.user_hash
+                     if row.user_hash is not None else current_user_hash)
+        row_to_user_hash[row.cluster_hash] = user_hash
+
+    user_hashes = set(row_to_user_hash.values())
+    user_hash_to_user = get_users(user_hashes)
+    cluster_hashes = set(row_to_user_hash.keys())
+    if not abbreviate_response:
+        last_cluster_event_dict = _get_last_cluster_event_multiple(
+            cluster_hashes, ClusterEventType.STATUS_CHANGE)
+
     records = []
     for row in rows:
-        user_hash = _get_user_hash_or_current_user(row.user_hash)
-        launched_at = _get_cluster_launch_time(row.cluster_hash)
-        duration = _get_cluster_duration(row.cluster_hash)
+        user_hash = row_to_user_hash[row.cluster_hash]
+        user = user_hash_to_user.get(user_hash, None)
+        user_name = user.name if user is not None else None
+        if not abbreviate_response:
+            last_event = last_cluster_event_dict.get(row.cluster_hash, None)
+        launched_at = row.launched_at
+        usage_intervals: Optional[List[Tuple[
+            int,
+            Optional[int]]]] = usage_intervals_dict.get(row.cluster_hash, None)
+        duration = _get_cluster_duration(usage_intervals)

         # Parse status
         status = None
         if row.status:
             status = status_lib.ClusterStatus[row.status]

-        # Apply filtering: always include active clusters, filter historical
-        # ones by time
-        if cutoff_time is not None and status is None:  # Historical cluster
-            # For historical clusters, check if they were used recently
-            # Use the most recent activity from usage_intervals to determine
-            # last use
-            usage_intervals = []
-            if row.usage_intervals:
-                try:
-                    usage_intervals = pickle.loads(row.usage_intervals)
-                except (pickle.PickleError, AttributeError):
-                    usage_intervals = []
-
-            # Find the most recent activity time from usage_intervals
-            last_activity_time = None
-            if usage_intervals:
-                # Get the end time of the last interval (or start time if
-                # still running)
-                last_interval = usage_intervals[-1]
-                last_activity_time = (last_interval[1] if last_interval[1]
-                                      is not None else last_interval[0])
-
-            # Skip historical clusters that haven't been used recently
-            if last_activity_time is None or last_activity_time < cutoff_time:
-                continue
-
         # Parse launched resources safely
         launched_resources = None
         if row.launched_resources:
@@ -1533,17 +1932,6 @@ def get_clusters_from_history(
             except (pickle.PickleError, AttributeError):
                 launched_resources = None

-        # Parse usage intervals safely
-        usage_intervals = []
-        if row.usage_intervals:
-            try:
-                usage_intervals = pickle.loads(row.usage_intervals)
-            except (pickle.PickleError, AttributeError):
-                usage_intervals = []
-
-        # Get user name from user hash
-        user = get_user(user_hash)
-        user_name = user.name if user is not None else None
         workspace = (row.history_workspace
                      if row.history_workspace else row.workspace)

@@ -1559,11 +1947,11 @@ def get_clusters_from_history(
             'user_hash': user_hash,
             'user_name': user_name,
             'workspace': workspace,
-            'last_creation_yaml': row.last_creation_yaml,
-            'last_creation_command': row.last_creation_command,
-            'last_event': get_last_cluster_event(
-                row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE),
         }
+        if not abbreviate_response:
+            record['last_creation_yaml'] = row.last_creation_yaml
+            record['last_creation_command'] = row.last_creation_command
+            record['last_event'] = last_event

         records.append(record)

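The history query now pushes the recency filter into SQL: a row survives iff the cluster is still active (`status` non-NULL) or `last_activity_time >= now - days*86400`, with `cutoff_time = 0` meaning "no window", replacing the old per-row unpickling of `usage_intervals`. A caller sketch; the printed field names are assumed from the record construction above:

import time

from sky import global_user_state

days = 7
# Same arithmetic as the cutoff_time computation above.
cutoff_time = int(time.time()) - days * 24 * 60 * 60
reports = global_user_state.get_clusters_from_history(
    days=days, abbreviate_response=True)
for rec in reports:
    # 'duration' is assumed to come from _get_cluster_duration(...) above.
    print(rec['name'], rec['user_name'], rec['duration'])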
@@ -1846,10 +2234,14 @@ def get_volume_names_start_with(starts_with: str) -> List[str]:

 @_init_db
 @metrics_lib.time_me
-def get_volumes() -> List[Dict[str, Any]]:
+def get_volumes(is_ephemeral: Optional[bool] = None) -> List[Dict[str, Any]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        rows = session.query(volume_table).all()
+        if is_ephemeral is None:
+            rows = session.query(volume_table).all()
+        else:
+            rows = session.query(volume_table).filter_by(
+                is_ephemeral=int(is_ephemeral)).all()
     records = []
     for row in rows:
         records.append({
@@ -1861,6 +2253,7 @@ def get_volumes() -> List[Dict[str, Any]]:
             'last_attached_at': row.last_attached_at,
             'last_use': row.last_use,
             'status': status_lib.VolumeStatus[row.status],
+            'is_ephemeral': bool(row.is_ephemeral),
         })
     return records

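`is_ephemeral` is a tri-state filter: `None` preserves the old return-everything behavior, while `True`/`False` match the new column (stored as an integer). Sketch:

from sky import global_user_state

volumes = global_user_state.get_volumes()                      # all volumes
ephemeral = global_user_state.get_volumes(is_ephemeral=True)
persistent = global_user_state.get_volumes(is_ephemeral=False)
print(len(volumes), len(ephemeral), len(persistent))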
@@ -1887,14 +2280,23 @@ def get_volume_by_name(name: str) -> Optional[Dict[str, Any]]:


 @_init_db
 @metrics_lib.time_me
-def add_volume(name: str, config: models.VolumeConfig,
-               status: status_lib.VolumeStatus) -> None:
+def add_volume(
+    name: str,
+    config: models.VolumeConfig,
+    status: status_lib.VolumeStatus,
+    is_ephemeral: bool = False,
+) -> None:
     assert _SQLALCHEMY_ENGINE is not None
     volume_launched_at = int(time.time())
     handle = pickle.dumps(config)
     last_use = common_utils.get_current_command()
     user_hash = common_utils.get_current_user().id
     active_workspace = skypilot_config.get_active_workspace()
+    if is_ephemeral:
+        last_attached_at = int(time.time())
+        status = status_lib.VolumeStatus.IN_USE
+    else:
+        last_attached_at = None

     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -1911,9 +2313,10 @@ def add_volume(name: str, config: models.VolumeConfig,
                 handle=handle,
                 user_hash=user_hash,
                 workspace=active_workspace,
-                last_attached_at=None,
+                last_attached_at=last_attached_at,
                 last_use=last_use,
                 status=status.value,
+                is_ephemeral=int(is_ephemeral),
             )
             do_update_stmt = insert_stmnt.on_conflict_do_nothing()
             session.execute(do_update_stmt)
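Note that for ephemeral volumes the branch above overrides whatever `status` the caller passes: the row is written as `IN_USE` with `last_attached_at` stamped at creation time. A registration sketch; the volume name is a placeholder, the `VolumeConfig` construction is elided, and the import paths (`sky.models`, `sky.utils.status_lib`) are assumptions:

from sky import global_user_state, models
from sky.utils import status_lib  # assumed location of status_lib

config: models.VolumeConfig = ...  # elided: construct per your provider
global_user_state.add_volume(
    name='scratch-cache',            # placeholder name
    config=config,
    status=status_lib.VolumeStatus.READY,  # ignored: forced to IN_USE
    is_ephemeral=True)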
@@ -2184,11 +2587,22 @@ def _set_cluster_yaml_from_file(cluster_yaml_path: str,
     # on the local file system and migrate it to the database.
     # TODO(syang): remove this check once we have a way to migrate the
     # cluster from file to database. Remove on v0.12.0.
-    if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
-        with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
-            yaml_str = f.read()
-        set_cluster_yaml(cluster_name, yaml_str)
-        return yaml_str
+    if cluster_yaml_path is not None:
+        # First try the exact path
+        path_to_read = None
+        if os.path.exists(cluster_yaml_path):
+            path_to_read = cluster_yaml_path
+        # Fallback: try with .debug suffix (when debug logging was enabled)
+        # Debug logging causes YAML files to be saved with .debug suffix
+        # but the path stored in the handle doesn't include it
+        debug_path = cluster_yaml_path + '.debug'
+        if os.path.exists(debug_path):
+            path_to_read = debug_path
+        if path_to_read is not None:
+            with open(path_to_read, 'r', encoding='utf-8') as f:
+                yaml_str = f.read()
+            set_cluster_yaml(cluster_name, yaml_str)
+            return yaml_str
     return None

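A standalone distillation of the path-resolution order implemented above; `resolve_yaml_path` is a hypothetical helper, not part of the module. The `.debug` variant (written when debug logging is enabled) wins even when the plain path also exists:

import os
from typing import Optional

def resolve_yaml_path(cluster_yaml_path: str) -> Optional[str]:
    """Return the on-disk YAML path to read, preferring the .debug copy."""
    path_to_read = None
    if os.path.exists(cluster_yaml_path):
        path_to_read = cluster_yaml_path
    debug_path = cluster_yaml_path + '.debug'
    if os.path.exists(debug_path):
        path_to_read = debug_path  # .debug takes precedence when present
    return path_to_read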