skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,34 +1,42 @@
1
1
  """Utilities for REST API."""
2
+ import asyncio
3
+ import atexit
2
4
  import contextlib
3
5
  import dataclasses
4
6
  import enum
5
7
  import functools
6
- import json
7
8
  import os
8
9
  import pathlib
9
10
  import shutil
10
11
  import signal
11
12
  import sqlite3
13
+ import threading
12
14
  import time
13
15
  import traceback
14
- from typing import Any, Callable, Dict, List, Optional, Tuple
16
+ from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
17
+ Tuple)
18
+ import uuid
15
19
 
20
+ import anyio
16
21
  import colorama
17
22
  import filelock
23
+ import orjson
18
24
 
19
25
  from sky import exceptions
20
26
  from sky import global_user_state
21
27
  from sky import sky_logging
28
+ from sky import skypilot_config
29
+ from sky.metrics import utils as metrics_lib
22
30
  from sky.server import common as server_common
23
31
  from sky.server import constants as server_constants
32
+ from sky.server import daemons
24
33
  from sky.server.requests import payloads
25
34
  from sky.server.requests.serializers import decoders
26
35
  from sky.server.requests.serializers import encoders
27
- from sky.utils import common
36
+ from sky.utils import asyncio_utils
28
37
  from sky.utils import common_utils
29
- from sky.utils import db_utils
30
- from sky.utils import env_options
31
38
  from sky.utils import ux_utils
39
+ from sky.utils.db import db_utils
32
40
 
33
41
  logger = sky_logging.init_logger(__name__)
34
42
 
@@ -37,8 +45,12 @@ REQUEST_TABLE = 'requests'
37
45
  COL_CLUSTER_NAME = 'cluster_name'
38
46
  COL_USER_ID = 'user_id'
39
47
  COL_STATUS_MSG = 'status_msg'
48
+ COL_SHOULD_RETRY = 'should_retry'
49
+ COL_FINISHED_AT = 'finished_at'
40
50
  REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
41
51
 
52
+ DEFAULT_REQUESTS_RETENTION_HOURS = 24 # 1 day
53
+
42
54
  # TODO(zhwu): For scalability, there are several TODOs:
43
55
  # [x] Have a way to queue requests.
44
56
  # [ ] Move logs to persistent place.
@@ -62,6 +74,10 @@ class RequestStatus(enum.Enum):
62
74
  color = _STATUS_TO_COLOR[self]
63
75
  return f'{color}{self.value}{colorama.Style.RESET_ALL}'
64
76
 
77
+ @classmethod
78
+ def finished_status(cls) -> List['RequestStatus']:
79
+ return [cls.SUCCEEDED, cls.FAILED, cls.CANCELLED]
80
+
65
81
 
66
82
  _STATUS_TO_COLOR = {
67
83
  RequestStatus.PENDING: colorama.Fore.BLUE,
@@ -85,6 +101,8 @@ REQUEST_COLUMNS = [
85
101
  'schedule_type',
86
102
  COL_USER_ID,
87
103
  COL_STATUS_MSG,
104
+ COL_SHOULD_RETRY,
105
+ COL_FINISHED_AT,
88
106
  ]
89
107
 
90
108
 
@@ -95,27 +113,6 @@ class ScheduleType(enum.Enum):
95
113
  SHORT = 'short'
96
114
 
97
115
 
98
- @dataclasses.dataclass
99
- class RequestPayload:
100
- """The payload for the requests."""
101
-
102
- request_id: str
103
- name: str
104
- entrypoint: str
105
- request_body: str
106
- status: str
107
- created_at: float
108
- user_id: str
109
- return_value: str
110
- error: str
111
- pid: Optional[int]
112
- schedule_type: str
113
- user_name: Optional[str] = None
114
- # Resources the request operates on.
115
- cluster_name: Optional[str] = None
116
- status_msg: Optional[str] = None
117
-
118
-
119
116
  @dataclasses.dataclass
120
117
  class Request:
121
118
  """A SkyPilot API request."""
@@ -136,6 +133,10 @@ class Request:
136
133
  cluster_name: Optional[str] = None
137
134
  # Status message of the request, indicates the reason of current status.
138
135
  status_msg: Optional[str] = None
136
+ # Whether the request should be retried.
137
+ should_retry: bool = False
138
+ # When the request finished.
139
+ finished_at: Optional[float] = None
139
140
 
140
141
  @property
141
142
  def log_path(self) -> pathlib.Path:
@@ -179,7 +180,7 @@ class Request:
179
180
  @classmethod
180
181
  def from_row(cls, row: Tuple[Any, ...]) -> 'Request':
181
182
  content = dict(zip(REQUEST_COLUMNS, row))
182
- return cls.decode(RequestPayload(**content))
183
+ return cls.decode(payloads.RequestPayload(**content))
183
184
 
184
185
  def to_row(self) -> Tuple[Any, ...]:
185
186
  payload = self.encode()
@@ -188,7 +189,7 @@ class Request:
188
189
  row.append(getattr(payload, k))
189
190
  return tuple(row)
190
191
 
191
- def readable_encode(self) -> RequestPayload:
192
+ def readable_encode(self) -> payloads.RequestPayload:
192
193
  """Serialize the SkyPilot API request for display purposes.
193
194
 
194
195
  This function should be called on the server side to serialize the
@@ -204,15 +205,16 @@ class Request:
204
205
  """
205
206
  assert isinstance(self.request_body,
206
207
  payloads.RequestBody), (self.name, self.request_body)
207
- user_name = global_user_state.get_user(self.user_id).name
208
- return RequestPayload(
208
+ user = global_user_state.get_user(self.user_id)
209
+ user_name = user.name if user is not None else None
210
+ return payloads.RequestPayload(
209
211
  request_id=self.request_id,
210
212
  name=self.name,
211
213
  entrypoint=self.entrypoint.__name__,
212
214
  request_body=self.request_body.model_dump_json(),
213
215
  status=self.status.value,
214
- return_value=json.dumps(None),
215
- error=json.dumps(None),
216
+ return_value=orjson.dumps(None).decode('utf-8'),
217
+ error=orjson.dumps(None).decode('utf-8'),
216
218
  pid=None,
217
219
  created_at=self.created_at,
218
220
  schedule_type=self.schedule_type.value,
@@ -220,27 +222,31 @@ class Request:
220
222
  user_name=user_name,
221
223
  cluster_name=self.cluster_name,
222
224
  status_msg=self.status_msg,
225
+ should_retry=self.should_retry,
226
+ finished_at=self.finished_at,
223
227
  )
224
228
 
225
- def encode(self) -> RequestPayload:
229
+ def encode(self) -> payloads.RequestPayload:
226
230
  """Serialize the SkyPilot API request."""
227
231
  assert isinstance(self.request_body,
228
232
  payloads.RequestBody), (self.name, self.request_body)
229
233
  try:
230
- return RequestPayload(
234
+ return payloads.RequestPayload(
231
235
  request_id=self.request_id,
232
236
  name=self.name,
233
237
  entrypoint=encoders.pickle_and_encode(self.entrypoint),
234
238
  request_body=encoders.pickle_and_encode(self.request_body),
235
239
  status=self.status.value,
236
- return_value=json.dumps(self.return_value),
237
- error=json.dumps(self.error),
240
+ return_value=orjson.dumps(self.return_value).decode('utf-8'),
241
+ error=orjson.dumps(self.error).decode('utf-8'),
238
242
  pid=self.pid,
239
243
  created_at=self.created_at,
240
244
  schedule_type=self.schedule_type.value,
241
245
  user_id=self.user_id,
242
246
  cluster_name=self.cluster_name,
243
247
  status_msg=self.status_msg,
248
+ should_retry=self.should_retry,
249
+ finished_at=self.finished_at,
244
250
  )
245
251
  except (TypeError, ValueError) as e:
246
252
  # The error is unexpected, so we don't suppress the stack trace.
@@ -255,7 +261,7 @@ class Request:
255
261
  raise
256
262
 
257
263
  @classmethod
258
- def decode(cls, payload: RequestPayload) -> 'Request':
264
+ def decode(cls, payload: payloads.RequestPayload) -> 'Request':
259
265
  """Deserialize the SkyPilot API request."""
260
266
  try:
261
267
  return cls(
@@ -264,14 +270,16 @@ class Request:
264
270
  entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
265
271
  request_body=decoders.decode_and_unpickle(payload.request_body),
266
272
  status=RequestStatus(payload.status),
267
- return_value=json.loads(payload.return_value),
268
- error=json.loads(payload.error),
273
+ return_value=orjson.loads(payload.return_value),
274
+ error=orjson.loads(payload.error),
269
275
  pid=payload.pid,
270
276
  created_at=payload.created_at,
271
277
  schedule_type=ScheduleType(payload.schedule_type),
272
278
  user_id=payload.user_id,
273
279
  cluster_name=payload.cluster_name,
274
280
  status_msg=payload.status_msg,
281
+ should_retry=payload.should_retry,
282
+ finished_at=payload.finished_at,
275
283
  )
276
284
  except (TypeError, ValueError) as e:
277
285
  logger.error(
@@ -286,113 +294,104 @@ class Request:
286
294
  raise
287
295
 
288
296
 
289
- def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
290
- """Kill all pending and running requests for a cluster.
291
-
292
- Args:
293
- cluster_name: the name of the cluster.
294
- exclude_request_names: exclude requests with these names. This is to
295
- prevent killing the caller request.
296
- """
297
- request_ids = [
298
- request_task.request_id for request_task in get_request_tasks(
299
- cluster_names=[cluster_name],
300
- status=[RequestStatus.PENDING, RequestStatus.RUNNING],
301
- exclude_request_names=[exclude_request_name])
302
- ]
303
- kill_requests(request_ids)
304
-
305
-
306
- def refresh_cluster_status_event():
307
- """Periodically refresh the cluster status."""
308
- # pylint: disable=import-outside-toplevel
309
- from sky import core
310
-
311
- # Disable logging for periodic refresh to avoid the usage message being
312
- # sent multiple times.
313
- os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
314
-
315
- while True:
316
- logger.info('=== Refreshing cluster status ===')
317
- # This periodically refresh will hold the lock for the cluster being
318
- # refreshed, but it is OK because other operations will just wait for
319
- # the lock and get the just refreshed status without refreshing again.
320
- core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
321
- logger.info(
322
- 'Status refreshed. Sleeping '
323
- f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
324
- ' seconds for the next refresh...\n')
325
- time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
326
-
297
+ def get_new_request_id() -> str:
298
+ """Get a new request ID."""
299
+ return str(uuid.uuid4())
327
300
 
328
- @dataclasses.dataclass
329
- class InternalRequestDaemon:
330
- id: str
331
- name: str
332
- event_fn: Callable[[], None]
333
301
 
302
+ def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
303
+ """Serialize the SkyPilot API request for display purposes.
334
304
 
335
- # Register the events to run in the background.
336
- INTERNAL_REQUEST_DAEMONS = [
337
- # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
338
- # set to updated status automatically, without showing users the hint of
339
- # cluster being stopped or down when `sky status -r` is called.
340
- InternalRequestDaemon(id='skypilot-status-refresh-daemon',
341
- name='status',
342
- event_fn=refresh_cluster_status_event)
343
- ]
344
-
345
-
346
- def kill_requests(request_ids: Optional[List[str]] = None,
347
- user_id: Optional[str] = None) -> List[str]:
348
- """Kill a SkyPilot API request and set its status to cancelled.
349
-
350
- Args:
351
- request_ids: The request IDs to kill. If None, all requests for the
352
- user are killed.
353
- user_id: The user ID to kill requests for. If None, all users are
354
- killed.
355
-
356
- Returns:
357
- A list of request IDs that were cancelled.
358
- """
359
- if request_ids is None:
360
- request_ids = [
361
- request_task.request_id for request_task in get_request_tasks(
362
- user_id=user_id,
363
- status=[RequestStatus.RUNNING, RequestStatus.PENDING],
364
- # Avoid cancelling the cancel request itself.
365
- exclude_request_names=['sky.api_cancel'])
366
- ]
367
- cancelled_request_ids = []
368
- for request_id in request_ids:
369
- with update_request(request_id) as request_record:
370
- if request_record is None:
371
- logger.debug(f'No request ID {request_id}')
372
- continue
373
- # Skip internal requests. The internal requests are scheduled with
374
- # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
375
- if request_record.request_id in set(
376
- event.id for event in INTERNAL_REQUEST_DAEMONS):
377
- continue
378
- if request_record.status > RequestStatus.RUNNING:
379
- logger.debug(f'Request {request_id} already finished')
380
- continue
381
- if request_record.pid is not None:
382
- logger.debug(f'Killing request process {request_record.pid}')
383
- # Use SIGTERM instead of SIGKILL:
384
- # - The executor can handle SIGTERM gracefully
385
- # - After SIGTERM, the executor can reuse the request process
386
- # for other requests, avoiding the overhead of forking a new
387
- # process for each request.
388
- os.kill(request_record.pid, signal.SIGTERM)
389
- request_record.status = RequestStatus.CANCELLED
390
- cancelled_request_ids.append(request_id)
391
- return cancelled_request_ids
305
+ This function should be called on the server side to serialize the
306
+ request body into human readable format, e.g., the entrypoint should
307
+ be a string, and the pid, error, or return value are not needed.
392
308
 
309
+ The returned value will then be displayed on the client side in request
310
+ table.
393
311
 
394
- _DB_PATH = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
395
- pathlib.Path(_DB_PATH).parents[0].mkdir(parents=True, exist_ok=True)
312
+ We do not use `encode` for display to avoid a large amount of data being
313
+ sent to the client side, especially for the request table could include
314
+ all the requests.
315
+ """
316
+ encoded_requests = []
317
+ all_users = global_user_state.get_all_users()
318
+ all_users_map = {user.id: user.name for user in all_users}
319
+ for request in requests:
320
+ if request.request_body is not None:
321
+ assert isinstance(request.request_body,
322
+ payloads.RequestBody), (request.name,
323
+ request.request_body)
324
+ user_name = all_users_map.get(request.user_id)
325
+ payload = payloads.RequestPayload(
326
+ request_id=request.request_id,
327
+ name=request.name,
328
+ entrypoint=request.entrypoint.__name__
329
+ if request.entrypoint is not None else '',
330
+ request_body=request.request_body.model_dump_json()
331
+ if request.request_body is not None else
332
+ orjson.dumps(None).decode('utf-8'),
333
+ status=request.status.value,
334
+ return_value=orjson.dumps(None).decode('utf-8'),
335
+ error=orjson.dumps(None).decode('utf-8'),
336
+ pid=None,
337
+ created_at=request.created_at,
338
+ schedule_type=request.schedule_type.value,
339
+ user_id=request.user_id,
340
+ user_name=user_name,
341
+ cluster_name=request.cluster_name,
342
+ status_msg=request.status_msg,
343
+ should_retry=request.should_retry,
344
+ finished_at=request.finished_at,
345
+ )
346
+ encoded_requests.append(payload)
347
+ return encoded_requests
348
+
349
+
350
+ def _update_request_row_fields(
351
+ row: Tuple[Any, ...],
352
+ fields: Optional[List[str]] = None) -> Tuple[Any, ...]:
353
+ """Update the request row fields."""
354
+ if not fields:
355
+ return row
356
+
357
+ # Convert tuple to dictionary for easier manipulation
358
+ content = dict(zip(fields, row))
359
+
360
+ # Required fields in RequestPayload
361
+ if 'request_id' not in fields:
362
+ content['request_id'] = ''
363
+ if 'name' not in fields:
364
+ content['name'] = ''
365
+ if 'entrypoint' not in fields:
366
+ content['entrypoint'] = server_constants.EMPTY_PICKLED_VALUE
367
+ if 'request_body' not in fields:
368
+ content['request_body'] = server_constants.EMPTY_PICKLED_VALUE
369
+ if 'status' not in fields:
370
+ content['status'] = RequestStatus.PENDING.value
371
+ if 'created_at' not in fields:
372
+ content['created_at'] = 0
373
+ if 'user_id' not in fields:
374
+ content['user_id'] = ''
375
+ if 'return_value' not in fields:
376
+ content['return_value'] = orjson.dumps(None).decode('utf-8')
377
+ if 'error' not in fields:
378
+ content['error'] = orjson.dumps(None).decode('utf-8')
379
+ if 'schedule_type' not in fields:
380
+ content['schedule_type'] = ScheduleType.SHORT.value
381
+ # Optional fields in RequestPayload
382
+ if 'pid' not in fields:
383
+ content['pid'] = None
384
+ if 'cluster_name' not in fields:
385
+ content['cluster_name'] = None
386
+ if 'status_msg' not in fields:
387
+ content['status_msg'] = None
388
+ if 'should_retry' not in fields:
389
+ content['should_retry'] = False
390
+ if 'finished_at' not in fields:
391
+ content['finished_at'] = None
392
+
393
+ # Convert back to tuple in the same order as REQUEST_COLUMNS
394
+ return tuple(content[col] for col in REQUEST_COLUMNS)
396
395
 
397
396
 
398
397
  def create_table(cursor, conn):
@@ -425,13 +424,45 @@ def create_table(cursor, conn):
425
424
  {COL_CLUSTER_NAME} TEXT,
426
425
  schedule_type TEXT,
427
426
  {COL_USER_ID} TEXT,
428
- {COL_STATUS_MSG} TEXT)""")
427
+ {COL_STATUS_MSG} TEXT,
428
+ {COL_SHOULD_RETRY} INTEGER,
429
+ {COL_FINISHED_AT} REAL
430
+ )""")
429
431
 
430
432
  db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
431
433
  'TEXT')
434
+ db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
435
+ 'INTEGER')
436
+ db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
437
+ 'REAL')
438
+
439
+ # Add an index on (status, name) to speed up queries
440
+ # that filter on these columns.
441
+ cursor.execute(f"""\
442
+ CREATE INDEX IF NOT EXISTS status_name_idx ON {REQUEST_TABLE} (status, name) WHERE status IN ('PENDING', 'RUNNING');
443
+ """)
444
+ # Add an index on cluster_name to speed up queries
445
+ # that filter on this column.
446
+ cursor.execute(f"""\
447
+ CREATE INDEX IF NOT EXISTS cluster_name_idx ON {REQUEST_TABLE} ({COL_CLUSTER_NAME}) WHERE status IN ('PENDING', 'RUNNING');
448
+ """)
449
+ # Add an index on created_at to speed up queries that sort on this column.
450
+ cursor.execute(f"""\
451
+ CREATE INDEX IF NOT EXISTS created_at_idx ON {REQUEST_TABLE} (created_at);
452
+ """)
432
453
 
433
454
 
434
455
  _DB = None
456
+ _init_db_lock = threading.Lock()
457
+
458
+
459
+ def _init_db_within_lock():
460
+ global _DB
461
+ if _DB is None:
462
+ db_path = os.path.expanduser(
463
+ server_constants.API_SERVER_REQUEST_DB_PATH)
464
+ pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
465
+ _DB = db_utils.SQLiteConn(db_path, create_table)
435
466
 
436
467
 
437
468
  def init_db(func):
@@ -439,21 +470,65 @@ def init_db(func):
439
470
 
440
471
  @functools.wraps(func)
441
472
  def wrapper(*args, **kwargs):
442
- global _DB
443
- if _DB is None:
444
- _DB = db_utils.SQLiteConn(_DB_PATH, create_table)
473
+ if _DB is not None:
474
+ return func(*args, **kwargs)
475
+ with _init_db_lock:
476
+ _init_db_within_lock()
445
477
  return func(*args, **kwargs)
446
478
 
447
479
  return wrapper
448
480
 
449
481
 
482
+ def init_db_async(func):
483
+ """Async version of init_db."""
484
+
485
+ @functools.wraps(func)
486
+ async def wrapper(*args, **kwargs):
487
+ if _DB is not None:
488
+ return await func(*args, **kwargs)
489
+ # If _DB is not initialized, init_db_async will be blocked if there
490
+ # is a thread initializing _DB, this is fine since it occurs on process
491
+ # startup.
492
+ with _init_db_lock:
493
+ _init_db_within_lock()
494
+ return await func(*args, **kwargs)
495
+
496
+ return wrapper
497
+
498
+
450
499
  def reset_db_and_logs():
451
500
  """Create the database."""
501
+ logger.debug('clearing local API server database')
452
502
  server_common.clear_local_api_server_database()
503
+ logger.debug(
504
+ f'clearing local API server logs directory at {REQUEST_LOG_PATH_PREFIX}'
505
+ )
453
506
  shutil.rmtree(pathlib.Path(REQUEST_LOG_PATH_PREFIX).expanduser(),
454
507
  ignore_errors=True)
508
+ logger.debug('clearing local API server client directory at '
509
+ f'{server_common.API_SERVER_CLIENT_DIR.expanduser()}')
455
510
  shutil.rmtree(server_common.API_SERVER_CLIENT_DIR.expanduser(),
456
511
  ignore_errors=True)
512
+ with _init_db_lock:
513
+ _init_db_within_lock()
514
+ assert _DB is not None
515
+ with _DB.conn:
516
+ cursor = _DB.conn.cursor()
517
+ cursor.execute('SELECT sqlite_version()')
518
+ row = cursor.fetchone()
519
+ if row is None:
520
+ raise RuntimeError('Failed to get SQLite version')
521
+ version_str = row[0]
522
+ version_parts = version_str.split('.')
523
+ assert len(version_parts) >= 2, \
524
+ f'Invalid version string: {version_str}'
525
+ major, minor = int(version_parts[0]), int(version_parts[1])
526
+ # SQLite 3.35.0+ supports RETURNING statements.
527
+ # 3.35.0 was released in March 2021.
528
+ if not ((major > 3) or (major == 3 and minor >= 35)):
529
+ raise RuntimeError(
530
+ f'SQLite version {version_str} is not supported. '
531
+ 'Please upgrade to SQLite 3.35.0 or later.')
457
532
 
458
533
 
459
534
  def request_lock_path(request_id: str) -> str:
@@ -462,69 +537,349 @@ def request_lock_path(request_id: str) -> str:
462
537
  return os.path.join(lock_path, f'.{request_id}.lock')
463
538
 
464
539
 
465
- @contextlib.contextmanager
466
- @init_db
467
- def update_request(request_id: str):
468
- """Get a SkyPilot API request."""
469
- request = _get_request_no_lock(request_id)
470
- yield request
471
- if request is not None:
472
- _add_or_update_request_no_lock(request)
540
+ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
541
+ """Kill all pending and running requests for a cluster.
542
+
543
+ Args:
544
+ cluster_name: the name of the cluster.
545
+ exclude_request_names: exclude requests with these names. This is to
546
+ prevent killing the caller request.
547
+ """
548
+ request_ids = [
549
+ request_task.request_id
550
+ for request_task in get_request_tasks(req_filter=RequestTaskFilter(
551
+ status=[RequestStatus.PENDING, RequestStatus.RUNNING],
552
+ exclude_request_names=[exclude_request_name],
553
+ cluster_names=[cluster_name],
554
+ fields=['request_id']))
555
+ ]
556
+ _kill_requests(request_ids)
473
557
 
474
558
 
475
- def _get_request_no_lock(request_id: str) -> Optional[Request]:
559
+ def kill_requests(request_ids: Optional[List[str]] = None,
560
+ user_id: Optional[str] = None) -> List[str]:
561
+ """Kill requests with a given request ID prefix."""
562
+ expanded_request_ids: Optional[List[str]] = None
563
+ if request_ids is not None:
564
+ expanded_request_ids = []
565
+ for request_id in request_ids:
566
+ request_tasks = get_requests_with_prefix(request_id,
567
+ fields=['request_id'])
568
+ if request_tasks is None or len(request_tasks) == 0:
569
+ continue
570
+ if len(request_tasks) > 1:
571
+ raise ValueError(f'Multiple requests found for '
572
+ f'request ID prefix: {request_id}')
573
+ expanded_request_ids.append(request_tasks[0].request_id)
574
+ return _kill_requests(request_ids=expanded_request_ids, user_id=user_id)
575
+
576
+
577
+ # needed for backward compatibility. Remove by v0.10.7 or v0.12.0
578
+ # and rename kill_requests to kill_requests_with_prefix.
579
+ kill_requests_with_prefix = kill_requests
580
+
581
+
582
+ def _should_kill_request(request_id: str,
583
+ request_record: Optional[Request]) -> bool:
584
+ if request_record is None:
585
+ logger.debug(f'No request ID {request_id}')
586
+ return False
587
+ # Skip internal requests. The internal requests are scheduled with
588
+ # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
589
+ if request_record.request_id in set(
590
+ event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
591
+ return False
592
+ if request_record.status > RequestStatus.RUNNING:
593
+ logger.debug(f'Request {request_id} already finished')
594
+ return False
595
+ return True
596
+
597
+
598
+ def _kill_requests(request_ids: Optional[List[str]] = None,
599
+ user_id: Optional[str] = None) -> List[str]:
600
+ """Kill a SkyPilot API request and set its status to cancelled.
601
+
602
+ Args:
603
+ request_ids: The request IDs to kill. If None, all requests for the
604
+ user are killed.
605
+ user_id: The user ID to kill requests for. If None, all users are
606
+ killed.
607
+
608
+ Returns:
609
+ A list of request IDs that were cancelled.
610
+ """
611
+ if request_ids is None:
612
+ request_ids = [
613
+ request_task.request_id
614
+ for request_task in get_request_tasks(req_filter=RequestTaskFilter(
615
+ status=[RequestStatus.PENDING, RequestStatus.RUNNING],
616
+ # Avoid cancelling the cancel request itself.
617
+ exclude_request_names=['sky.api_cancel'],
618
+ user_id=user_id,
619
+ fields=['request_id']))
620
+ ]
621
+ cancelled_request_ids = []
622
+ for request_id in request_ids:
623
+ with update_request(request_id) as request_record:
624
+ if not _should_kill_request(request_id, request_record):
625
+ continue
626
+ if request_record.pid is not None:
627
+ logger.debug(f'Killing request process {request_record.pid}')
628
+ # Use SIGTERM instead of SIGKILL:
629
+ # - The executor can handle SIGTERM gracefully
630
+ # - After SIGTERM, the executor can reuse the request process
631
+ # for other requests, avoiding the overhead of forking a new
632
+ # process for each request.
633
+ os.kill(request_record.pid, signal.SIGTERM)
634
+ request_record.status = RequestStatus.CANCELLED
635
+ request_record.finished_at = time.time()
636
+ cancelled_request_ids.append(request_id)
637
+ return cancelled_request_ids
638
+
639
+
640
+ @init_db_async
641
+ @asyncio_utils.shield
642
+ async def kill_request_async(request_id: str) -> bool:
643
+ """Kill a SkyPilot API request and set its status to cancelled.
644
+
645
+ Returns:
646
+ True if the request was killed, False otherwise.
647
+ """
648
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
649
+ request = await _get_request_no_lock_async(request_id)
650
+ if not _should_kill_request(request_id, request):
651
+ return False
652
+ assert request is not None
653
+ if request.pid is not None:
654
+ logger.debug(f'Killing request process {request.pid}')
655
+ # Use SIGTERM instead of SIGKILL:
656
+ # - The executor can handle SIGTERM gracefully
657
+ # - After SIGTERM, the executor can reuse the request process
658
+ # for other requests, avoiding the overhead of forking a new
659
+ # process for each request.
660
+ os.kill(request.pid, signal.SIGTERM)
661
+ request.status = RequestStatus.CANCELLED
662
+ request.finished_at = time.time()
663
+ await _add_or_update_request_no_lock_async(request)
664
+ return True
665
+
666
+
667
+ @contextlib.contextmanager
668
+ @init_db
669
+ @metrics_lib.time_me
670
+ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
671
+ """Get and update a SkyPilot API request."""
672
+ # Acquire the lock to avoid race conditions between multiple request
673
+ # operations, e.g. execute and cancel.
674
+ with filelock.FileLock(request_lock_path(request_id)):
675
+ request = _get_request_no_lock(request_id)
676
+ yield request
677
+ if request is not None:
678
+ _add_or_update_request_no_lock(request)
679
+
680
+
681
+ @init_db_async
682
+ @metrics_lib.time_me
683
+ @asyncio_utils.shield
684
+ async def update_status_async(request_id: str, status: RequestStatus) -> None:
685
+ """Update the status of a request"""
686
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
687
+ request = await _get_request_no_lock_async(request_id)
688
+ if request is not None:
689
+ request.status = status
690
+ await _add_or_update_request_no_lock_async(request)
691
+
692
+
693
+ @init_db_async
694
+ @metrics_lib.time_me
695
+ @asyncio_utils.shield
696
+ async def update_status_msg_async(request_id: str, status_msg: str) -> None:
697
+ """Update the status message of a request"""
698
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
699
+ request = await _get_request_no_lock_async(request_id)
700
+ if request is not None:
701
+ request.status_msg = status_msg
702
+ await _add_or_update_request_no_lock_async(request)
703
+
704
+
705
+ def _get_request_no_lock(
706
+ request_id: str,
707
+ fields: Optional[List[str]] = None) -> Optional[Request]:
476
708
  """Get a SkyPilot API request."""
477
709
  assert _DB is not None
478
710
  columns_str = ', '.join(REQUEST_COLUMNS)
711
+ if fields:
712
+ columns_str = ', '.join(fields)
479
713
  with _DB.conn:
480
714
  cursor = _DB.conn.cursor()
481
- cursor.execute(
482
- f'SELECT {columns_str} FROM {REQUEST_TABLE} '
483
- 'WHERE request_id LIKE ?', (request_id + '%',))
715
+ cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
716
+ 'WHERE request_id LIKE ?'), (request_id + '%',))
484
717
  row = cursor.fetchone()
485
718
  if row is None:
486
719
  return None
720
+ if fields:
721
+ row = _update_request_row_fields(row, fields)
487
722
  return Request.from_row(row)
488
723
 
489
724
 
490
- @init_db
491
- def get_latest_request_id() -> Optional[str]:
725
+ async def _get_request_no_lock_async(
726
+ request_id: str,
727
+ fields: Optional[List[str]] = None) -> Optional[Request]:
728
+ """Async version of _get_request_no_lock."""
729
+ assert _DB is not None
730
+ columns_str = ', '.join(REQUEST_COLUMNS)
731
+ if fields:
732
+ columns_str = ', '.join(fields)
733
+ async with _DB.execute_fetchall_async(
734
+ (f'SELECT {columns_str} FROM {REQUEST_TABLE} '
735
+ 'WHERE request_id LIKE ?'), (request_id + '%',)) as rows:
736
+ row = rows[0] if rows else None
737
+ if row is None:
738
+ return None
739
+ if fields:
740
+ row = _update_request_row_fields(row, fields)
741
+ return Request.from_row(row)
742
+
743
+
744
+ @init_db_async
745
+ @metrics_lib.time_me
746
+ async def get_latest_request_id_async() -> Optional[str]:
492
747
  """Get the latest request ID."""
493
748
  assert _DB is not None
494
- with _DB.conn:
495
- cursor = _DB.conn.cursor()
496
- cursor.execute(f'SELECT request_id FROM {REQUEST_TABLE} '
497
- 'ORDER BY created_at DESC LIMIT 1')
498
- row = cursor.fetchone()
499
- return row[0] if row else None
749
+ async with _DB.execute_fetchall_async(
750
+ (f'SELECT request_id FROM {REQUEST_TABLE} '
751
+ 'ORDER BY created_at DESC LIMIT 1')) as rows:
752
+ return rows[0][0] if rows else None
500
753
 
501
754
 
502
755
  @init_db
503
- def get_request(request_id: str) -> Optional[Request]:
756
+ @metrics_lib.time_me
757
+ def get_request(request_id: str,
758
+ fields: Optional[List[str]] = None) -> Optional[Request]:
504
759
  """Get a SkyPilot API request."""
505
760
  with filelock.FileLock(request_lock_path(request_id)):
506
- return _get_request_no_lock(request_id)
761
+ return _get_request_no_lock(request_id, fields)
507
762
 
508
763
 
509
- @init_db
510
- def create_if_not_exists(request: Request) -> bool:
511
- """Create a SkyPilot API request if it does not exist."""
512
- with filelock.FileLock(request_lock_path(request.request_id)):
513
- if _get_request_no_lock(request.request_id) is not None:
514
- return False
515
- _add_or_update_request_no_lock(request)
516
- return True
764
+ @init_db_async
765
+ @metrics_lib.time_me_async
766
+ @asyncio_utils.shield
767
+ async def get_request_async(
768
+ request_id: str,
769
+ fields: Optional[List[str]] = None) -> Optional[Request]:
770
+ """Async version of get_request."""
771
+ # TODO(aylei): figure out how to remove FileLock here to avoid the overhead
772
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
773
+ return await _get_request_no_lock_async(request_id, fields)
517
774
 
518
775
 
519
776
  @init_db
520
- def get_request_tasks(
521
- status: Optional[List[RequestStatus]] = None,
522
- cluster_names: Optional[List[str]] = None,
523
- user_id: Optional[str] = None,
524
- exclude_request_names: Optional[List[str]] = None,
525
- include_request_names: Optional[List[str]] = None,
526
- ) -> List[Request]:
527
- """Get a list of requests that match the given filters.
777
+ @metrics_lib.time_me
778
+ def get_requests_with_prefix(
779
+ request_id_prefix: str,
780
+ fields: Optional[List[str]] = None) -> Optional[List[Request]]:
781
+ """Get requests with a given request ID prefix."""
782
+ assert _DB is not None
783
+ if fields:
784
+ columns_str = ', '.join(fields)
785
+ else:
786
+ columns_str = ', '.join(REQUEST_COLUMNS)
787
+ with _DB.conn:
788
+ cursor = _DB.conn.cursor()
789
+ cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
790
+ 'WHERE request_id LIKE ?'), (request_id_prefix + '%',))
791
+ rows = cursor.fetchall()
792
+ if not rows:
793
+ return None
794
+ if fields:
795
+ rows = [_update_request_row_fields(row, fields) for row in rows]
796
+ return [Request.from_row(row) for row in rows]
797
+
798
+
799
+ @init_db_async
800
+ @metrics_lib.time_me_async
801
+ @asyncio_utils.shield
802
+ async def get_requests_async_with_prefix(
803
+ request_id_prefix: str,
804
+ fields: Optional[List[str]] = None) -> Optional[List[Request]]:
805
+ """Async version of get_request_with_prefix."""
806
+ assert _DB is not None
807
+ if fields:
808
+ columns_str = ', '.join(fields)
809
+ else:
810
+ columns_str = ', '.join(REQUEST_COLUMNS)
811
+ async with _DB.execute_fetchall_async(
812
+ (f'SELECT {columns_str} FROM {REQUEST_TABLE} '
813
+ 'WHERE request_id LIKE ?'), (request_id_prefix + '%',)) as rows:
814
+ if not rows:
815
+ return None
816
+ if fields:
817
+ rows = [_update_request_row_fields(row, fields) for row in rows]
818
+ return [Request.from_row(row) for row in rows]
819
+
820
+
821
+ class StatusWithMsg(NamedTuple):
822
+ status: RequestStatus
823
+ status_msg: Optional[str] = None
824
+
825
+
826
+ @init_db_async
827
+ @metrics_lib.time_me_async
828
+ async def get_request_status_async(
829
+ request_id: str,
830
+ include_msg: bool = False,
831
+ ) -> Optional[StatusWithMsg]:
832
+ """Get the status of a request.
833
+
834
+ Args:
835
+ request_id: The ID of the request.
836
+ include_msg: Whether to include the status message.
837
+
838
+ Returns:
839
+ The status of the request. If the request is not found, returns
840
+ None.
841
+ """
842
+ assert _DB is not None
843
+ columns = 'status'
844
+ if include_msg:
845
+ columns += ', status_msg'
846
+ sql = f'SELECT {columns} FROM {REQUEST_TABLE} WHERE request_id LIKE ?'
847
+ async with _DB.execute_fetchall_async(sql, (request_id + '%',)) as rows:
848
+ if rows is None or len(rows) == 0:
849
+ return None
850
+ status = RequestStatus(rows[0][0])
851
+ status_msg = rows[0][1] if include_msg else None
852
+ return StatusWithMsg(status, status_msg)
853
+
854
+
855
+ @init_db_async
856
+ @metrics_lib.time_me_async
857
+ @asyncio_utils.shield
858
+ async def create_if_not_exists_async(request: Request) -> bool:
859
+ """Create a request if it does not exist, otherwise do nothing.
860
+
861
+ Returns:
862
+ True if a new request is created, False if the request already exists.
863
+ """
864
+ assert _DB is not None
865
+ request_columns = ', '.join(REQUEST_COLUMNS)
866
+ values_str = ', '.join(['?'] * len(REQUEST_COLUMNS))
867
+ sql_statement = (
868
+ f'INSERT INTO {REQUEST_TABLE} '
869
+ f'({request_columns}) VALUES '
870
+ f'({values_str}) ON CONFLICT(request_id) DO NOTHING RETURNING ROWID')
871
+ request_row = request.to_row()
872
+ # Execute the SQL statement without getting the request lock.
873
+ # The request lock is used to prevent racing with cancellation codepath,
874
+ # but a request cannot be cancelled before it is created.
875
+ row = await _DB.execute_get_returning_value_async(sql_statement,
876
+ request_row)
877
+ return True if row else False
878
+
879
+
880
+ @dataclasses.dataclass
881
+ class RequestTaskFilter:
882
+ """Filter for requests.
528
883
 
529
884
  Args:
530
885
  status: a list of statuses of the requests to filter on.
@@ -535,74 +890,315 @@ def get_request_tasks(
535
890
  If None, all users are included.
536
891
  include_request_names: a list of request names to filter on.
537
892
  Mutually exclusive with exclude_request_names.
893
+ finished_before: if provided, only include requests finished before this
894
+ timestamp.
895
+ limit: the maximum number of requests to return. If None, no limit is applied.
538
896
 
539
897
  Raises:
540
898
  ValueError: If both exclude_request_names and include_request_names are
541
899
  provided.
542
900
  """
543
- if exclude_request_names is not None and include_request_names is not None:
544
- raise ValueError(
545
- 'Only one of exclude_request_names or include_request_names can be '
546
- 'provided, not both.')
547
-
548
- filters = []
549
- filter_params = []
550
- if status is not None:
551
- status_list_str = ','.join(repr(status.value) for status in status)
552
- filters.append(f'status IN ({status_list_str})')
553
- if exclude_request_names is not None:
554
- exclude_request_names_str = ','.join(
555
- repr(name) for name in exclude_request_names)
556
- filters.append(f'name NOT IN ({exclude_request_names_str})')
557
- if cluster_names is not None:
558
- cluster_names_str = ','.join(repr(name) for name in cluster_names)
559
- filters.append(f'{COL_CLUSTER_NAME} IN ({cluster_names_str})')
560
- if user_id is not None:
561
- filters.append(f'{COL_USER_ID} = ?')
562
- filter_params.append(user_id)
563
- if include_request_names is not None:
564
- request_names_str = ','.join(
565
- repr(name) for name in include_request_names)
566
- filters.append(f'name IN ({request_names_str})')
567
- assert _DB is not None
568
- with _DB.conn:
569
- cursor = _DB.conn.cursor()
901
+ status: Optional[List[RequestStatus]] = None
902
+ cluster_names: Optional[List[str]] = None
903
+ user_id: Optional[str] = None
904
+ exclude_request_names: Optional[List[str]] = None
905
+ include_request_names: Optional[List[str]] = None
906
+ finished_before: Optional[float] = None
907
+ limit: Optional[int] = None
908
+ fields: Optional[List[str]] = None
909
+ sort: bool = False
910
+
911
+ def __post_init__(self):
912
+ if (self.exclude_request_names is not None and
913
+ self.include_request_names is not None):
914
+ raise ValueError(
915
+ 'Only one of exclude_request_names or include_request_names '
916
+ 'can be provided, not both.')
917
+
918
+ def build_query(self) -> Tuple[str, List[Any]]:
919
+ """Build the SQL query and filter parameters.
920
+
921
+ Returns:
922
+ A tuple of (SQL, SQL parameters).
923
+ """
924
+ filters = []
925
+ filter_params: List[Any] = []
926
+ if self.status is not None:
927
+ status_list_str = ','.join(
928
+ repr(status.value) for status in self.status)
929
+ filters.append(f'status IN ({status_list_str})')
930
+ if self.include_request_names is not None:
931
+ request_names_str = ','.join(
932
+ repr(name) for name in self.include_request_names)
933
+ filters.append(f'name IN ({request_names_str})')
934
+ if self.exclude_request_names is not None:
935
+ exclude_request_names_str = ','.join(
936
+ repr(name) for name in self.exclude_request_names)
937
+ filters.append(f'name NOT IN ({exclude_request_names_str})')
938
+ if self.cluster_names is not None:
939
+ cluster_names_str = ','.join(
940
+ repr(name) for name in self.cluster_names)
941
+ filters.append(f'{COL_CLUSTER_NAME} IN ({cluster_names_str})')
942
+ if self.user_id is not None:
943
+ filters.append(f'{COL_USER_ID} = ?')
944
+ filter_params.append(self.user_id)
945
+ if self.finished_before is not None:
946
+ filters.append('finished_at < ?')
947
+ filter_params.append(self.finished_before)
570
948
  filter_str = ' AND '.join(filters)
571
949
  if filter_str:
572
950
  filter_str = f' WHERE {filter_str}'
573
951
  columns_str = ', '.join(REQUEST_COLUMNS)
574
- cursor.execute(
575
- f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
576
- 'ORDER BY created_at DESC', filter_params)
952
+ if self.fields:
953
+ columns_str = ', '.join(self.fields)
954
+ sort_str = ''
955
+ if self.sort:
956
+ sort_str = ' ORDER BY created_at DESC'
957
+ query_str = (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str}'
958
+ f'{sort_str}')
959
+ if self.limit is not None:
960
+ query_str += f' LIMIT {self.limit}'
961
+ return query_str, filter_params
962
+
963
+
964
+ @init_db
965
+ @metrics_lib.time_me
966
+ def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
967
+ """Get a list of requests that match the given filters.
968
+
969
+ Args:
970
+ req_filter: the filter to apply to the requests. Refer to
971
+ RequestTaskFilter for the details.
972
+ """
973
+ assert _DB is not None
974
+ with _DB.conn:
975
+ cursor = _DB.conn.cursor()
976
+ cursor.execute(*req_filter.build_query())
577
977
  rows = cursor.fetchall()
578
978
  if rows is None:
579
979
  return []
580
- requests = []
581
- for row in rows:
582
- request = Request.from_row(row)
583
- requests.append(request)
584
- return requests
980
+ if req_filter.fields:
981
+ rows = [
982
+ _update_request_row_fields(row, req_filter.fields) for row in rows
983
+ ]
984
+ return [Request.from_row(row) for row in rows]
985
+
986
+
987
+ @init_db_async
988
+ @metrics_lib.time_me_async
989
+ async def get_request_tasks_async(
990
+ req_filter: RequestTaskFilter) -> List[Request]:
991
+ """Async version of get_request_tasks."""
992
+ assert _DB is not None
993
+ async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
994
+ if not rows:
995
+ return []
996
+ if req_filter.fields:
997
+ rows = [
998
+ _update_request_row_fields(row, req_filter.fields) for row in rows
999
+ ]
1000
+ return [Request.from_row(row) for row in rows]
1001
+
1002
+
1003
+ @init_db_async
1004
+ @metrics_lib.time_me_async
1005
+ async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
1006
+ """Get a list of API request ids for shell completion."""
1007
+ assert _DB is not None
1008
+ # Prioritize alive requests (PENDING, RUNNING) over finished ones,
1009
+ # then order by creation time (newest first) within each category.
1010
+ async with _DB.execute_fetchall_async(
1011
+ f"""SELECT request_id FROM {REQUEST_TABLE}
1012
+ WHERE request_id LIKE ?
1013
+ ORDER BY
1014
+ CASE
1015
+ WHEN status IN ('PENDING', 'RUNNING') THEN 0
1016
+ ELSE 1
1017
+ END,
1018
+ created_at DESC
1019
+ LIMIT 1000""", (f'{incomplete}%',)) as rows:
1020
+ if not rows:
1021
+ return []
1022
+ return [row[0] for row in rows]
1023
+
1024
+
1025
+ _add_or_update_request_sql = (f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
1026
+ f'({", ".join(REQUEST_COLUMNS)}) VALUES '
1027
+ f'({", ".join(["?"] * len(REQUEST_COLUMNS))})')
585
1028
 
586
1029
 
587
1030
  def _add_or_update_request_no_lock(request: Request):
588
1031
  """Add or update a REST request into the database."""
589
- row = request.to_row()
590
- key_str = ', '.join(REQUEST_COLUMNS)
591
- fill_str = ', '.join(['?'] * len(row))
592
1032
  assert _DB is not None
593
1033
  with _DB.conn:
594
1034
  cursor = _DB.conn.cursor()
595
- cursor.execute(
596
- f'INSERT OR REPLACE INTO {REQUEST_TABLE} ({key_str}) '
597
- f'VALUES ({fill_str})', row)
1035
+ cursor.execute(_add_or_update_request_sql, request.to_row())
598
1036
 
599
1037
 
600
- def set_request_failed(request_id: str, e: BaseException) -> None:
601
- """Set a request to failed and populate the error message."""
1038
+ async def _add_or_update_request_no_lock_async(request: Request):
1039
+ """Async version of _add_or_update_request_no_lock."""
1040
+ assert _DB is not None
1041
+ await _DB.execute_and_commit_async(_add_or_update_request_sql,
1042
+ request.to_row())
1043
+
1044
+
1045
+ def set_exception_stacktrace(e: BaseException) -> None:
602
1046
  with ux_utils.enable_traceback():
603
1047
  stacktrace = traceback.format_exc()
604
1048
  setattr(e, 'stacktrace', stacktrace)
1049
+
1050
+
1051
+ def set_request_failed(request_id: str, e: BaseException) -> None:
1052
+ """Set a request to failed and populate the error message."""
1053
+ set_exception_stacktrace(e)
605
1054
  with update_request(request_id) as request_task:
606
1055
  assert request_task is not None, request_id
607
1056
  request_task.status = RequestStatus.FAILED
1057
+ request_task.finished_at = time.time()
608
1058
  request_task.set_error(e)
1059
+
1060
+
1061
+ @init_db_async
1062
+ @metrics_lib.time_me_async
1063
+ @asyncio_utils.shield
1064
+ async def set_request_failed_async(request_id: str, e: BaseException) -> None:
1065
+ """Set a request to failed and populate the error message."""
1066
+ set_exception_stacktrace(e)
1067
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
1068
+ request_task = await _get_request_no_lock_async(request_id)
1069
+ assert request_task is not None, request_id
1070
+ request_task.status = RequestStatus.FAILED
1071
+ request_task.finished_at = time.time()
1072
+ request_task.set_error(e)
1073
+ await _add_or_update_request_no_lock_async(request_task)
1074
+
1075
+
1076
+ def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
1077
+ """Set a request to succeeded and populate the result."""
1078
+ with update_request(request_id) as request_task:
1079
+ assert request_task is not None, request_id
1080
+ request_task.status = RequestStatus.SUCCEEDED
1081
+ request_task.finished_at = time.time()
1082
+ if result is not None:
1083
+ request_task.set_return_value(result)
1084
+
1085
+
1086
+ @init_db_async
1087
+ @metrics_lib.time_me_async
1088
+ @asyncio_utils.shield
1089
+ async def set_request_succeeded_async(request_id: str,
1090
+ result: Optional[Any]) -> None:
1091
+ """Set a request to succeeded and populate the result."""
1092
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
1093
+ request_task = await _get_request_no_lock_async(request_id)
1094
+ assert request_task is not None, request_id
1095
+ request_task.status = RequestStatus.SUCCEEDED
1096
+ request_task.finished_at = time.time()
1097
+ if result is not None:
1098
+ request_task.set_return_value(result)
1099
+ await _add_or_update_request_no_lock_async(request_task)
1100
+
1101
+
1102
+ @init_db_async
1103
+ @metrics_lib.time_me_async
1104
+ @asyncio_utils.shield
1105
+ async def set_request_cancelled_async(request_id: str) -> None:
1106
+ """Set a pending or running request to cancelled."""
1107
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
1108
+ request_task = await _get_request_no_lock_async(request_id)
1109
+ assert request_task is not None, request_id
1110
+ # Already finished or cancelled.
1111
+ if request_task.status > RequestStatus.RUNNING:
1112
+ return
1113
+ request_task.finished_at = time.time()
1114
+ request_task.status = RequestStatus.CANCELLED
1115
+ await _add_or_update_request_no_lock_async(request_task)
1116
+
1117
+
1118
+ @init_db
1119
+ @metrics_lib.time_me
1120
+ async def _delete_requests(request_ids: List[str]):
1121
+ """Clean up requests by their IDs."""
1122
+ id_list_str = ','.join(repr(request_id) for request_id in request_ids)
1123
+ assert _DB is not None
1124
+ await _DB.execute_and_commit_async(
1125
+ f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
1126
+
1127
+
1128
+ async def clean_finished_requests_with_retention(retention_seconds: int,
1129
+ batch_size: int = 1000):
1130
+ """Clean up finished requests older than the retention period.
1131
+
1132
+ This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
1133
+ from the database and cleans up their associated log files.
1134
+
1135
+ Args:
1136
+ retention_seconds: Requests older than this many seconds will be
1137
+ deleted.
1138
+ batch_size: batch delete 'batch_size' requests at a time to
1139
+ avoid using too much memory at once and to let each
1140
+ db query complete in a reasonable time. All stale
1141
+ requests older than the retention period will be deleted
1142
+ regardless of the batch size.
1143
+ """
1144
+ total_deleted = 0
1145
+ while True:
1146
+ reqs = await get_request_tasks_async(
1147
+ req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
1148
+ finished_before=time.time() -
1149
+ retention_seconds,
1150
+ limit=batch_size,
1151
+ fields=['request_id']))
1152
+ if len(reqs) == 0:
1153
+ break
1154
+ futs = []
1155
+ for req in reqs:
1156
+ # req.log_path is derived from request_id,
1157
+ # so it's ok to just grab the request_id in the above query.
1158
+ futs.append(
1159
+ asyncio.create_task(
1160
+ anyio.Path(
1161
+ req.log_path.absolute()).unlink(missing_ok=True)))
1162
+ await asyncio.gather(*futs)
1163
+
1164
+ await _delete_requests([req.request_id for req in reqs])
1165
+ total_deleted += len(reqs)
1166
+ if len(reqs) < batch_size:
1167
+ break
1168
+
1169
+ # To avoid leaking log files, each batch above deletes its logs before
1170
+ # deleting the corresponding request rows from the database.
1171
+ logger.info(f'Cleaned up {total_deleted} finished requests '
1172
+ f'older than {retention_seconds} seconds')
1173
+
1174
+
1175
+ async def requests_gc_daemon():
1176
+ """Garbage collect finished requests periodically."""
1177
+ while True:
1178
+ logger.info('Running requests GC daemon...')
1179
+ # Use the latest config.
1180
+ skypilot_config.reload_config()
1181
+ retention_seconds = skypilot_config.get_nested(
1182
+ ('api_server', 'requests_retention_hours'),
1183
+ DEFAULT_REQUESTS_RETENTION_HOURS) * 3600
1184
+ try:
1185
+ # Negative value disables the requests GC
1186
+ if retention_seconds >= 0:
1187
+ await clean_finished_requests_with_retention(retention_seconds)
1188
+ except asyncio.CancelledError:
1189
+ logger.info('Requests GC daemon cancelled')
1190
+ break
1191
+ except Exception as e: # pylint: disable=broad-except
1192
+ logger.error(f'Error running requests GC daemon: {e}'
1193
+ f'traceback: {traceback.format_exc()}')
1194
+ # Sleep for the retention period, but never less than one hour, to
1195
+ # avoid running the cleanup too frequently.
1196
+ await asyncio.sleep(max(retention_seconds, 3600))
1197
+
1198
+
1199
+ def _cleanup():
1200
+ if _DB is not None:
1201
+ asyncio.run(_DB.close())
1202
+
1203
+
1204
+ atexit.register(_cleanup)