skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -1,34 +1,42 @@
1
1
  """Utilities for REST API."""
2
+ import asyncio
3
+ import atexit
2
4
  import contextlib
3
5
  import dataclasses
4
6
  import enum
5
7
  import functools
6
- import json
7
8
  import os
8
9
  import pathlib
9
10
  import shutil
10
11
  import signal
11
12
  import sqlite3
13
+ import threading
12
14
  import time
13
15
  import traceback
14
- from typing import Any, Callable, Dict, List, Optional, Tuple
16
+ from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
17
+ Tuple)
18
+ import uuid
15
19
 
20
+ import anyio
16
21
  import colorama
17
22
  import filelock
23
+ import orjson
18
24
 
19
25
  from sky import exceptions
20
26
  from sky import global_user_state
21
27
  from sky import sky_logging
28
+ from sky import skypilot_config
29
+ from sky.metrics import utils as metrics_lib
22
30
  from sky.server import common as server_common
23
31
  from sky.server import constants as server_constants
32
+ from sky.server import daemons
24
33
  from sky.server.requests import payloads
25
34
  from sky.server.requests.serializers import decoders
26
35
  from sky.server.requests.serializers import encoders
27
- from sky.utils import common
36
+ from sky.utils import asyncio_utils
28
37
  from sky.utils import common_utils
29
- from sky.utils import db_utils
30
- from sky.utils import env_options
31
38
  from sky.utils import ux_utils
39
+ from sky.utils.db import db_utils
32
40
 
33
41
  logger = sky_logging.init_logger(__name__)
34
42
 
@@ -37,8 +45,12 @@ REQUEST_TABLE = 'requests'
37
45
  COL_CLUSTER_NAME = 'cluster_name'
38
46
  COL_USER_ID = 'user_id'
39
47
  COL_STATUS_MSG = 'status_msg'
48
+ COL_SHOULD_RETRY = 'should_retry'
49
+ COL_FINISHED_AT = 'finished_at'
40
50
  REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
41
51
 
52
+ DEFAULT_REQUESTS_RETENTION_HOURS = 24 # 1 day
53
+
42
54
  # TODO(zhwu): For scalability, there are several TODOs:
43
55
  # [x] Have a way to queue requests.
44
56
  # [ ] Move logs to persistent place.
@@ -62,6 +74,10 @@ class RequestStatus(enum.Enum):
62
74
  color = _STATUS_TO_COLOR[self]
63
75
  return f'{color}{self.value}{colorama.Style.RESET_ALL}'
64
76
 
77
+ @classmethod
78
+ def finished_status(cls) -> List['RequestStatus']:
79
+ return [cls.SUCCEEDED, cls.FAILED, cls.CANCELLED]
80
+
65
81
 
66
82
  _STATUS_TO_COLOR = {
67
83
  RequestStatus.PENDING: colorama.Fore.BLUE,
@@ -85,6 +101,8 @@ REQUEST_COLUMNS = [
85
101
  'schedule_type',
86
102
  COL_USER_ID,
87
103
  COL_STATUS_MSG,
104
+ COL_SHOULD_RETRY,
105
+ COL_FINISHED_AT,
88
106
  ]
89
107
 
90
108
 
@@ -95,27 +113,6 @@ class ScheduleType(enum.Enum):
95
113
  SHORT = 'short'
96
114
 
97
115
 
98
- @dataclasses.dataclass
99
- class RequestPayload:
100
- """The payload for the requests."""
101
-
102
- request_id: str
103
- name: str
104
- entrypoint: str
105
- request_body: str
106
- status: str
107
- created_at: float
108
- user_id: str
109
- return_value: str
110
- error: str
111
- pid: Optional[int]
112
- schedule_type: str
113
- user_name: Optional[str] = None
114
- # Resources the request operates on.
115
- cluster_name: Optional[str] = None
116
- status_msg: Optional[str] = None
117
-
118
-
119
116
  @dataclasses.dataclass
120
117
  class Request:
121
118
  """A SkyPilot API request."""
@@ -136,6 +133,10 @@ class Request:
136
133
  cluster_name: Optional[str] = None
137
134
  # Status message of the request, indicates the reason of current status.
138
135
  status_msg: Optional[str] = None
136
+ # Whether the request should be retried.
137
+ should_retry: bool = False
138
+ # When the request finished.
139
+ finished_at: Optional[float] = None
139
140
 
140
141
  @property
141
142
  def log_path(self) -> pathlib.Path:
@@ -179,7 +180,7 @@ class Request:
179
180
  @classmethod
180
181
  def from_row(cls, row: Tuple[Any, ...]) -> 'Request':
181
182
  content = dict(zip(REQUEST_COLUMNS, row))
182
- return cls.decode(RequestPayload(**content))
183
+ return cls.decode(payloads.RequestPayload(**content))
183
184
 
184
185
  def to_row(self) -> Tuple[Any, ...]:
185
186
  payload = self.encode()
@@ -188,7 +189,7 @@ class Request:
188
189
  row.append(getattr(payload, k))
189
190
  return tuple(row)
190
191
 
191
- def readable_encode(self) -> RequestPayload:
192
+ def readable_encode(self) -> payloads.RequestPayload:
192
193
  """Serialize the SkyPilot API request for display purposes.
193
194
 
194
195
  This function should be called on the server side to serialize the
@@ -204,15 +205,16 @@ class Request:
204
205
  """
205
206
  assert isinstance(self.request_body,
206
207
  payloads.RequestBody), (self.name, self.request_body)
207
- user_name = global_user_state.get_user(self.user_id).name
208
- return RequestPayload(
208
+ user = global_user_state.get_user(self.user_id)
209
+ user_name = user.name if user is not None else None
210
+ return payloads.RequestPayload(
209
211
  request_id=self.request_id,
210
212
  name=self.name,
211
213
  entrypoint=self.entrypoint.__name__,
212
214
  request_body=self.request_body.model_dump_json(),
213
215
  status=self.status.value,
214
- return_value=json.dumps(None),
215
- error=json.dumps(None),
216
+ return_value=orjson.dumps(None).decode('utf-8'),
217
+ error=orjson.dumps(None).decode('utf-8'),
216
218
  pid=None,
217
219
  created_at=self.created_at,
218
220
  schedule_type=self.schedule_type.value,
@@ -220,27 +222,31 @@ class Request:
220
222
  user_name=user_name,
221
223
  cluster_name=self.cluster_name,
222
224
  status_msg=self.status_msg,
225
+ should_retry=self.should_retry,
226
+ finished_at=self.finished_at,
223
227
  )
224
228
 
225
- def encode(self) -> RequestPayload:
229
+ def encode(self) -> payloads.RequestPayload:
226
230
  """Serialize the SkyPilot API request."""
227
231
  assert isinstance(self.request_body,
228
232
  payloads.RequestBody), (self.name, self.request_body)
229
233
  try:
230
- return RequestPayload(
234
+ return payloads.RequestPayload(
231
235
  request_id=self.request_id,
232
236
  name=self.name,
233
237
  entrypoint=encoders.pickle_and_encode(self.entrypoint),
234
238
  request_body=encoders.pickle_and_encode(self.request_body),
235
239
  status=self.status.value,
236
- return_value=json.dumps(self.return_value),
237
- error=json.dumps(self.error),
240
+ return_value=orjson.dumps(self.return_value).decode('utf-8'),
241
+ error=orjson.dumps(self.error).decode('utf-8'),
238
242
  pid=self.pid,
239
243
  created_at=self.created_at,
240
244
  schedule_type=self.schedule_type.value,
241
245
  user_id=self.user_id,
242
246
  cluster_name=self.cluster_name,
243
247
  status_msg=self.status_msg,
248
+ should_retry=self.should_retry,
249
+ finished_at=self.finished_at,
244
250
  )
245
251
  except (TypeError, ValueError) as e:
246
252
  # The error is unexpected, so we don't suppress the stack trace.
@@ -255,7 +261,7 @@ class Request:
255
261
  raise
256
262
 
257
263
  @classmethod
258
- def decode(cls, payload: RequestPayload) -> 'Request':
264
+ def decode(cls, payload: payloads.RequestPayload) -> 'Request':
259
265
  """Deserialize the SkyPilot API request."""
260
266
  try:
261
267
  return cls(
@@ -264,14 +270,16 @@ class Request:
264
270
  entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
265
271
  request_body=decoders.decode_and_unpickle(payload.request_body),
266
272
  status=RequestStatus(payload.status),
267
- return_value=json.loads(payload.return_value),
268
- error=json.loads(payload.error),
273
+ return_value=orjson.loads(payload.return_value),
274
+ error=orjson.loads(payload.error),
269
275
  pid=payload.pid,
270
276
  created_at=payload.created_at,
271
277
  schedule_type=ScheduleType(payload.schedule_type),
272
278
  user_id=payload.user_id,
273
279
  cluster_name=payload.cluster_name,
274
280
  status_msg=payload.status_msg,
281
+ should_retry=payload.should_retry,
282
+ finished_at=payload.finished_at,
275
283
  )
276
284
  except (TypeError, ValueError) as e:
277
285
  logger.error(
@@ -286,113 +294,104 @@ class Request:
286
294
  raise
287
295
 
288
296
 
289
- def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
290
- """Kill all pending and running requests for a cluster.
297
+ def get_new_request_id() -> str:
298
+ """Get a new request ID."""
299
+ return str(uuid.uuid4())
291
300
 
292
- Args:
293
- cluster_name: the name of the cluster.
294
- exclude_request_names: exclude requests with these names. This is to
295
- prevent killing the caller request.
296
- """
297
- request_ids = [
298
- request_task.request_id for request_task in get_request_tasks(
299
- cluster_names=[cluster_name],
300
- status=[RequestStatus.PENDING, RequestStatus.RUNNING],
301
- exclude_request_names=[exclude_request_name])
302
- ]
303
- kill_requests(request_ids)
304
301
 
302
+ def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
303
+ """Serialize the SkyPilot API request for display purposes.
305
304
 
306
- def refresh_cluster_status_event():
307
- """Periodically refresh the cluster status."""
308
- # pylint: disable=import-outside-toplevel
309
- from sky import core
310
-
311
- # Disable logging for periodic refresh to avoid the usage message being
312
- # sent multiple times.
313
- os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
314
-
315
- while True:
316
- logger.info('=== Refreshing cluster status ===')
317
- # This periodically refresh will hold the lock for the cluster being
318
- # refreshed, but it is OK because other operations will just wait for
319
- # the lock and get the just refreshed status without refreshing again.
320
- core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
321
- logger.info(
322
- 'Status refreshed. Sleeping '
323
- f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
324
- ' seconds for the next refresh...\n')
325
- time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
326
-
327
-
328
- @dataclasses.dataclass
329
- class InternalRequestDaemon:
330
- id: str
331
- name: str
332
- event_fn: Callable[[], None]
333
-
334
-
335
- # Register the events to run in the background.
336
- INTERNAL_REQUEST_DAEMONS = [
337
- # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
338
- # set to updated status automatically, without showing users the hint of
339
- # cluster being stopped or down when `sky status -r` is called.
340
- InternalRequestDaemon(id='skypilot-status-refresh-daemon',
341
- name='status',
342
- event_fn=refresh_cluster_status_event)
343
- ]
344
-
345
-
346
- def kill_requests(request_ids: Optional[List[str]] = None,
347
- user_id: Optional[str] = None) -> List[str]:
348
- """Kill a SkyPilot API request and set its status to cancelled.
349
-
350
- Args:
351
- request_ids: The request IDs to kill. If None, all requests for the
352
- user are killed.
353
- user_id: The user ID to kill requests for. If None, all users are
354
- killed.
355
-
356
- Returns:
357
- A list of request IDs that were cancelled.
358
- """
359
- if request_ids is None:
360
- request_ids = [
361
- request_task.request_id for request_task in get_request_tasks(
362
- user_id=user_id,
363
- status=[RequestStatus.RUNNING, RequestStatus.PENDING],
364
- # Avoid cancelling the cancel request itself.
365
- exclude_request_names=['sky.api_cancel'])
366
- ]
367
- cancelled_request_ids = []
368
- for request_id in request_ids:
369
- with update_request(request_id) as request_record:
370
- if request_record is None:
371
- logger.debug(f'No request ID {request_id}')
372
- continue
373
- # Skip internal requests. The internal requests are scheduled with
374
- # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
375
- if request_record.request_id in set(
376
- event.id for event in INTERNAL_REQUEST_DAEMONS):
377
- continue
378
- if request_record.status > RequestStatus.RUNNING:
379
- logger.debug(f'Request {request_id} already finished')
380
- continue
381
- if request_record.pid is not None:
382
- logger.debug(f'Killing request process {request_record.pid}')
383
- # Use SIGTERM instead of SIGKILL:
384
- # - The executor can handle SIGTERM gracefully
385
- # - After SIGTERM, the executor can reuse the request process
386
- # for other requests, avoiding the overhead of forking a new
387
- # process for each request.
388
- os.kill(request_record.pid, signal.SIGTERM)
389
- request_record.status = RequestStatus.CANCELLED
390
- cancelled_request_ids.append(request_id)
391
- return cancelled_request_ids
305
+ This function should be called on the server side to serialize the
306
+ request body into human readable format, e.g., the entrypoint should
307
+ be a string, and the pid, error, or return value are not needed.
392
308
 
309
+ The returned value will then be displayed on the client side in request
310
+ table.
393
311
 
394
- _DB_PATH = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
395
- pathlib.Path(_DB_PATH).parents[0].mkdir(parents=True, exist_ok=True)
312
+ We do not use `encode` for display to avoid a large amount of data being
313
+ sent to the client side, especially since the request table could include
314
+ all the requests.
315
+ """
316
+ encoded_requests = []
317
+ all_users = global_user_state.get_all_users()
318
+ all_users_map = {user.id: user.name for user in all_users}
319
+ for request in requests:
320
+ if request.request_body is not None:
321
+ assert isinstance(request.request_body,
322
+ payloads.RequestBody), (request.name,
323
+ request.request_body)
324
+ user_name = all_users_map.get(request.user_id)
325
+ payload = payloads.RequestPayload(
326
+ request_id=request.request_id,
327
+ name=request.name,
328
+ entrypoint=request.entrypoint.__name__
329
+ if request.entrypoint is not None else '',
330
+ request_body=request.request_body.model_dump_json()
331
+ if request.request_body is not None else
332
+ orjson.dumps(None).decode('utf-8'),
333
+ status=request.status.value,
334
+ return_value=orjson.dumps(None).decode('utf-8'),
335
+ error=orjson.dumps(None).decode('utf-8'),
336
+ pid=None,
337
+ created_at=request.created_at,
338
+ schedule_type=request.schedule_type.value,
339
+ user_id=request.user_id,
340
+ user_name=user_name,
341
+ cluster_name=request.cluster_name,
342
+ status_msg=request.status_msg,
343
+ should_retry=request.should_retry,
344
+ finished_at=request.finished_at,
345
+ )
346
+ encoded_requests.append(payload)
347
+ return encoded_requests
348
+
349
+
350
+ def _update_request_row_fields(
351
+ row: Tuple[Any, ...],
352
+ fields: Optional[List[str]] = None) -> Tuple[Any, ...]:
353
+ """Update the request row fields."""
354
+ if not fields:
355
+ return row
356
+
357
+ # Convert tuple to dictionary for easier manipulation
358
+ content = dict(zip(fields, row))
359
+
360
+ # Required fields in RequestPayload
361
+ if 'request_id' not in fields:
362
+ content['request_id'] = ''
363
+ if 'name' not in fields:
364
+ content['name'] = ''
365
+ if 'entrypoint' not in fields:
366
+ content['entrypoint'] = server_constants.EMPTY_PICKLED_VALUE
367
+ if 'request_body' not in fields:
368
+ content['request_body'] = server_constants.EMPTY_PICKLED_VALUE
369
+ if 'status' not in fields:
370
+ content['status'] = RequestStatus.PENDING.value
371
+ if 'created_at' not in fields:
372
+ content['created_at'] = 0
373
+ if 'user_id' not in fields:
374
+ content['user_id'] = ''
375
+ if 'return_value' not in fields:
376
+ content['return_value'] = orjson.dumps(None).decode('utf-8')
377
+ if 'error' not in fields:
378
+ content['error'] = orjson.dumps(None).decode('utf-8')
379
+ if 'schedule_type' not in fields:
380
+ content['schedule_type'] = ScheduleType.SHORT.value
381
+ # Optional fields in RequestPayload
382
+ if 'pid' not in fields:
383
+ content['pid'] = None
384
+ if 'cluster_name' not in fields:
385
+ content['cluster_name'] = None
386
+ if 'status_msg' not in fields:
387
+ content['status_msg'] = None
388
+ if 'should_retry' not in fields:
389
+ content['should_retry'] = False
390
+ if 'finished_at' not in fields:
391
+ content['finished_at'] = None
392
+
393
+ # Convert back to tuple in the same order as REQUEST_COLUMNS
394
+ return tuple(content[col] for col in REQUEST_COLUMNS)
396
395
 
397
396
 
398
397
  def create_table(cursor, conn):
@@ -425,13 +424,45 @@ def create_table(cursor, conn):
425
424
  {COL_CLUSTER_NAME} TEXT,
426
425
  schedule_type TEXT,
427
426
  {COL_USER_ID} TEXT,
428
- {COL_STATUS_MSG} TEXT)""")
427
+ {COL_STATUS_MSG} TEXT,
428
+ {COL_SHOULD_RETRY} INTEGER,
429
+ {COL_FINISHED_AT} REAL
430
+ )""")
429
431
 
430
432
  db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
431
433
  'TEXT')
434
+ db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
435
+ 'INTEGER')
436
+ db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
437
+ 'REAL')
438
+
439
+ # Add an index on (status, name) to speed up queries
440
+ # that filter on these columns.
441
+ cursor.execute(f"""\
442
+ CREATE INDEX IF NOT EXISTS status_name_idx ON {REQUEST_TABLE} (status, name) WHERE status IN ('PENDING', 'RUNNING');
443
+ """)
444
+ # Add an index on cluster_name to speed up queries
445
+ # that filter on this column.
446
+ cursor.execute(f"""\
447
+ CREATE INDEX IF NOT EXISTS cluster_name_idx ON {REQUEST_TABLE} ({COL_CLUSTER_NAME}) WHERE status IN ('PENDING', 'RUNNING');
448
+ """)
449
+ # Add an index on created_at to speed up queries that sort on this column.
450
+ cursor.execute(f"""\
451
+ CREATE INDEX IF NOT EXISTS created_at_idx ON {REQUEST_TABLE} (created_at);
452
+ """)
432
453
 
433
454
 
434
455
  _DB = None
456
+ _init_db_lock = threading.Lock()
457
+
458
+
459
+ def _init_db_within_lock():
460
+ global _DB
461
+ if _DB is None:
462
+ db_path = os.path.expanduser(
463
+ server_constants.API_SERVER_REQUEST_DB_PATH)
464
+ pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
465
+ _DB = db_utils.SQLiteConn(db_path, create_table)
435
466
 
436
467
 
437
468
  def init_db(func):
@@ -439,21 +470,65 @@ def init_db(func):
439
470
 
440
471
  @functools.wraps(func)
441
472
  def wrapper(*args, **kwargs):
442
- global _DB
443
- if _DB is None:
444
- _DB = db_utils.SQLiteConn(_DB_PATH, create_table)
473
+ if _DB is not None:
474
+ return func(*args, **kwargs)
475
+ with _init_db_lock:
476
+ _init_db_within_lock()
445
477
  return func(*args, **kwargs)
446
478
 
447
479
  return wrapper
448
480
 
449
481
 
482
+ def init_db_async(func):
483
+ """Async version of init_db."""
484
+
485
+ @functools.wraps(func)
486
+ async def wrapper(*args, **kwargs):
487
+ if _DB is not None:
488
+ return await func(*args, **kwargs)
489
+ # If _DB is not initialized, init_db_async will be blocked if there
490
+ # is a thread initializing _DB, this is fine since it occurs on process
491
+ # startup.
492
+ with _init_db_lock:
493
+ _init_db_within_lock()
494
+ return await func(*args, **kwargs)
495
+
496
+ return wrapper
497
+
498
+
450
499
  def reset_db_and_logs():
451
500
  """Clear the API server database and logs, then recreate the database."""
501
+ logger.debug('clearing local API server database')
452
502
  server_common.clear_local_api_server_database()
503
+ logger.debug(
504
+ f'clearing local API server logs directory at {REQUEST_LOG_PATH_PREFIX}'
505
+ )
453
506
  shutil.rmtree(pathlib.Path(REQUEST_LOG_PATH_PREFIX).expanduser(),
454
507
  ignore_errors=True)
508
+ logger.debug('clearing local API server client directory at '
509
+ f'{server_common.API_SERVER_CLIENT_DIR.expanduser()}')
455
510
  shutil.rmtree(server_common.API_SERVER_CLIENT_DIR.expanduser(),
456
511
  ignore_errors=True)
512
+ with _init_db_lock:
513
+ _init_db_within_lock()
514
+ assert _DB is not None
515
+ with _DB.conn:
516
+ cursor = _DB.conn.cursor()
517
+ cursor.execute('SELECT sqlite_version()')
518
+ row = cursor.fetchone()
519
+ if row is None:
520
+ raise RuntimeError('Failed to get SQLite version')
521
+ version_str = row[0]
522
+ version_parts = version_str.split('.')
523
+ assert len(version_parts) >= 2, \
524
+ f'Invalid version string: {version_str}'
525
+ major, minor = int(version_parts[0]), int(version_parts[1])
526
+ # SQLite 3.35.0+ supports RETURNING statements.
527
+ # 3.35.0 was released in March 2021.
528
+ if not ((major > 3) or (major == 3 and minor >= 35)):
529
+ raise RuntimeError(
530
+ f'SQLite version {version_str} is not supported. '
531
+ 'Please upgrade to SQLite 3.35.0 or later.')
457
532
 
458
533
 
459
534
  def request_lock_path(request_id: str) -> str:
@@ -462,69 +537,348 @@ def request_lock_path(request_id: str) -> str:
462
537
  return os.path.join(lock_path, f'.{request_id}.lock')
463
538
 
464
539
 
465
- @contextlib.contextmanager
466
- @init_db
467
- def update_request(request_id: str):
468
- """Get a SkyPilot API request."""
469
- request = _get_request_no_lock(request_id)
470
- yield request
471
- if request is not None:
472
- _add_or_update_request_no_lock(request)
540
+ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
541
+ """Kill all pending and running requests for a cluster.
542
+
543
+ Args:
544
+ cluster_name: the name of the cluster.
545
+ exclude_request_name: exclude requests with this name. This is to
546
+ prevent killing the caller request.
547
+ """
548
+ request_ids = [
549
+ request_task.request_id
550
+ for request_task in get_request_tasks(req_filter=RequestTaskFilter(
551
+ status=[RequestStatus.PENDING, RequestStatus.RUNNING],
552
+ exclude_request_names=[exclude_request_name],
553
+ cluster_names=[cluster_name],
554
+ fields=['request_id']))
555
+ ]
556
+ _kill_requests(request_ids)
557
+
558
+
559
+ def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
560
+ user_id: Optional[str] = None) -> List[str]:
561
+ """Kill requests with a given request ID prefix."""
562
+ expanded_request_ids: Optional[List[str]] = None
563
+ if request_ids is not None:
564
+ expanded_request_ids = []
565
+ for request_id in request_ids:
566
+ request_tasks = get_requests_with_prefix(request_id,
567
+ fields=['request_id'])
568
+ if request_tasks is None or len(request_tasks) == 0:
569
+ continue
570
+ if len(request_tasks) > 1:
571
+ raise ValueError(f'Multiple requests found for '
572
+ f'request ID prefix: {request_id}')
573
+ expanded_request_ids.append(request_tasks[0].request_id)
574
+ return _kill_requests(request_ids=expanded_request_ids, user_id=user_id)
575
+
576
+
577
+ # needed for backward compatibility. Remove by v0.10.7 or v0.11.0
578
+ kill_requests = kill_requests_with_prefix
579
+
580
+
581
+ def _should_kill_request(request_id: str,
582
+ request_record: Optional[Request]) -> bool:
583
+ if request_record is None:
584
+ logger.debug(f'No request ID {request_id}')
585
+ return False
586
+ # Skip internal requests. The internal requests are scheduled with
587
+ # the IDs of the entries in daemons.INTERNAL_REQUEST_DAEMONS.
588
+ if request_record.request_id in set(
589
+ event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
590
+ return False
591
+ if request_record.status > RequestStatus.RUNNING:
592
+ logger.debug(f'Request {request_id} already finished')
593
+ return False
594
+ return True
595
+
596
+
597
+ def _kill_requests(request_ids: Optional[List[str]] = None,
598
+ user_id: Optional[str] = None) -> List[str]:
599
+ """Kill a SkyPilot API request and set its status to cancelled.
600
+
601
+ Args:
602
+ request_ids: The request IDs to kill. If None, all requests for the
603
+ user are killed.
604
+ user_id: The user ID to kill requests for. If None, requests of all
605
+ users are killed.
606
+
607
+ Returns:
608
+ A list of request IDs that were cancelled.
609
+ """
610
+ if request_ids is None:
611
+ request_ids = [
612
+ request_task.request_id
613
+ for request_task in get_request_tasks(req_filter=RequestTaskFilter(
614
+ status=[RequestStatus.PENDING, RequestStatus.RUNNING],
615
+ # Avoid cancelling the cancel request itself.
616
+ exclude_request_names=['sky.api_cancel'],
617
+ user_id=user_id,
618
+ fields=['request_id']))
619
+ ]
620
+ cancelled_request_ids = []
621
+ for request_id in request_ids:
622
+ with update_request(request_id) as request_record:
623
+ if not _should_kill_request(request_id, request_record):
624
+ continue
625
+ if request_record.pid is not None:
626
+ logger.debug(f'Killing request process {request_record.pid}')
627
+ # Use SIGTERM instead of SIGKILL:
628
+ # - The executor can handle SIGTERM gracefully
629
+ # - After SIGTERM, the executor can reuse the request process
630
+ # for other requests, avoiding the overhead of forking a new
631
+ # process for each request.
632
+ os.kill(request_record.pid, signal.SIGTERM)
633
+ request_record.status = RequestStatus.CANCELLED
634
+ request_record.finished_at = time.time()
635
+ cancelled_request_ids.append(request_id)
636
+ return cancelled_request_ids
473
637
 
474
638
 
475
- def _get_request_no_lock(request_id: str) -> Optional[Request]:
639
+ @init_db_async
640
+ @asyncio_utils.shield
641
+ async def kill_request_async(request_id: str) -> bool:
642
+ """Kill a SkyPilot API request and set its status to cancelled.
643
+
644
+ Returns:
645
+ True if the request was killed, False otherwise.
646
+ """
647
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
648
+ request = await _get_request_no_lock_async(request_id)
649
+ if not _should_kill_request(request_id, request):
650
+ return False
651
+ assert request is not None
652
+ if request.pid is not None:
653
+ logger.debug(f'Killing request process {request.pid}')
654
+ # Use SIGTERM instead of SIGKILL:
655
+ # - The executor can handle SIGTERM gracefully
656
+ # - After SIGTERM, the executor can reuse the request process
657
+ # for other requests, avoiding the overhead of forking a new
658
+ # process for each request.
659
+ os.kill(request.pid, signal.SIGTERM)
660
+ request.status = RequestStatus.CANCELLED
661
+ request.finished_at = time.time()
662
+ await _add_or_update_request_no_lock_async(request)
663
+ return True
664
+
665
+
666
+ @contextlib.contextmanager
667
+ @init_db
668
+ @metrics_lib.time_me
669
+ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
670
+ """Get and update a SkyPilot API request."""
671
+ # Acquire the lock to avoid race conditions between multiple request
672
+ # operations, e.g. execute and cancel.
673
+ with filelock.FileLock(request_lock_path(request_id)):
674
+ request = _get_request_no_lock(request_id)
675
+ yield request
676
+ if request is not None:
677
+ _add_or_update_request_no_lock(request)
678
+
679
+
680
+ @init_db_async
681
+ @metrics_lib.time_me
682
+ @asyncio_utils.shield
683
+ async def update_status_async(request_id: str, status: RequestStatus) -> None:
684
+ """Update the status of a request."""
685
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
686
+ request = await _get_request_no_lock_async(request_id)
687
+ if request is not None:
688
+ request.status = status
689
+ await _add_or_update_request_no_lock_async(request)
690
+
691
+
692
+ @init_db_async
693
+ @metrics_lib.time_me
694
+ @asyncio_utils.shield
695
+ async def update_status_msg_async(request_id: str, status_msg: str) -> None:
696
+ """Update the status message of a request."""
697
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
698
+ request = await _get_request_no_lock_async(request_id)
699
+ if request is not None:
700
+ request.status_msg = status_msg
701
+ await _add_or_update_request_no_lock_async(request)
702
+
703
+
704
+ def _get_request_no_lock(
705
+ request_id: str,
706
+ fields: Optional[List[str]] = None) -> Optional[Request]:
476
707
  """Get a SkyPilot API request."""
477
708
  assert _DB is not None
478
709
  columns_str = ', '.join(REQUEST_COLUMNS)
710
+ if fields:
711
+ columns_str = ', '.join(fields)
479
712
  with _DB.conn:
480
713
  cursor = _DB.conn.cursor()
481
- cursor.execute(
482
- f'SELECT {columns_str} FROM {REQUEST_TABLE} '
483
- 'WHERE request_id LIKE ?', (request_id + '%',))
714
+ cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
715
+ 'WHERE request_id LIKE ?'), (request_id + '%',))
484
716
  row = cursor.fetchone()
485
717
  if row is None:
486
718
  return None
719
+ if fields:
720
+ row = _update_request_row_fields(row, fields)
487
721
  return Request.from_row(row)
488
722
 
489
723
 
490
- @init_db
491
- def get_latest_request_id() -> Optional[str]:
724
+ async def _get_request_no_lock_async(
725
+ request_id: str,
726
+ fields: Optional[List[str]] = None) -> Optional[Request]:
727
+ """Async version of _get_request_no_lock."""
728
+ assert _DB is not None
729
+ columns_str = ', '.join(REQUEST_COLUMNS)
730
+ if fields:
731
+ columns_str = ', '.join(fields)
732
+ async with _DB.execute_fetchall_async(
733
+ (f'SELECT {columns_str} FROM {REQUEST_TABLE} '
734
+ 'WHERE request_id LIKE ?'), (request_id + '%',)) as rows:
735
+ row = rows[0] if rows else None
736
+ if row is None:
737
+ return None
738
+ if fields:
739
+ row = _update_request_row_fields(row, fields)
740
+ return Request.from_row(row)
741
+
742
+
743
+ @init_db_async
744
+ @metrics_lib.time_me
745
+ async def get_latest_request_id_async() -> Optional[str]:
492
746
  """Get the latest request ID."""
493
747
  assert _DB is not None
494
- with _DB.conn:
495
- cursor = _DB.conn.cursor()
496
- cursor.execute(f'SELECT request_id FROM {REQUEST_TABLE} '
497
- 'ORDER BY created_at DESC LIMIT 1')
498
- row = cursor.fetchone()
499
- return row[0] if row else None
748
+ async with _DB.execute_fetchall_async(
749
+ (f'SELECT request_id FROM {REQUEST_TABLE} '
750
+ 'ORDER BY created_at DESC LIMIT 1')) as rows:
751
+ return rows[0][0] if rows else None
500
752
 
501
753
 
502
754
  @init_db
503
- def get_request(request_id: str) -> Optional[Request]:
755
+ @metrics_lib.time_me
756
+ def get_request(request_id: str,
757
+ fields: Optional[List[str]] = None) -> Optional[Request]:
504
758
  """Get a SkyPilot API request."""
505
759
  with filelock.FileLock(request_lock_path(request_id)):
506
- return _get_request_no_lock(request_id)
760
+ return _get_request_no_lock(request_id, fields)
507
761
 
508
762
 
509
- @init_db
510
- def create_if_not_exists(request: Request) -> bool:
511
- """Create a SkyPilot API request if it does not exist."""
512
- with filelock.FileLock(request_lock_path(request.request_id)):
513
- if _get_request_no_lock(request.request_id) is not None:
514
- return False
515
- _add_or_update_request_no_lock(request)
516
- return True
763
+ @init_db_async
764
+ @metrics_lib.time_me_async
765
+ @asyncio_utils.shield
766
+ async def get_request_async(
767
+ request_id: str,
768
+ fields: Optional[List[str]] = None) -> Optional[Request]:
769
+ """Async version of get_request."""
770
+ # TODO(aylei): figure out how to remove FileLock here to avoid the overhead
771
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
772
+ return await _get_request_no_lock_async(request_id, fields)
517
773
 
518
774
 
519
775
  @init_db
520
- def get_request_tasks(
521
- status: Optional[List[RequestStatus]] = None,
522
- cluster_names: Optional[List[str]] = None,
523
- user_id: Optional[str] = None,
524
- exclude_request_names: Optional[List[str]] = None,
525
- include_request_names: Optional[List[str]] = None,
526
- ) -> List[Request]:
527
- """Get a list of requests that match the given filters.
776
+ @metrics_lib.time_me
777
+ def get_requests_with_prefix(
778
+ request_id_prefix: str,
779
+ fields: Optional[List[str]] = None) -> Optional[List[Request]]:
780
+ """Get requests with a given request ID prefix."""
781
+ assert _DB is not None
782
+ if fields:
783
+ columns_str = ', '.join(fields)
784
+ else:
785
+ columns_str = ', '.join(REQUEST_COLUMNS)
786
+ with _DB.conn:
787
+ cursor = _DB.conn.cursor()
788
+ cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
789
+ 'WHERE request_id LIKE ?'), (request_id_prefix + '%',))
790
+ rows = cursor.fetchall()
791
+ if not rows:
792
+ return None
793
+ if fields:
794
+ rows = [_update_request_row_fields(row, fields) for row in rows]
795
+ return [Request.from_row(row) for row in rows]
796
+
797
+
798
+ @init_db_async
799
+ @metrics_lib.time_me_async
800
+ @asyncio_utils.shield
801
+ async def get_requests_async_with_prefix(
802
+ request_id_prefix: str,
803
+ fields: Optional[List[str]] = None) -> Optional[List[Request]]:
804
+ """Async version of get_requests_with_prefix."""
805
+ assert _DB is not None
806
+ if fields:
807
+ columns_str = ', '.join(fields)
808
+ else:
809
+ columns_str = ', '.join(REQUEST_COLUMNS)
810
+ async with _DB.execute_fetchall_async(
811
+ (f'SELECT {columns_str} FROM {REQUEST_TABLE} '
812
+ 'WHERE request_id LIKE ?'), (request_id_prefix + '%',)) as rows:
813
+ if not rows:
814
+ return None
815
+ if fields:
816
+ rows = [_update_request_row_fields(row, fields) for row in rows]
817
+ return [Request.from_row(row) for row in rows]
818
+
819
+
820
+ class StatusWithMsg(NamedTuple):
821
+ status: RequestStatus
822
+ status_msg: Optional[str] = None
823
+
824
+
825
+ @init_db_async
826
+ @metrics_lib.time_me_async
827
+ async def get_request_status_async(
828
+ request_id: str,
829
+ include_msg: bool = False,
830
+ ) -> Optional[StatusWithMsg]:
831
+ """Get the status of a request.
832
+
833
+ Args:
834
+ request_id: The ID of the request.
835
+ include_msg: Whether to include the status message.
836
+
837
+ Returns:
838
+ The status of the request. If the request is not found, returns
839
+ None.
840
+ """
841
+ assert _DB is not None
842
+ columns = 'status'
843
+ if include_msg:
844
+ columns += ', status_msg'
845
+ sql = f'SELECT {columns} FROM {REQUEST_TABLE} WHERE request_id LIKE ?'
846
+ async with _DB.execute_fetchall_async(sql, (request_id + '%',)) as rows:
847
+ if rows is None or len(rows) == 0:
848
+ return None
849
+ status = RequestStatus(rows[0][0])
850
+ status_msg = rows[0][1] if include_msg else None
851
+ return StatusWithMsg(status, status_msg)
852
+
853
+
854
+ @init_db_async
855
+ @metrics_lib.time_me_async
856
+ @asyncio_utils.shield
857
+ async def create_if_not_exists_async(request: Request) -> bool:
858
+ """Create a request if it does not exist, otherwise do nothing.
859
+
860
+ Returns:
861
+ True if a new request is created, False if the request already exists.
862
+ """
863
+ assert _DB is not None
864
+ request_columns = ', '.join(REQUEST_COLUMNS)
865
+ values_str = ', '.join(['?'] * len(REQUEST_COLUMNS))
866
+ sql_statement = (
867
+ f'INSERT INTO {REQUEST_TABLE} '
868
+ f'({request_columns}) VALUES '
869
+ f'({values_str}) ON CONFLICT(request_id) DO NOTHING RETURNING ROWID')
870
+ request_row = request.to_row()
871
+ # Execute the SQL statement without getting the request lock.
872
+ # The request lock is used to prevent racing with cancellation codepath,
873
+ # but a request cannot be cancelled before it is created.
874
+ row = await _DB.execute_get_returning_value_async(sql_statement,
875
+ request_row)
876
+ return True if row else False
877
+
878
+
879
@dataclasses.dataclass
class RequestTaskFilter:
    """Filter for requests.

    Args:
        status: a list of statuses of the requests to filter on.
            If None, requests of any status are included.
        cluster_names: a list of cluster names to filter on.
            If None, requests of any cluster are included.
        user_id: the user ID to filter on.
            If None, all users are included.
        exclude_request_names: a list of request names to exclude.
            Mutually exclusive with include_request_names.
        include_request_names: a list of request names to filter on.
            Mutually exclusive with exclude_request_names.
        finished_before: if provided, only include requests finished before
            this timestamp.
        limit: the number of requests to show. If None, show all requests.
        fields: the columns to select. If None or empty, every column in
            REQUEST_COLUMNS is selected.
        sort: if True, order the results by created_at, newest first.

    Raises:
        ValueError: If both exclude_request_names and include_request_names are
            provided.
    """
    status: Optional[List[RequestStatus]] = None
    cluster_names: Optional[List[str]] = None
    user_id: Optional[str] = None
    exclude_request_names: Optional[List[str]] = None
    include_request_names: Optional[List[str]] = None
    finished_before: Optional[float] = None
    limit: Optional[int] = None
    fields: Optional[List[str]] = None
    sort: bool = False

    def __post_init__(self):
        if (self.exclude_request_names is not None and
                self.include_request_names is not None):
            raise ValueError(
                'Only one of exclude_request_names or include_request_names '
                'can be provided, not both.')

    @staticmethod
    def _membership_clause(column: str,
                           values: List[Any],
                           negate: bool = False) -> str:
        """Return '<column> [NOT] IN (?,...)' with one '?' per value."""
        placeholders = ','.join(['?'] * len(values))
        keyword = 'NOT IN' if negate else 'IN'
        return f'{column} {keyword} ({placeholders})'

    def build_query(self) -> Tuple[str, List[Any]]:
        """Build the SQL query and filter parameters.

        All filter values are bound as '?' parameters instead of being
        spliced into the SQL text, so values containing quotes or
        backslashes can neither break nor alter the statement.

        Returns:
            A tuple of (SQL, SQL parameters). The parameters are listed in
            the same order as their placeholders appear in the SQL.
        """
        filters: List[str] = []
        filter_params: List[Any] = []
        if self.status is not None:
            status_values = [status.value for status in self.status]
            filters.append(self._membership_clause('status', status_values))
            filter_params.extend(status_values)
        if self.include_request_names is not None:
            filters.append(
                self._membership_clause('name', self.include_request_names))
            filter_params.extend(self.include_request_names)
        if self.exclude_request_names is not None:
            filters.append(
                self._membership_clause('name',
                                        self.exclude_request_names,
                                        negate=True))
            filter_params.extend(self.exclude_request_names)
        if self.cluster_names is not None:
            filters.append(
                self._membership_clause(COL_CLUSTER_NAME, self.cluster_names))
            filter_params.extend(self.cluster_names)
        if self.user_id is not None:
            filters.append(f'{COL_USER_ID} = ?')
            filter_params.append(self.user_id)
        if self.finished_before is not None:
            filters.append('finished_at < ?')
            filter_params.append(self.finished_before)
        filter_str = ' AND '.join(filters)
        if filter_str:
            filter_str = f' WHERE {filter_str}'
        # Column names cannot be bound as parameters, so `fields` is placed
        # in the SQL text as-is and must only come from trusted code.
        columns_str = ', '.join(REQUEST_COLUMNS)
        if self.fields:
            columns_str = ', '.join(self.fields)
        sort_str = ''
        if self.sort:
            sort_str = ' ORDER BY created_at DESC'
        query_str = (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str}'
                     f'{sort_str}')
        if self.limit is not None:
            query_str += f' LIMIT {self.limit}'
        return query_str, filter_params
961
+
962
+
963
@init_db
@metrics_lib.time_me
def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
    """Return the requests selected by ``req_filter``.

    Args:
        req_filter: the filter to apply to the requests. Refer to
            RequestTaskFilter for the details.

    Returns:
        One Request per matching row; an empty list when nothing matches.
    """
    assert _DB is not None
    with _DB.conn:
        cursor = _DB.conn.cursor()
        query, params = req_filter.build_query()
        cursor.execute(query, params)
        rows = cursor.fetchall()
        if rows is None:
            return []
    selected_fields = req_filter.fields
    if selected_fields:
        # Only a subset of columns was selected, so each partial row is passed
        # through _update_request_row_fields before Request.from_row.
        rows = [
            _update_request_row_fields(partial, selected_fields)
            for partial in rows
        ]
    return [Request.from_row(full_row) for full_row in rows]
984
+
985
+
986
@init_db_async
@metrics_lib.time_me_async
async def get_request_tasks_async(
        req_filter: RequestTaskFilter) -> List[Request]:
    """Async version of get_request_tasks.

    Args:
        req_filter: the filter to apply to the requests. Refer to
            RequestTaskFilter for the details.

    Returns:
        One Request per matching row; an empty list when nothing matches.
    """
    assert _DB is not None
    async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
        if not rows:
            return []
    selected_fields = req_filter.fields
    if selected_fields:
        # Only a subset of columns was selected, so each partial row is passed
        # through _update_request_row_fields before Request.from_row.
        rows = [
            _update_request_row_fields(partial, selected_fields)
            for partial in rows
        ]
    return [Request.from_row(full_row) for full_row in rows]
1000
+
1001
+
1002
@init_db_async
@metrics_lib.time_me_async
async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
    """Get a list of API request ids for shell completion.

    Args:
        incomplete: the prefix typed so far; request ids starting with it
            are returned.

    Returns:
        At most 1000 matching request ids.
    """
    assert _DB is not None
    # Alive requests (PENDING, RUNNING) are listed before finished ones;
    # within each group the newest requests come first.
    completion_sql = f"""SELECT request_id FROM {REQUEST_TABLE}
            WHERE request_id LIKE ?
            ORDER BY
                CASE
                    WHEN status IN ('PENDING', 'RUNNING') THEN 0
                    ELSE 1
                END,
                created_at DESC
            LIMIT 1000"""
    prefix_pattern = (f'{incomplete}%',)
    async with _DB.execute_fetchall_async(completion_sql,
                                          prefix_pattern) as rows:
        if not rows:
            return []
        return [matched[0] for matched in rows]
1022
+
1023
+
1024
# Upsert statement shared by the sync and async "add or update" helpers.
# It is built once at import time, with one '?' placeholder per entry in
# REQUEST_COLUMNS.
_add_or_update_request_sql = (f'INSERT OR REPLACE INTO {REQUEST_TABLE} '
                              f'({", ".join(REQUEST_COLUMNS)}) VALUES '
                              f'({", ".join(["?"] * len(REQUEST_COLUMNS))})')
585
1027
 
586
1028
 
587
1029
def _add_or_update_request_no_lock(request: Request):
    """Insert or replace the request's row, without taking the request lock.

    Args:
        request: the request whose ``to_row()`` values are written.
    """
    assert _DB is not None
    # `with _DB.conn` scopes the write to the connection's context manager.
    with _DB.conn:
        _DB.conn.cursor().execute(_add_or_update_request_sql,
                                  request.to_row())
1035
+
1036
+
1037
async def _add_or_update_request_no_lock_async(request: Request):
    """Async version of _add_or_update_request_no_lock.

    Args:
        request: the request whose ``to_row()`` values are written.
    """
    assert _DB is not None
    db = _DB
    await db.execute_and_commit_async(_add_or_update_request_sql,
                                      request.to_row())
598
1042
 
599
1043
 
600
1044
  def set_request_failed(request_id: str, e: BaseException) -> None:
@@ -605,4 +1049,153 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
605
1049
  with update_request(request_id) as request_task:
606
1050
  assert request_task is not None, request_id
607
1051
  request_task.status = RequestStatus.FAILED
1052
+ request_task.finished_at = time.time()
1053
+ request_task.set_error(e)
1054
+
1055
+
1056
+ @init_db_async
1057
+ @metrics_lib.time_me_async
1058
+ @asyncio_utils.shield
1059
+ async def set_request_failed_async(request_id: str, e: BaseException) -> None:
1060
+ """Set a request to failed and populate the error message."""
1061
+ with ux_utils.enable_traceback():
1062
+ stacktrace = traceback.format_exc()
1063
+ setattr(e, 'stacktrace', stacktrace)
1064
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
1065
+ request_task = await _get_request_no_lock_async(request_id)
1066
+ assert request_task is not None, request_id
1067
+ request_task.status = RequestStatus.FAILED
1068
+ request_task.finished_at = time.time()
608
1069
  request_task.set_error(e)
1070
+ await _add_or_update_request_no_lock_async(request_task)
1071
+
1072
+
1073
def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
    """Mark a request as SUCCEEDED and store its result.

    Args:
        request_id: the ID of the request to update.
        result: the return value to record; nothing is recorded when None.
    """
    with update_request(request_id) as task:
        assert task is not None, request_id
        task.status = RequestStatus.SUCCEEDED
        task.finished_at = time.time()
        has_result = result is not None
        if has_result:
            task.set_return_value(result)
1081
+
1082
+
1083
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_succeeded_async(request_id: str,
                                      result: Optional[Any]) -> None:
    """Mark a request as SUCCEEDED and store its result.

    Args:
        request_id: the ID of the request to update.
        result: the return value to record; nothing is recorded when None.
    """
    # The read-modify-write below runs under the per-request file lock.
    request_lock = filelock.AsyncFileLock(request_lock_path(request_id))
    async with request_lock:
        task = await _get_request_no_lock_async(request_id)
        assert task is not None, request_id
        task.status = RequestStatus.SUCCEEDED
        task.finished_at = time.time()
        has_result = result is not None
        if has_result:
            task.set_return_value(result)
        await _add_or_update_request_no_lock_async(task)
1097
+
1098
+
1099
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_cancelled_async(request_id: str) -> None:
    """Mark a pending or running request as CANCELLED.

    Requests whose status compares greater than RUNNING are left untouched.

    Args:
        request_id: the ID of the request to cancel.
    """
    # The read-modify-write below runs under the per-request file lock.
    request_lock = filelock.AsyncFileLock(request_lock_path(request_id))
    async with request_lock:
        task = await _get_request_no_lock_async(request_id)
        assert task is not None, request_id
        if task.status > RequestStatus.RUNNING:
            # Already finished or cancelled.
            return
        task.finished_at = time.time()
        task.status = RequestStatus.CANCELLED
        await _add_or_update_request_no_lock_async(task)
1113
+
1114
+
1115
@init_db_async
@metrics_lib.time_me_async
async def _delete_requests(request_ids: List[str]):
    """Clean up requests by their IDs.

    This is a coroutine, so it uses the async decorators like the other
    async functions in this module. With the synchronous ones the timer
    would only measure creation of the coroutine object, not the DELETE.

    Args:
        request_ids: IDs of the rows to delete from the request table.
            Nothing is executed when the list is empty.
    """
    if not request_ids:
        return
    # IDs are inlined as quoted literals rather than bound parameters.
    # NOTE(review): the only visible caller passes IDs read back from the
    # request table in batches of up to 1000; confirm before passing
    # externally supplied IDs here.
    id_list_str = ','.join(repr(request_id) for request_id in request_ids)
    assert _DB is not None
    await _DB.execute_and_commit_async(
        f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
1123
+
1124
+
1125
async def clean_finished_requests_with_retention(retention_seconds: int,
                                                 batch_size: int = 1000):
    """Clean up finished requests older than the retention period.

    Requests in a finished status (as given by
    ``RequestStatus.finished_status()``) are removed from the database
    together with their log files.

    Args:
        retention_seconds: Requests that finished more than this many seconds
            ago will be deleted.
        batch_size: delete at most 'batch_size' requests per round, to avoid
            using too much memory at once and to let each db query complete
            in a reasonable time. All stale requests older than the
            retention period will be deleted regardless of the batch size.
    """
    deleted_count = 0
    while True:
        stale_requests = await get_request_tasks_async(
            req_filter=RequestTaskFilter(
                status=RequestStatus.finished_status(),
                finished_before=time.time() - retention_seconds,
                limit=batch_size,
                fields=['request_id']))
        if not stale_requests:
            break

        # To avoid leakage of the log file, logs must be deleted before the
        # request task in the database.
        # req.log_path is derived from request_id, so it's ok to just grab
        # the request_id in the above query.
        unlink_tasks = [
            asyncio.create_task(
                anyio.Path(stale.log_path.absolute()).unlink(missing_ok=True))
            for stale in stale_requests
        ]
        await asyncio.gather(*unlink_tasks)

        await _delete_requests([stale.request_id for stale in stale_requests])
        deleted_count += len(stale_requests)
        # A short batch means there is nothing left to fetch.
        if len(stale_requests) < batch_size:
            break

    logger.info(f'Cleaned up {deleted_count} finished requests '
                f'older than {retention_seconds} seconds')
1170
+
1171
+
1172
async def requests_gc_daemon():
    """Garbage collect finished requests periodically.

    Each round reloads the config, reads
    ``api_server.requests_retention_hours`` (falling back to
    DEFAULT_REQUESTS_RETENTION_HOURS), runs the cleanup, then sleeps. The
    loop ends when the cleanup is cancelled.
    """
    while True:
        logger.info('Running requests GC daemon...')
        # Use the latest config.
        skypilot_config.reload_config()
        retention_seconds = skypilot_config.get_nested(
            ('api_server', 'requests_retention_hours'),
            DEFAULT_REQUESTS_RETENTION_HOURS) * 3600
        try:
            # Negative value disables the requests GC
            if retention_seconds >= 0:
                await clean_finished_requests_with_retention(retention_seconds)
        except asyncio.CancelledError:
            logger.info('Requests GC daemon cancelled')
            break
        except Exception as e:  # pylint: disable=broad-except
            # Separate the exception text from the traceback; previously the
            # two f-strings were joined with nothing in between.
            logger.error(f'Error running requests GC daemon: {e}, '
                         f'traceback: {traceback.format_exc()}')
        # Run the daemon at most once every hour to avoid too frequent
        # cleanup.
        await asyncio.sleep(max(retention_seconds, 3600))
1194
+
1195
+
1196
def _cleanup():
    """Close the module-level database handle, if one was created."""
    if _DB is None:
        return
    # _DB.close() is a coroutine, so it is driven to completion with
    # asyncio.run().
    asyncio.run(_DB.close())
1199
+
1200
+
1201
+ atexit.register(_cleanup)