skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -5,7 +5,6 @@ import contextlib
5
5
  import dataclasses
6
6
  import enum
7
7
  import functools
8
- import json
9
8
  import os
10
9
  import pathlib
11
10
  import shutil
@@ -14,24 +13,27 @@ import sqlite3
14
13
  import threading
15
14
  import time
16
15
  import traceback
17
- from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
18
- NamedTuple, Optional, Tuple)
16
+ from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
17
+ Tuple)
18
+ import uuid
19
19
 
20
20
  import anyio
21
21
  import colorama
22
22
  import filelock
23
+ import orjson
23
24
 
24
25
  from sky import exceptions
25
26
  from sky import global_user_state
26
27
  from sky import sky_logging
27
28
  from sky import skypilot_config
29
+ from sky.metrics import utils as metrics_lib
28
30
  from sky.server import common as server_common
29
31
  from sky.server import constants as server_constants
30
32
  from sky.server import daemons
31
- from sky.server import metrics as metrics_lib
32
33
  from sky.server.requests import payloads
33
34
  from sky.server.requests.serializers import decoders
34
35
  from sky.server.requests.serializers import encoders
36
+ from sky.utils import asyncio_utils
35
37
  from sky.utils import common_utils
36
38
  from sky.utils import ux_utils
37
39
  from sky.utils.db import db_utils
@@ -211,8 +213,8 @@ class Request:
211
213
  entrypoint=self.entrypoint.__name__,
212
214
  request_body=self.request_body.model_dump_json(),
213
215
  status=self.status.value,
214
- return_value=json.dumps(None),
215
- error=json.dumps(None),
216
+ return_value=orjson.dumps(None).decode('utf-8'),
217
+ error=orjson.dumps(None).decode('utf-8'),
216
218
  pid=None,
217
219
  created_at=self.created_at,
218
220
  schedule_type=self.schedule_type.value,
@@ -235,8 +237,8 @@ class Request:
235
237
  entrypoint=encoders.pickle_and_encode(self.entrypoint),
236
238
  request_body=encoders.pickle_and_encode(self.request_body),
237
239
  status=self.status.value,
238
- return_value=json.dumps(self.return_value),
239
- error=json.dumps(self.error),
240
+ return_value=orjson.dumps(self.return_value).decode('utf-8'),
241
+ error=orjson.dumps(self.error).decode('utf-8'),
240
242
  pid=self.pid,
241
243
  created_at=self.created_at,
242
244
  schedule_type=self.schedule_type.value,
@@ -268,8 +270,8 @@ class Request:
268
270
  entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
269
271
  request_body=decoders.decode_and_unpickle(payload.request_body),
270
272
  status=RequestStatus(payload.status),
271
- return_value=json.loads(payload.return_value),
272
- error=json.loads(payload.error),
273
+ return_value=orjson.loads(payload.return_value),
274
+ error=orjson.loads(payload.error),
273
275
  pid=payload.pid,
274
276
  created_at=payload.created_at,
275
277
  schedule_type=ScheduleType(payload.schedule_type),
@@ -292,72 +294,104 @@ class Request:
292
294
  raise
293
295
 
294
296
 
295
- def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
296
- """Kill all pending and running requests for a cluster.
297
+ def get_new_request_id() -> str:
298
+ """Get a new request ID."""
299
+ return str(uuid.uuid4())
297
300
 
298
- Args:
299
- cluster_name: the name of the cluster.
300
- exclude_request_names: exclude requests with these names. This is to
301
- prevent killing the caller request.
302
- """
303
- request_ids = [
304
- request_task.request_id
305
- for request_task in get_request_tasks(req_filter=RequestTaskFilter(
306
- cluster_names=[cluster_name],
307
- status=[RequestStatus.PENDING, RequestStatus.RUNNING],
308
- exclude_request_names=[exclude_request_name]))
309
- ]
310
- kill_requests(request_ids)
311
301
 
302
+ def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
303
+ """Serialize the SkyPilot API request for display purposes.
312
304
 
313
- def kill_requests(request_ids: Optional[List[str]] = None,
314
- user_id: Optional[str] = None) -> List[str]:
315
- """Kill a SkyPilot API request and set its status to cancelled.
305
+ This function should be called on the server side to serialize the
306
+ request body into human readable format, e.g., the entrypoint should
307
+ be a string, and the pid, error, or return value are not needed.
316
308
 
317
- Args:
318
- request_ids: The request IDs to kill. If None, all requests for the
319
- user are killed.
320
- user_id: The user ID to kill requests for. If None, all users are
321
- killed.
309
+ The returned value will then be displayed on the client side in request
310
+ table.
322
311
 
323
- Returns:
324
- A list of request IDs that were cancelled.
325
- """
326
- if request_ids is None:
327
- request_ids = [
328
- request_task.request_id
329
- for request_task in get_request_tasks(req_filter=RequestTaskFilter(
330
- user_id=user_id,
331
- status=[RequestStatus.RUNNING, RequestStatus.PENDING],
332
- # Avoid cancelling the cancel request itself.
333
- exclude_request_names=['sky.api_cancel']))
334
- ]
335
- cancelled_request_ids = []
336
- for request_id in request_ids:
337
- with update_request(request_id) as request_record:
338
- if request_record is None:
339
- logger.debug(f'No request ID {request_id}')
340
- continue
341
- # Skip internal requests. The internal requests are scheduled with
342
- # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
343
- if request_record.request_id in set(
344
- event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
345
- continue
346
- if request_record.status > RequestStatus.RUNNING:
347
- logger.debug(f'Request {request_id} already finished')
348
- continue
349
- if request_record.pid is not None:
350
- logger.debug(f'Killing request process {request_record.pid}')
351
- # Use SIGTERM instead of SIGKILL:
352
- # - The executor can handle SIGTERM gracefully
353
- # - After SIGTERM, the executor can reuse the request process
354
- # for other requests, avoiding the overhead of forking a new
355
- # process for each request.
356
- os.kill(request_record.pid, signal.SIGTERM)
357
- request_record.status = RequestStatus.CANCELLED
358
- request_record.finished_at = time.time()
359
- cancelled_request_ids.append(request_id)
360
- return cancelled_request_ids
312
+ We do not use `encode` for display to avoid a large amount of data being
313
+ sent to the client side, especially for the request table could include
314
+ all the requests.
315
+ """
316
+ encoded_requests = []
317
+ all_users = global_user_state.get_all_users()
318
+ all_users_map = {user.id: user.name for user in all_users}
319
+ for request in requests:
320
+ if request.request_body is not None:
321
+ assert isinstance(request.request_body,
322
+ payloads.RequestBody), (request.name,
323
+ request.request_body)
324
+ user_name = all_users_map.get(request.user_id)
325
+ payload = payloads.RequestPayload(
326
+ request_id=request.request_id,
327
+ name=request.name,
328
+ entrypoint=request.entrypoint.__name__
329
+ if request.entrypoint is not None else '',
330
+ request_body=request.request_body.model_dump_json()
331
+ if request.request_body is not None else
332
+ orjson.dumps(None).decode('utf-8'),
333
+ status=request.status.value,
334
+ return_value=orjson.dumps(None).decode('utf-8'),
335
+ error=orjson.dumps(None).decode('utf-8'),
336
+ pid=None,
337
+ created_at=request.created_at,
338
+ schedule_type=request.schedule_type.value,
339
+ user_id=request.user_id,
340
+ user_name=user_name,
341
+ cluster_name=request.cluster_name,
342
+ status_msg=request.status_msg,
343
+ should_retry=request.should_retry,
344
+ finished_at=request.finished_at,
345
+ )
346
+ encoded_requests.append(payload)
347
+ return encoded_requests
348
+
349
+
350
+ def _update_request_row_fields(
351
+ row: Tuple[Any, ...],
352
+ fields: Optional[List[str]] = None) -> Tuple[Any, ...]:
353
+ """Update the request row fields."""
354
+ if not fields:
355
+ return row
356
+
357
+ # Convert tuple to dictionary for easier manipulation
358
+ content = dict(zip(fields, row))
359
+
360
+ # Required fields in RequestPayload
361
+ if 'request_id' not in fields:
362
+ content['request_id'] = ''
363
+ if 'name' not in fields:
364
+ content['name'] = ''
365
+ if 'entrypoint' not in fields:
366
+ content['entrypoint'] = server_constants.EMPTY_PICKLED_VALUE
367
+ if 'request_body' not in fields:
368
+ content['request_body'] = server_constants.EMPTY_PICKLED_VALUE
369
+ if 'status' not in fields:
370
+ content['status'] = RequestStatus.PENDING.value
371
+ if 'created_at' not in fields:
372
+ content['created_at'] = 0
373
+ if 'user_id' not in fields:
374
+ content['user_id'] = ''
375
+ if 'return_value' not in fields:
376
+ content['return_value'] = orjson.dumps(None).decode('utf-8')
377
+ if 'error' not in fields:
378
+ content['error'] = orjson.dumps(None).decode('utf-8')
379
+ if 'schedule_type' not in fields:
380
+ content['schedule_type'] = ScheduleType.SHORT.value
381
+ # Optional fields in RequestPayload
382
+ if 'pid' not in fields:
383
+ content['pid'] = None
384
+ if 'cluster_name' not in fields:
385
+ content['cluster_name'] = None
386
+ if 'status_msg' not in fields:
387
+ content['status_msg'] = None
388
+ if 'should_retry' not in fields:
389
+ content['should_retry'] = False
390
+ if 'finished_at' not in fields:
391
+ content['finished_at'] = None
392
+
393
+ # Convert back to tuple in the same order as REQUEST_COLUMNS
394
+ return tuple(content[col] for col in REQUEST_COLUMNS)
361
395
 
362
396
 
363
397
  def create_table(cursor, conn):
@@ -402,6 +436,21 @@ def create_table(cursor, conn):
402
436
  db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
403
437
  'REAL')
404
438
 
439
+ # Add an index on (status, name) to speed up queries
440
+ # that filter on these columns.
441
+ cursor.execute(f"""\
442
+ CREATE INDEX IF NOT EXISTS status_name_idx ON {REQUEST_TABLE} (status, name) WHERE status IN ('PENDING', 'RUNNING');
443
+ """)
444
+ # Add an index on cluster_name to speed up queries
445
+ # that filter on this column.
446
+ cursor.execute(f"""\
447
+ CREATE INDEX IF NOT EXISTS cluster_name_idx ON {REQUEST_TABLE} ({COL_CLUSTER_NAME}) WHERE status IN ('PENDING', 'RUNNING');
448
+ """)
449
+ # Add an index on created_at to speed up queries that sort on this column.
450
+ cursor.execute(f"""\
451
+ CREATE INDEX IF NOT EXISTS created_at_idx ON {REQUEST_TABLE} (created_at);
452
+ """)
453
+
405
454
 
406
455
  _DB = None
407
456
  _init_db_lock = threading.Lock()
@@ -449,11 +498,37 @@ def init_db_async(func):
449
498
 
450
499
  def reset_db_and_logs():
451
500
  """Reset the local API server database, request logs and client directory."""
501
+ logger.debug('clearing local API server database')
452
502
  server_common.clear_local_api_server_database()
503
+ logger.debug(
504
+ f'clearing local API server logs directory at {REQUEST_LOG_PATH_PREFIX}'
505
+ )
453
506
  shutil.rmtree(pathlib.Path(REQUEST_LOG_PATH_PREFIX).expanduser(),
454
507
  ignore_errors=True)
508
+ logger.debug('clearing local API server client directory at '
509
+ f'{server_common.API_SERVER_CLIENT_DIR.expanduser()}')
455
510
  shutil.rmtree(server_common.API_SERVER_CLIENT_DIR.expanduser(),
456
511
  ignore_errors=True)
512
+ with _init_db_lock:
513
+ _init_db_within_lock()
514
+ assert _DB is not None
515
+ with _DB.conn:
516
+ cursor = _DB.conn.cursor()
517
+ cursor.execute('SELECT sqlite_version()')
518
+ row = cursor.fetchone()
519
+ if row is None:
520
+ raise RuntimeError('Failed to get SQLite version')
521
+ version_str = row[0]
522
+ version_parts = version_str.split('.')
523
+ assert len(version_parts) >= 2, \
524
+ f'Invalid version string: {version_str}'
525
+ major, minor = int(version_parts[0]), int(version_parts[1])
526
+ # SQLite 3.35.0+ supports RETURNING statements.
527
+ # 3.35.0 was released in March 2021.
528
+ if not ((major > 3) or (major == 3 and minor >= 35)):
529
+ raise RuntimeError(
530
+ f'SQLite version {version_str} is not supported. '
531
+ 'Please upgrade to SQLite 3.35.0 or later.')
457
532
 
458
533
 
459
534
  def request_lock_path(request_id: str) -> str:
@@ -462,93 +537,285 @@ def request_lock_path(request_id: str) -> str:
462
537
  return os.path.join(lock_path, f'.{request_id}.lock')
463
538
 
464
539
 
540
def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
    """Kill all pending and running requests that target a cluster.

    Args:
        cluster_name: the name of the cluster.
        exclude_request_name: requests with this name are left untouched.
            This prevents the caller from killing its own request.
    """
    active_filter = RequestTaskFilter(
        status=[RequestStatus.PENDING, RequestStatus.RUNNING],
        exclude_request_names=[exclude_request_name],
        cluster_names=[cluster_name],
        # Only the request IDs are needed to kill the requests.
        fields=['request_id'])
    matching_tasks = get_request_tasks(req_filter=active_filter)
    _kill_requests([task.request_id for task in matching_tasks])
557
+
558
+
559
def kill_requests(request_ids: Optional[List[str]] = None,
                  user_id: Optional[str] = None) -> List[str]:
    """Kill requests identified by request ID prefixes.

    Each entry of `request_ids` is treated as a prefix and resolved to a
    full request ID before the kill is issued.

    Args:
        request_ids: request ID prefixes to kill. Prefixes that match no
            request are skipped. If None, the selection is delegated to
            `_kill_requests` (filtered by `user_id`).
        user_id: forwarded unchanged to `_kill_requests`.

    Returns:
        The list of request IDs returned by `_kill_requests`.

    Raises:
        ValueError: if a prefix matches more than one request.
    """
    if request_ids is None:
        return _kill_requests(request_ids=None, user_id=user_id)

    resolved_ids: List[str] = []
    for request_id in request_ids:
        matches = get_requests_with_prefix(request_id, fields=['request_id'])
        if not matches:
            # Unknown prefix: nothing to kill for this entry.
            continue
        if len(matches) > 1:
            raise ValueError(f'Multiple requests found for '
                             f'request ID prefix: {request_id}')
        resolved_ids.append(matches[0].request_id)
    return _kill_requests(request_ids=resolved_ids, user_id=user_id)


# needed for backward compatibility. Remove by v0.10.7 or v0.12.0
# and rename kill_requests to kill_requests_with_prefix.
kill_requests_with_prefix = kill_requests
580
+
581
+
582
def _should_kill_request(request_id: str,
                         request_record: Optional[Request]) -> bool:
    """Decide whether a request is eligible to be killed.

    Args:
        request_id: the ID used to look up the request (only used for
            logging).
        request_record: the request loaded from the database, or None if
            no such request exists.

    Returns:
        False if the request does not exist, is an internal daemon request,
        or has a status past RUNNING; True otherwise.
    """
    if request_record is None:
        logger.debug(f'No request ID {request_id}')
        return False
    # Skip internal requests, i.e. those whose request_id is the id of one
    # of the entries in daemons.INTERNAL_REQUEST_DAEMONS.
    internal_request_ids = {
        event.id for event in daemons.INTERNAL_REQUEST_DAEMONS
    }
    if request_record.request_id in internal_request_ids:
        return False
    if request_record.status > RequestStatus.RUNNING:
        logger.debug(f'Request {request_id} already finished')
        return False
    return True
596
+
597
+
598
def _kill_requests(request_ids: Optional[List[str]] = None,
                   user_id: Optional[str] = None) -> List[str]:
    """Kill SkyPilot API requests and set their status to cancelled.

    Args:
        request_ids: The request IDs to kill. If None, all pending and
            running requests matching `user_id` are killed, except the
            'sky.api_cancel' requests.
        user_id: The user ID to kill requests for. Only used when
            `request_ids` is None; if it is also None, requests of all
            users are selected.

    Returns:
        A list of request IDs that were cancelled.
    """
    if request_ids is None:
        request_ids = [
            request_task.request_id
            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
                # Avoid cancelling the cancel request itself.
                exclude_request_names=['sky.api_cancel'],
                user_id=user_id,
                fields=['request_id']))
        ]
    cancelled_request_ids = []
    for request_id in request_ids:
        with update_request(request_id) as request_record:
            if not _should_kill_request(request_id, request_record):
                continue
            if request_record.pid is not None:
                logger.debug(f'Killing request process {request_record.pid}')
                # Use SIGTERM instead of SIGKILL:
                # - The executor can handle SIGTERM gracefully
                # - After SIGTERM, the executor can reuse the request process
                #   for other requests, avoiding the overhead of forking a new
                #   process for each request.
                try:
                    os.kill(request_record.pid, signal.SIGTERM)
                except ProcessLookupError:
                    # The process is already gone. Still mark the request
                    # as cancelled; otherwise the exception would escape
                    # update_request(), skip persisting this record and
                    # abort the remaining requests in the batch.
                    logger.debug(f'Request process {request_record.pid} '
                                 'no longer exists')
            request_record.status = RequestStatus.CANCELLED
            request_record.finished_at = time.time()
            cancelled_request_ids.append(request_id)
    return cancelled_request_ids
638
+
639
+
640
@init_db_async
@asyncio_utils.shield
async def kill_request_async(request_id: str) -> bool:
    """Kill a SkyPilot API request and set its status to cancelled.

    The request's file lock is held while the record is read, the process
    is signalled and the updated record is written back.

    Args:
        request_id: the ID of the request to kill.

    Returns:
        True if the request was killed, False otherwise.
    """
    async with filelock.AsyncFileLock(request_lock_path(request_id)):
        request = await _get_request_no_lock_async(request_id)
        if not _should_kill_request(request_id, request):
            return False
        assert request is not None
        if request.pid is not None:
            logger.debug(f'Killing request process {request.pid}')
            # Use SIGTERM instead of SIGKILL:
            # - The executor can handle SIGTERM gracefully
            # - After SIGTERM, the executor can reuse the request process
            #   for other requests, avoiding the overhead of forking a new
            #   process for each request.
            try:
                os.kill(request.pid, signal.SIGTERM)
            except ProcessLookupError:
                # The process already exited. Fall through so the request
                # is still recorded as cancelled instead of being left in
                # a non-terminal state.
                logger.debug(f'Request process {request.pid} '
                             'no longer exists')
        request.status = RequestStatus.CANCELLED
        request.finished_at = time.time()
        await _add_or_update_request_no_lock_async(request)
        return True
665
+
666
+
465
667
  @contextlib.contextmanager
466
668
  @init_db
467
669
  @metrics_lib.time_me
468
670
  def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
469
671
  """Get and update a SkyPilot API request."""
470
- request = _get_request_no_lock(request_id)
471
- yield request
472
- if request is not None:
473
- _add_or_update_request_no_lock(request)
672
+ # Acquire the lock to avoid race conditions between multiple request
673
+ # operations, e.g. execute and cancel.
674
+ with filelock.FileLock(request_lock_path(request_id)):
675
+ request = _get_request_no_lock(request_id)
676
+ yield request
677
+ if request is not None:
678
+ _add_or_update_request_no_lock(request)
474
679
 
475
680
 
476
- @init_db
681
+ @init_db_async
477
682
  @metrics_lib.time_me
478
- def update_request_async(
479
- request_id: str) -> AsyncContextManager[Optional[Request]]:
480
- """Async version of update_request.
481
-
482
- Returns an async context manager that yields the request record and
483
- persists any in-place updates upon exit.
484
- """
485
-
486
- @contextlib.asynccontextmanager
487
- async def _cm():
683
+ @asyncio_utils.shield
684
+ async def update_status_async(request_id: str, status: RequestStatus) -> None:
685
+ """Update the status of a request"""
686
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
488
687
  request = await _get_request_no_lock_async(request_id)
489
- try:
490
- yield request
491
- finally:
492
- if request is not None:
493
- await _add_or_update_request_no_lock_async(request)
494
-
495
- return _cm()
688
+ if request is not None:
689
+ request.status = status
690
+ await _add_or_update_request_no_lock_async(request)
496
691
 
497
692
 
498
- _get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
499
- 'WHERE request_id LIKE ?')
693
@init_db_async
# Use the async timing decorator, as the other coroutines in this module
# do (e.g. get_request_async, set_request_failed_async).
@metrics_lib.time_me_async
@asyncio_utils.shield
async def update_status_msg_async(request_id: str, status_msg: str) -> None:
    """Update the status message of a request.

    The request's file lock is held for the read-modify-write. Nothing is
    written if the request does not exist.

    Args:
        request_id: the ID of the request to update.
        status_msg: the new status message.
    """
    async with filelock.AsyncFileLock(request_lock_path(request_id)):
        request = await _get_request_no_lock_async(request_id)
        if request is None:
            return
        request.status_msg = status_msg
        await _add_or_update_request_no_lock_async(request)
500
703
 
501
704
 
502
- def _get_request_no_lock(request_id: str) -> Optional[Request]:
705
+ def _get_request_no_lock(
706
+ request_id: str,
707
+ fields: Optional[List[str]] = None) -> Optional[Request]:
503
708
  """Get a SkyPilot API request."""
504
709
  assert _DB is not None
710
+ columns_str = ', '.join(REQUEST_COLUMNS)
711
+ if fields:
712
+ columns_str = ', '.join(fields)
505
713
  with _DB.conn:
506
714
  cursor = _DB.conn.cursor()
507
- cursor.execute(_get_request_sql, (request_id + '%',))
715
+ cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
716
+ 'WHERE request_id LIKE ?'), (request_id + '%',))
508
717
  row = cursor.fetchone()
509
718
  if row is None:
510
719
  return None
720
+ if fields:
721
+ row = _update_request_row_fields(row, fields)
511
722
  return Request.from_row(row)
512
723
 
513
724
 
514
- async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
725
+ async def _get_request_no_lock_async(
726
+ request_id: str,
727
+ fields: Optional[List[str]] = None) -> Optional[Request]:
515
728
  """Async version of _get_request_no_lock."""
516
729
  assert _DB is not None
517
- async with _DB.execute_fetchall_async(_get_request_sql,
518
- (request_id + '%',)) as rows:
730
+ columns_str = ', '.join(REQUEST_COLUMNS)
731
+ if fields:
732
+ columns_str = ', '.join(fields)
733
+ async with _DB.execute_fetchall_async(
734
+ (f'SELECT {columns_str} FROM {REQUEST_TABLE} '
735
+ 'WHERE request_id LIKE ?'), (request_id + '%',)) as rows:
519
736
  row = rows[0] if rows else None
520
737
  if row is None:
521
738
  return None
739
+ if fields:
740
+ row = _update_request_row_fields(row, fields)
522
741
  return Request.from_row(row)
523
742
 
524
743
 
525
- @init_db
744
+ @init_db_async
526
745
  @metrics_lib.time_me
527
- def get_latest_request_id() -> Optional[str]:
746
+ async def get_latest_request_id_async() -> Optional[str]:
528
747
  """Get the latest request ID."""
529
748
  assert _DB is not None
530
- with _DB.conn:
531
- cursor = _DB.conn.cursor()
532
- cursor.execute(f'SELECT request_id FROM {REQUEST_TABLE} '
533
- 'ORDER BY created_at DESC LIMIT 1')
534
- row = cursor.fetchone()
535
- return row[0] if row else None
749
+ async with _DB.execute_fetchall_async(
750
+ (f'SELECT request_id FROM {REQUEST_TABLE} '
751
+ 'ORDER BY created_at DESC LIMIT 1')) as rows:
752
+ return rows[0][0] if rows else None
536
753
 
537
754
 
538
755
  @init_db
539
756
  @metrics_lib.time_me
540
- def get_request(request_id: str) -> Optional[Request]:
757
+ def get_request(request_id: str,
758
+ fields: Optional[List[str]] = None) -> Optional[Request]:
541
759
  """Get a SkyPilot API request."""
542
760
  with filelock.FileLock(request_lock_path(request_id)):
543
- return _get_request_no_lock(request_id)
761
+ return _get_request_no_lock(request_id, fields)
544
762
 
545
763
 
546
764
  @init_db_async
547
765
  @metrics_lib.time_me_async
548
- async def get_request_async(request_id: str) -> Optional[Request]:
766
+ @asyncio_utils.shield
767
+ async def get_request_async(
768
+ request_id: str,
769
+ fields: Optional[List[str]] = None) -> Optional[Request]:
549
770
  """Async version of get_request."""
771
+ # TODO(aylei): figure out how to remove FileLock here to avoid the overhead
550
772
  async with filelock.AsyncFileLock(request_lock_path(request_id)):
551
- return await _get_request_no_lock_async(request_id)
773
+ return await _get_request_no_lock_async(request_id, fields)
774
+
775
+
776
@init_db
@metrics_lib.time_me
def get_requests_with_prefix(
        request_id_prefix: str,
        fields: Optional[List[str]] = None) -> Optional[List[Request]]:
    """Get all requests whose request ID starts with a given prefix.

    Args:
        request_id_prefix: the prefix to match; it is used as the leading
            part of a SQL LIKE pattern.
        fields: column names to select. If empty or None, all columns in
            REQUEST_COLUMNS are selected.

    Returns:
        The matching requests, or None if no request matches.
    """
    assert _DB is not None
    columns_str = ', '.join(fields if fields else REQUEST_COLUMNS)
    with _DB.conn:
        cursor = _DB.conn.cursor()
        cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
                        'WHERE request_id LIKE ?'), (request_id_prefix + '%',))
        matched_rows = cursor.fetchall()
    if not matched_rows:
        return None
    if fields:
        # Partial rows are passed through _update_request_row_fields
        # before being handed to Request.from_row.
        matched_rows = [
            _update_request_row_fields(row, fields) for row in matched_rows
        ]
    return [Request.from_row(row) for row in matched_rows]
797
+
798
+
799
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def get_requests_async_with_prefix(
        request_id_prefix: str,
        fields: Optional[List[str]] = None) -> Optional[List[Request]]:
    """Async version of get_requests_with_prefix.

    Args:
        request_id_prefix: the prefix to match; it is used as the leading
            part of a SQL LIKE pattern.
        fields: column names to select. If empty or None, all columns in
            REQUEST_COLUMNS are selected.

    Returns:
        The matching requests, or None if no request matches.
    """
    assert _DB is not None
    columns_str = ', '.join(fields if fields else REQUEST_COLUMNS)
    async with _DB.execute_fetchall_async(
        (f'SELECT {columns_str} FROM {REQUEST_TABLE} '
         'WHERE request_id LIKE ?'), (request_id_prefix + '%',)) as rows:
        if not rows:
            return None
        matched_rows = rows
        if fields:
            # Partial rows are passed through _update_request_row_fields
            # before being handed to Request.from_row.
            matched_rows = [
                _update_request_row_fields(row, fields) for row in rows
            ]
        return [Request.from_row(row) for row in matched_rows]
552
819
 
553
820
 
554
821
  class StatusWithMsg(NamedTuple):
@@ -585,26 +852,29 @@ async def get_request_status_async(
585
852
  return StatusWithMsg(status, status_msg)
586
853
 
587
854
 
588
- @init_db
589
- @metrics_lib.time_me
590
- def create_if_not_exists(request: Request) -> bool:
591
- """Create a SkyPilot API request if it does not exist."""
592
- with filelock.FileLock(request_lock_path(request.request_id)):
593
- if _get_request_no_lock(request.request_id) is not None:
594
- return False
595
- _add_or_update_request_no_lock(request)
596
- return True
597
-
598
-
599
855
  @init_db_async
600
856
  @metrics_lib.time_me_async
857
+ @asyncio_utils.shield
601
858
  async def create_if_not_exists_async(request: Request) -> bool:
602
- """Async version of create_if_not_exists."""
603
- async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
604
- if await _get_request_no_lock_async(request.request_id) is not None:
605
- return False
606
- await _add_or_update_request_no_lock_async(request)
607
- return True
859
+ """Create a request if it does not exist, otherwise do nothing.
860
+
861
+ Returns:
862
+ True if a new request is created, False if the request already exists.
863
+ """
864
+ assert _DB is not None
865
+ request_columns = ', '.join(REQUEST_COLUMNS)
866
+ values_str = ', '.join(['?'] * len(REQUEST_COLUMNS))
867
+ sql_statement = (
868
+ f'INSERT INTO {REQUEST_TABLE} '
869
+ f'({request_columns}) VALUES '
870
+ f'({values_str}) ON CONFLICT(request_id) DO NOTHING RETURNING ROWID')
871
+ request_row = request.to_row()
872
+ # Execute the SQL statement without getting the request lock.
873
+ # The request lock is used to prevent racing with cancellation codepath,
874
+ # but a request cannot be cancelled before it is created.
875
+ row = await _DB.execute_get_returning_value_async(sql_statement,
876
+ request_row)
877
+ return True if row else False
608
878
 
609
879
 
610
880
  @dataclasses.dataclass
@@ -622,6 +892,7 @@ class RequestTaskFilter:
622
892
  Mutually exclusive with exclude_request_names.
623
893
  finished_before: if provided, only include requests finished before this
624
894
  timestamp.
895
+ limit: the number of requests to show. If None, show all requests.
625
896
 
626
897
  Raises:
627
898
  ValueError: If both exclude_request_names and include_request_names are
@@ -633,6 +904,9 @@ class RequestTaskFilter:
633
904
  exclude_request_names: Optional[List[str]] = None
634
905
  include_request_names: Optional[List[str]] = None
635
906
  finished_before: Optional[float] = None
907
+ limit: Optional[int] = None
908
+ fields: Optional[List[str]] = None
909
+ sort: bool = False
636
910
 
637
911
  def __post_init__(self):
638
912
  if (self.exclude_request_names is not None and
@@ -653,6 +927,10 @@ class RequestTaskFilter:
653
927
  status_list_str = ','.join(
654
928
  repr(status.value) for status in self.status)
655
929
  filters.append(f'status IN ({status_list_str})')
930
+ if self.include_request_names is not None:
931
+ request_names_str = ','.join(
932
+ repr(name) for name in self.include_request_names)
933
+ filters.append(f'name IN ({request_names_str})')
656
934
  if self.exclude_request_names is not None:
657
935
  exclude_request_names_str = ','.join(
658
936
  repr(name) for name in self.exclude_request_names)
@@ -664,10 +942,6 @@ class RequestTaskFilter:
664
942
  if self.user_id is not None:
665
943
  filters.append(f'{COL_USER_ID} = ?')
666
944
  filter_params.append(self.user_id)
667
- if self.include_request_names is not None:
668
- request_names_str = ','.join(
669
- repr(name) for name in self.include_request_names)
670
- filters.append(f'name IN ({request_names_str})')
671
945
  if self.finished_before is not None:
672
946
  filters.append('finished_at < ?')
673
947
  filter_params.append(self.finished_before)
@@ -675,8 +949,16 @@ class RequestTaskFilter:
675
949
  if filter_str:
676
950
  filter_str = f' WHERE {filter_str}'
677
951
  columns_str = ', '.join(REQUEST_COLUMNS)
678
- return (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
679
- 'ORDER BY created_at DESC'), filter_params
952
+ if self.fields:
953
+ columns_str = ', '.join(self.fields)
954
+ sort_str = ''
955
+ if self.sort:
956
+ sort_str = ' ORDER BY created_at DESC'
957
+ query_str = (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str}'
958
+ f'{sort_str}')
959
+ if self.limit is not None:
960
+ query_str += f' LIMIT {self.limit}'
961
+ return query_str, filter_params
680
962
 
681
963
 
682
964
  @init_db
@@ -695,6 +977,10 @@ def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
695
977
  rows = cursor.fetchall()
696
978
  if rows is None:
697
979
  return []
980
+ if req_filter.fields:
981
+ rows = [
982
+ _update_request_row_fields(row, req_filter.fields) for row in rows
983
+ ]
698
984
  return [Request.from_row(row) for row in rows]
699
985
 
700
986
 
@@ -707,6 +993,10 @@ async def get_request_tasks_async(
707
993
  async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
708
994
  if not rows:
709
995
  return []
996
+ if req_filter.fields:
997
+ rows = [
998
+ _update_request_row_fields(row, req_filter.fields) for row in rows
999
+ ]
710
1000
  return [Request.from_row(row) for row in rows]
711
1001
 
712
1002
 
@@ -752,11 +1042,15 @@ async def _add_or_update_request_no_lock_async(request: Request):
752
1042
  request.to_row())
753
1043
 
754
1044
 
755
- def set_request_failed(request_id: str, e: BaseException) -> None:
756
- """Set a request to failed and populate the error message."""
1045
+ def set_exception_stacktrace(e: BaseException) -> None:
757
1046
  with ux_utils.enable_traceback():
758
1047
  stacktrace = traceback.format_exc()
759
1048
  setattr(e, 'stacktrace', stacktrace)
1049
+
1050
+
1051
+ def set_request_failed(request_id: str, e: BaseException) -> None:
1052
+ """Set a request to failed and populate the error message."""
1053
+ set_exception_stacktrace(e)
760
1054
  with update_request(request_id) as request_task:
761
1055
  assert request_task is not None, request_id
762
1056
  request_task.status = RequestStatus.FAILED
@@ -764,6 +1058,21 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
764
1058
  request_task.set_error(e)
765
1059
 
766
1060
 
1061
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_failed_async(request_id: str, e: BaseException) -> None:
    """Mark a request as failed and record the error on it.

    Args:
        request_id: the ID of the request; it must exist.
        e: the exception that caused the failure. A stacktrace is attached
            to it via set_exception_stacktrace before it is stored.
    """
    set_exception_stacktrace(e)
    lock = filelock.AsyncFileLock(request_lock_path(request_id))
    async with lock:
        failed_request = await _get_request_no_lock_async(request_id)
        assert failed_request is not None, request_id
        failed_request.status = RequestStatus.FAILED
        failed_request.finished_at = time.time()
        failed_request.set_error(e)
        await _add_or_update_request_no_lock_async(failed_request)
1074
+
1075
+
767
1076
  def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
768
1077
  """Set a request to succeeded and populate the result."""
769
1078
  with update_request(request_id) as request_task:
@@ -774,25 +1083,50 @@ def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
774
1083
  request_task.set_return_value(result)
775
1084
 
776
1085
 
777
- def set_request_cancelled(request_id: str) -> None:
778
- """Set a request to cancelled."""
779
- with update_request(request_id) as request_task:
1086
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_succeeded_async(request_id: str,
                                      result: Optional[Any]) -> None:
    """Mark a request as succeeded and store its result.

    Args:
        request_id: the ID of the request; it must exist.
        result: the return value to store. Nothing is stored when it is
            None.
    """
    lock = filelock.AsyncFileLock(request_lock_path(request_id))
    async with lock:
        finished_request = await _get_request_no_lock_async(request_id)
        assert finished_request is not None, request_id
        finished_request.status = RequestStatus.SUCCEEDED
        finished_request.finished_at = time.time()
        if result is not None:
            finished_request.set_return_value(result)
        await _add_or_update_request_no_lock_async(finished_request)
1100
+
1101
+
1102
@init_db_async
@metrics_lib.time_me_async
@asyncio_utils.shield
async def set_request_cancelled_async(request_id: str) -> None:
    """Mark a pending or running request as cancelled.

    Requests whose status is already past RUNNING are left unchanged.

    Args:
        request_id: the ID of the request; it must exist.
    """
    lock = filelock.AsyncFileLock(request_lock_path(request_id))
    async with lock:
        target_request = await _get_request_no_lock_async(request_id)
        assert target_request is not None, request_id
        if target_request.status > RequestStatus.RUNNING:
            # Already finished or cancelled.
            return
        target_request.finished_at = time.time()
        target_request.status = RequestStatus.CANCELLED
        await _add_or_update_request_no_lock_async(target_request)
783
1116
 
784
1117
 
785
1118
  @init_db
786
1119
  @metrics_lib.time_me
787
- async def _delete_requests(requests: List[Request]):
1120
+ async def _delete_requests(request_ids: List[str]):
788
1121
  """Clean up requests by their IDs."""
789
- id_list_str = ','.join(repr(req.request_id) for req in requests)
1122
+ id_list_str = ','.join(repr(request_id) for request_id in request_ids)
790
1123
  assert _DB is not None
791
1124
  await _DB.execute_and_commit_async(
792
1125
  f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
793
1126
 
794
1127
 
795
- async def clean_finished_requests_with_retention(retention_seconds: int):
1128
+ async def clean_finished_requests_with_retention(retention_seconds: int,
1129
+ batch_size: int = 1000):
796
1130
  """Clean up finished requests older than the retention period.
797
1131
 
798
1132
  This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
@@ -801,24 +1135,40 @@ async def clean_finished_requests_with_retention(retention_seconds: int):
801
1135
  Args:
802
1136
  retention_seconds: Requests older than this many seconds will be
803
1137
  deleted.
1138
+ batch_size: batch delete 'batch_size' requests at a time to
1139
+ avoid using too much memory at once and to let each
1140
+ db query complete in a reasonable time. All stale
1141
+ requests older than the retention period will be deleted
1142
+ regardless of the batch size.
804
1143
  """
805
- reqs = await get_request_tasks_async(
806
- req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
807
- finished_before=time.time() -
808
- retention_seconds))
809
-
810
- futs = []
811
- for req in reqs:
812
- futs.append(
813
- asyncio.create_task(
814
- anyio.Path(req.log_path.absolute()).unlink(missing_ok=True)))
815
- await asyncio.gather(*futs)
816
-
817
- await _delete_requests(reqs)
1144
+ total_deleted = 0
1145
+ while True:
1146
+ reqs = await get_request_tasks_async(
1147
+ req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
1148
+ finished_before=time.time() -
1149
+ retention_seconds,
1150
+ limit=batch_size,
1151
+ fields=['request_id']))
1152
+ if len(reqs) == 0:
1153
+ break
1154
+ futs = []
1155
+ for req in reqs:
1156
+ # req.log_path is derived from request_id,
1157
+ # so it's ok to just grab the request_id in the above query.
1158
+ futs.append(
1159
+ asyncio.create_task(
1160
+ anyio.Path(
1161
+ req.log_path.absolute()).unlink(missing_ok=True)))
1162
+ await asyncio.gather(*futs)
1163
+
1164
+ await _delete_requests([req.request_id for req in reqs])
1165
+ total_deleted += len(reqs)
1166
+ if len(reqs) < batch_size:
1167
+ break
818
1168
 
819
1169
  # To avoid leakage of the log file, logs must be deleted before the
820
1170
  # request task in the database.
821
- logger.info(f'Cleaned up {len(reqs)} finished requests '
1171
+ logger.info(f'Cleaned up {total_deleted} finished requests '
822
1172
  f'older than {retention_seconds} seconds')
823
1173
 
824
1174