skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -5,7 +5,6 @@ import contextlib
5
5
  import dataclasses
6
6
  import enum
7
7
  import functools
8
- import json
9
8
  import os
10
9
  import pathlib
11
10
  import shutil
@@ -14,24 +13,28 @@ import sqlite3
14
13
  import threading
15
14
  import time
16
15
  import traceback
17
- from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
18
- NamedTuple, Optional, Tuple)
16
+ from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
17
+ Tuple)
18
+ import uuid
19
19
 
20
20
  import anyio
21
21
  import colorama
22
22
  import filelock
23
+ import orjson
23
24
 
24
25
  from sky import exceptions
25
26
  from sky import global_user_state
26
27
  from sky import sky_logging
27
28
  from sky import skypilot_config
29
+ from sky.metrics import utils as metrics_lib
28
30
  from sky.server import common as server_common
29
31
  from sky.server import constants as server_constants
30
32
  from sky.server import daemons
31
- from sky.server import metrics as metrics_lib
32
33
  from sky.server.requests import payloads
33
34
  from sky.server.requests.serializers import decoders
34
35
  from sky.server.requests.serializers import encoders
36
+ from sky.server.requests.serializers import return_value_serializers
37
+ from sky.utils import asyncio_utils
35
38
  from sky.utils import common_utils
36
39
  from sky.utils import ux_utils
37
40
  from sky.utils.db import db_utils
@@ -211,8 +214,8 @@ class Request:
211
214
  entrypoint=self.entrypoint.__name__,
212
215
  request_body=self.request_body.model_dump_json(),
213
216
  status=self.status.value,
214
- return_value=json.dumps(None),
215
- error=json.dumps(None),
217
+ return_value=orjson.dumps(None).decode('utf-8'),
218
+ error=orjson.dumps(None).decode('utf-8'),
216
219
  pid=None,
217
220
  created_at=self.created_at,
218
221
  schedule_type=self.schedule_type.value,
@@ -229,14 +232,17 @@ class Request:
229
232
  assert isinstance(self.request_body,
230
233
  payloads.RequestBody), (self.name, self.request_body)
231
234
  try:
235
+ # Use version-aware serializer to handle backward compatibility
236
+ # for old clients that don't recognize new fields.
237
+ serializer = return_value_serializers.get_serializer(self.name)
232
238
  return payloads.RequestPayload(
233
239
  request_id=self.request_id,
234
240
  name=self.name,
235
241
  entrypoint=encoders.pickle_and_encode(self.entrypoint),
236
242
  request_body=encoders.pickle_and_encode(self.request_body),
237
243
  status=self.status.value,
238
- return_value=json.dumps(self.return_value),
239
- error=json.dumps(self.error),
244
+ return_value=serializer(self.return_value),
245
+ error=orjson.dumps(self.error).decode('utf-8'),
240
246
  pid=self.pid,
241
247
  created_at=self.created_at,
242
248
  schedule_type=self.schedule_type.value,
@@ -268,8 +274,8 @@ class Request:
268
274
  entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
269
275
  request_body=decoders.decode_and_unpickle(payload.request_body),
270
276
  status=RequestStatus(payload.status),
271
- return_value=json.loads(payload.return_value),
272
- error=json.loads(payload.error),
277
+ return_value=orjson.loads(payload.return_value),
278
+ error=orjson.loads(payload.error),
273
279
  pid=payload.pid,
274
280
  created_at=payload.created_at,
275
281
  schedule_type=ScheduleType(payload.schedule_type),
@@ -292,72 +298,104 @@ class Request:
292
298
  raise
293
299
 
294
300
 
295
- def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
296
- """Kill all pending and running requests for a cluster.
301
+ def get_new_request_id() -> str:
302
+ """Get a new request ID."""
303
+ return str(uuid.uuid4())
297
304
 
298
- Args:
299
- cluster_name: the name of the cluster.
300
- exclude_request_names: exclude requests with these names. This is to
301
- prevent killing the caller request.
302
- """
303
- request_ids = [
304
- request_task.request_id
305
- for request_task in get_request_tasks(req_filter=RequestTaskFilter(
306
- cluster_names=[cluster_name],
307
- status=[RequestStatus.PENDING, RequestStatus.RUNNING],
308
- exclude_request_names=[exclude_request_name]))
309
- ]
310
- kill_requests(request_ids)
311
305
 
306
+ def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
307
+ """Serialize the SkyPilot API request for display purposes.
312
308
 
313
- def kill_requests(request_ids: Optional[List[str]] = None,
314
- user_id: Optional[str] = None) -> List[str]:
315
- """Kill a SkyPilot API request and set its status to cancelled.
309
+ This function should be called on the server side to serialize the
310
+ request body into human readable format, e.g., the entrypoint should
311
+ be a string, and the pid, error, or return value are not needed.
316
312
 
317
- Args:
318
- request_ids: The request IDs to kill. If None, all requests for the
319
- user are killed.
320
- user_id: The user ID to kill requests for. If None, all users are
321
- killed.
313
+ The returned value will then be displayed on the client side in request
314
+ table.
322
315
 
323
- Returns:
324
- A list of request IDs that were cancelled.
325
- """
326
- if request_ids is None:
327
- request_ids = [
328
- request_task.request_id
329
- for request_task in get_request_tasks(req_filter=RequestTaskFilter(
330
- user_id=user_id,
331
- status=[RequestStatus.RUNNING, RequestStatus.PENDING],
332
- # Avoid cancelling the cancel request itself.
333
- exclude_request_names=['sky.api_cancel']))
334
- ]
335
- cancelled_request_ids = []
336
- for request_id in request_ids:
337
- with update_request(request_id) as request_record:
338
- if request_record is None:
339
- logger.debug(f'No request ID {request_id}')
340
- continue
341
- # Skip internal requests. The internal requests are scheduled with
342
- # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
343
- if request_record.request_id in set(
344
- event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
345
- continue
346
- if request_record.status > RequestStatus.RUNNING:
347
- logger.debug(f'Request {request_id} already finished')
348
- continue
349
- if request_record.pid is not None:
350
- logger.debug(f'Killing request process {request_record.pid}')
351
- # Use SIGTERM instead of SIGKILL:
352
- # - The executor can handle SIGTERM gracefully
353
- # - After SIGTERM, the executor can reuse the request process
354
- # for other requests, avoiding the overhead of forking a new
355
- # process for each request.
356
- os.kill(request_record.pid, signal.SIGTERM)
357
- request_record.status = RequestStatus.CANCELLED
358
- request_record.finished_at = time.time()
359
- cancelled_request_ids.append(request_id)
360
- return cancelled_request_ids
316
+ We do not use `encode` for display to avoid a large amount of data being
317
+ sent to the client side, especially for the request table could include
318
+ all the requests.
319
+ """
320
+ encoded_requests = []
321
+ all_users = global_user_state.get_all_users()
322
+ all_users_map = {user.id: user.name for user in all_users}
323
+ for request in requests:
324
+ if request.request_body is not None:
325
+ assert isinstance(request.request_body,
326
+ payloads.RequestBody), (request.name,
327
+ request.request_body)
328
+ user_name = all_users_map.get(request.user_id)
329
+ payload = payloads.RequestPayload(
330
+ request_id=request.request_id,
331
+ name=request.name,
332
+ entrypoint=request.entrypoint.__name__
333
+ if request.entrypoint is not None else '',
334
+ request_body=request.request_body.model_dump_json()
335
+ if request.request_body is not None else
336
+ orjson.dumps(None).decode('utf-8'),
337
+ status=request.status.value,
338
+ return_value=orjson.dumps(None).decode('utf-8'),
339
+ error=orjson.dumps(None).decode('utf-8'),
340
+ pid=None,
341
+ created_at=request.created_at,
342
+ schedule_type=request.schedule_type.value,
343
+ user_id=request.user_id,
344
+ user_name=user_name,
345
+ cluster_name=request.cluster_name,
346
+ status_msg=request.status_msg,
347
+ should_retry=request.should_retry,
348
+ finished_at=request.finished_at,
349
+ )
350
+ encoded_requests.append(payload)
351
+ return encoded_requests
352
+
353
+
354
+ def _update_request_row_fields(
355
+ row: Tuple[Any, ...],
356
+ fields: Optional[List[str]] = None) -> Tuple[Any, ...]:
357
+ """Update the request row fields."""
358
+ if not fields:
359
+ return row
360
+
361
+ # Convert tuple to dictionary for easier manipulation
362
+ content = dict(zip(fields, row))
363
+
364
+ # Required fields in RequestPayload
365
+ if 'request_id' not in fields:
366
+ content['request_id'] = ''
367
+ if 'name' not in fields:
368
+ content['name'] = ''
369
+ if 'entrypoint' not in fields:
370
+ content['entrypoint'] = server_constants.EMPTY_PICKLED_VALUE
371
+ if 'request_body' not in fields:
372
+ content['request_body'] = server_constants.EMPTY_PICKLED_VALUE
373
+ if 'status' not in fields:
374
+ content['status'] = RequestStatus.PENDING.value
375
+ if 'created_at' not in fields:
376
+ content['created_at'] = 0
377
+ if 'user_id' not in fields:
378
+ content['user_id'] = ''
379
+ if 'return_value' not in fields:
380
+ content['return_value'] = orjson.dumps(None).decode('utf-8')
381
+ if 'error' not in fields:
382
+ content['error'] = orjson.dumps(None).decode('utf-8')
383
+ if 'schedule_type' not in fields:
384
+ content['schedule_type'] = ScheduleType.SHORT.value
385
+ # Optional fields in RequestPayload
386
+ if 'pid' not in fields:
387
+ content['pid'] = None
388
+ if 'cluster_name' not in fields:
389
+ content['cluster_name'] = None
390
+ if 'status_msg' not in fields:
391
+ content['status_msg'] = None
392
+ if 'should_retry' not in fields:
393
+ content['should_retry'] = False
394
+ if 'finished_at' not in fields:
395
+ content['finished_at'] = None
396
+
397
+ # Convert back to tuple in the same order as REQUEST_COLUMNS
398
+ return tuple(content[col] for col in REQUEST_COLUMNS)
361
399
 
362
400
 
363
401
  def create_table(cursor, conn):
@@ -402,6 +440,21 @@ def create_table(cursor, conn):
402
440
  db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
403
441
  'REAL')
404
442
 
443
+ # Add an index on (status, name) to speed up queries
444
+ # that filter on these columns.
445
+ cursor.execute(f"""\
446
+ CREATE INDEX IF NOT EXISTS status_name_idx ON {REQUEST_TABLE} (status, name) WHERE status IN ('PENDING', 'RUNNING');
447
+ """)
448
+ # Add an index on cluster_name to speed up queries
449
+ # that filter on this column.
450
+ cursor.execute(f"""\
451
+ CREATE INDEX IF NOT EXISTS cluster_name_idx ON {REQUEST_TABLE} ({COL_CLUSTER_NAME}) WHERE status IN ('PENDING', 'RUNNING');
452
+ """)
453
+ # Add an index on created_at to speed up queries that sort on this column.
454
+ cursor.execute(f"""\
455
+ CREATE INDEX IF NOT EXISTS created_at_idx ON {REQUEST_TABLE} (created_at);
456
+ """)
457
+
405
458
 
406
459
  _DB = None
407
460
  _init_db_lock = threading.Lock()
@@ -449,11 +502,37 @@ def init_db_async(func):
449
502
 
450
503
  def reset_db_and_logs():
451
504
  """Create the database."""
505
+ logger.debug('clearing local API server database')
452
506
  server_common.clear_local_api_server_database()
507
+ logger.debug(
508
+ f'clearing local API server logs directory at {REQUEST_LOG_PATH_PREFIX}'
509
+ )
453
510
  shutil.rmtree(pathlib.Path(REQUEST_LOG_PATH_PREFIX).expanduser(),
454
511
  ignore_errors=True)
512
+ logger.debug('clearing local API server client directory at '
513
+ f'{server_common.API_SERVER_CLIENT_DIR.expanduser()}')
455
514
  shutil.rmtree(server_common.API_SERVER_CLIENT_DIR.expanduser(),
456
515
  ignore_errors=True)
516
+ with _init_db_lock:
517
+ _init_db_within_lock()
518
+ assert _DB is not None
519
+ with _DB.conn:
520
+ cursor = _DB.conn.cursor()
521
+ cursor.execute('SELECT sqlite_version()')
522
+ row = cursor.fetchone()
523
+ if row is None:
524
+ raise RuntimeError('Failed to get SQLite version')
525
+ version_str = row[0]
526
+ version_parts = version_str.split('.')
527
+ assert len(version_parts) >= 2, \
528
+ f'Invalid version string: {version_str}'
529
+ major, minor = int(version_parts[0]), int(version_parts[1])
530
+ # SQLite 3.35.0+ supports RETURNING statements.
531
+ # 3.35.0 was released in March 2021.
532
+ if not ((major > 3) or (major == 3 and minor >= 35)):
533
+ raise RuntimeError(
534
+ f'SQLite version {version_str} is not supported. '
535
+ 'Please upgrade to SQLite 3.35.0 or later.')
457
536
 
458
537
 
459
538
  def request_lock_path(request_id: str) -> str:
@@ -462,93 +541,285 @@ def request_lock_path(request_id: str) -> str:
462
541
  return os.path.join(lock_path, f'.{request_id}.lock')
463
542
 
464
543
 
544
+ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
545
+ """Kill all pending and running requests for a cluster.
546
+
547
+ Args:
548
+ cluster_name: the name of the cluster.
549
+ exclude_request_name: exclude requests with this name. This is to
550
+ prevent killing the caller request.
551
+ """
552
+ request_ids = [
553
+ request_task.request_id
554
+ for request_task in get_request_tasks(req_filter=RequestTaskFilter(
555
+ status=[RequestStatus.PENDING, RequestStatus.RUNNING],
556
+ exclude_request_names=[exclude_request_name],
557
+ cluster_names=[cluster_name],
558
+ fields=['request_id']))
559
+ ]
560
+ _kill_requests(request_ids)
561
+
562
+
563
+ def kill_requests(request_ids: Optional[List[str]] = None,
564
+ user_id: Optional[str] = None) -> List[str]:
565
+ """Kill requests with a given request ID prefix."""
566
+ expanded_request_ids: Optional[List[str]] = None
567
+ if request_ids is not None:
568
+ expanded_request_ids = []
569
+ for request_id in request_ids:
570
+ request_tasks = get_requests_with_prefix(request_id,
571
+ fields=['request_id'])
572
+ if request_tasks is None or len(request_tasks) == 0:
573
+ continue
574
+ if len(request_tasks) > 1:
575
+ raise ValueError(f'Multiple requests found for '
576
+ f'request ID prefix: {request_id}')
577
+ expanded_request_ids.append(request_tasks[0].request_id)
578
+ return _kill_requests(request_ids=expanded_request_ids, user_id=user_id)
579
+
580
+
581
+ # needed for backward compatibility. Remove by v0.10.7 or v0.12.0
582
+ # and rename kill_requests to kill_requests_with_prefix.
583
+ kill_requests_with_prefix = kill_requests
584
+
585
+
586
+ def _should_kill_request(request_id: str,
587
+ request_record: Optional[Request]) -> bool:
588
+ if request_record is None:
589
+ logger.debug(f'No request ID {request_id}')
590
+ return False
591
+ # Skip internal requests. The internal requests are scheduled with
592
+ # request_id equal to the id of an entry in daemons.INTERNAL_REQUEST_DAEMONS.
593
+ if request_record.request_id in set(
594
+ event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
595
+ return False
596
+ if request_record.status > RequestStatus.RUNNING:
597
+ logger.debug(f'Request {request_id} already finished')
598
+ return False
599
+ return True
600
+
601
+
602
+ def _kill_requests(request_ids: Optional[List[str]] = None,
603
+ user_id: Optional[str] = None) -> List[str]:
604
+ """Kill SkyPilot API requests and set their status to cancelled.
605
+
606
+ Args:
607
+ request_ids: The request IDs to kill. If None, all requests for the
608
+ user are killed.
609
+ user_id: The user ID to kill requests for. If None, requests of all
610
+ users are killed.
611
+
612
+ Returns:
613
+ A list of request IDs that were cancelled.
614
+ """
615
+ if request_ids is None:
616
+ request_ids = [
617
+ request_task.request_id
618
+ for request_task in get_request_tasks(req_filter=RequestTaskFilter(
619
+ status=[RequestStatus.PENDING, RequestStatus.RUNNING],
620
+ # Avoid cancelling the cancel request itself.
621
+ exclude_request_names=['sky.api_cancel'],
622
+ user_id=user_id,
623
+ fields=['request_id']))
624
+ ]
625
+ cancelled_request_ids = []
626
+ for request_id in request_ids:
627
+ with update_request(request_id) as request_record:
628
+ if not _should_kill_request(request_id, request_record):
629
+ continue
630
+ if request_record.pid is not None:
631
+ logger.debug(f'Killing request process {request_record.pid}')
632
+ # Use SIGTERM instead of SIGKILL:
633
+ # - The executor can handle SIGTERM gracefully
634
+ # - After SIGTERM, the executor can reuse the request process
635
+ # for other requests, avoiding the overhead of forking a new
636
+ # process for each request.
637
+ os.kill(request_record.pid, signal.SIGTERM)
638
+ request_record.status = RequestStatus.CANCELLED
639
+ request_record.finished_at = time.time()
640
+ cancelled_request_ids.append(request_id)
641
+ return cancelled_request_ids
642
+
643
+
644
+ @init_db_async
645
+ @asyncio_utils.shield
646
+ async def kill_request_async(request_id: str) -> bool:
647
+ """Kill a SkyPilot API request and set its status to cancelled.
648
+
649
+ Returns:
650
+ True if the request was killed, False otherwise.
651
+ """
652
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
653
+ request = await _get_request_no_lock_async(request_id)
654
+ if not _should_kill_request(request_id, request):
655
+ return False
656
+ assert request is not None
657
+ if request.pid is not None:
658
+ logger.debug(f'Killing request process {request.pid}')
659
+ # Use SIGTERM instead of SIGKILL:
660
+ # - The executor can handle SIGTERM gracefully
661
+ # - After SIGTERM, the executor can reuse the request process
662
+ # for other requests, avoiding the overhead of forking a new
663
+ # process for each request.
664
+ os.kill(request.pid, signal.SIGTERM)
665
+ request.status = RequestStatus.CANCELLED
666
+ request.finished_at = time.time()
667
+ await _add_or_update_request_no_lock_async(request)
668
+ return True
669
+
670
+
465
671
  @contextlib.contextmanager
466
672
  @init_db
467
673
  @metrics_lib.time_me
468
674
  def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
469
675
  """Get and update a SkyPilot API request."""
470
- request = _get_request_no_lock(request_id)
471
- yield request
472
- if request is not None:
473
- _add_or_update_request_no_lock(request)
676
+ # Acquire the lock to avoid race conditions between multiple request
677
+ # operations, e.g. execute and cancel.
678
+ with filelock.FileLock(request_lock_path(request_id)):
679
+ request = _get_request_no_lock(request_id)
680
+ yield request
681
+ if request is not None:
682
+ _add_or_update_request_no_lock(request)
474
683
 
475
684
 
476
- @init_db
685
+ @init_db_async
477
686
  @metrics_lib.time_me
478
- def update_request_async(
479
- request_id: str) -> AsyncContextManager[Optional[Request]]:
480
- """Async version of update_request.
481
-
482
- Returns an async context manager that yields the request record and
483
- persists any in-place updates upon exit.
484
- """
485
-
486
- @contextlib.asynccontextmanager
487
- async def _cm():
687
+ @asyncio_utils.shield
688
+ async def update_status_async(request_id: str, status: RequestStatus) -> None:
689
+ """Update the status of a request"""
690
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
488
691
  request = await _get_request_no_lock_async(request_id)
489
- try:
490
- yield request
491
- finally:
492
- if request is not None:
493
- await _add_or_update_request_no_lock_async(request)
494
-
495
- return _cm()
692
+ if request is not None:
693
+ request.status = status
694
+ await _add_or_update_request_no_lock_async(request)
496
695
 
497
696
 
498
- _get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
499
- 'WHERE request_id LIKE ?')
697
+ @init_db_async
698
+ @metrics_lib.time_me
699
+ @asyncio_utils.shield
700
+ async def update_status_msg_async(request_id: str, status_msg: str) -> None:
701
+ """Update the status message of a request"""
702
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
703
+ request = await _get_request_no_lock_async(request_id)
704
+ if request is not None:
705
+ request.status_msg = status_msg
706
+ await _add_or_update_request_no_lock_async(request)
500
707
 
501
708
 
502
- def _get_request_no_lock(request_id: str) -> Optional[Request]:
709
+ def _get_request_no_lock(
710
+ request_id: str,
711
+ fields: Optional[List[str]] = None) -> Optional[Request]:
503
712
  """Get a SkyPilot API request."""
504
713
  assert _DB is not None
714
+ columns_str = ', '.join(REQUEST_COLUMNS)
715
+ if fields:
716
+ columns_str = ', '.join(fields)
505
717
  with _DB.conn:
506
718
  cursor = _DB.conn.cursor()
507
- cursor.execute(_get_request_sql, (request_id + '%',))
719
+ cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
720
+ 'WHERE request_id LIKE ?'), (request_id + '%',))
508
721
  row = cursor.fetchone()
509
722
  if row is None:
510
723
  return None
724
+ if fields:
725
+ row = _update_request_row_fields(row, fields)
511
726
  return Request.from_row(row)
512
727
 
513
728
 
514
- async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
729
+ async def _get_request_no_lock_async(
730
+ request_id: str,
731
+ fields: Optional[List[str]] = None) -> Optional[Request]:
515
732
  """Async version of _get_request_no_lock."""
516
733
  assert _DB is not None
517
- async with _DB.execute_fetchall_async(_get_request_sql,
518
- (request_id + '%',)) as rows:
734
+ columns_str = ', '.join(REQUEST_COLUMNS)
735
+ if fields:
736
+ columns_str = ', '.join(fields)
737
+ async with _DB.execute_fetchall_async(
738
+ (f'SELECT {columns_str} FROM {REQUEST_TABLE} '
739
+ 'WHERE request_id LIKE ?'), (request_id + '%',)) as rows:
519
740
  row = rows[0] if rows else None
520
741
  if row is None:
521
742
  return None
743
+ if fields:
744
+ row = _update_request_row_fields(row, fields)
522
745
  return Request.from_row(row)
523
746
 
524
747
 
525
- @init_db
748
+ @init_db_async
526
749
  @metrics_lib.time_me
527
- def get_latest_request_id() -> Optional[str]:
750
+ async def get_latest_request_id_async() -> Optional[str]:
528
751
  """Get the latest request ID."""
529
752
  assert _DB is not None
530
- with _DB.conn:
531
- cursor = _DB.conn.cursor()
532
- cursor.execute(f'SELECT request_id FROM {REQUEST_TABLE} '
533
- 'ORDER BY created_at DESC LIMIT 1')
534
- row = cursor.fetchone()
535
- return row[0] if row else None
753
+ async with _DB.execute_fetchall_async(
754
+ (f'SELECT request_id FROM {REQUEST_TABLE} '
755
+ 'ORDER BY created_at DESC LIMIT 1')) as rows:
756
+ return rows[0][0] if rows else None
536
757
 
537
758
 
538
759
  @init_db
539
760
  @metrics_lib.time_me
540
- def get_request(request_id: str) -> Optional[Request]:
761
+ def get_request(request_id: str,
762
+ fields: Optional[List[str]] = None) -> Optional[Request]:
541
763
  """Get a SkyPilot API request."""
542
764
  with filelock.FileLock(request_lock_path(request_id)):
543
- return _get_request_no_lock(request_id)
765
+ return _get_request_no_lock(request_id, fields)
544
766
 
545
767
 
546
768
  @init_db_async
547
769
  @metrics_lib.time_me_async
548
- async def get_request_async(request_id: str) -> Optional[Request]:
770
+ @asyncio_utils.shield
771
+ async def get_request_async(
772
+ request_id: str,
773
+ fields: Optional[List[str]] = None) -> Optional[Request]:
549
774
  """Async version of get_request."""
775
+ # TODO(aylei): figure out how to remove FileLock here to avoid the overhead
550
776
  async with filelock.AsyncFileLock(request_lock_path(request_id)):
551
- return await _get_request_no_lock_async(request_id)
777
+ return await _get_request_no_lock_async(request_id, fields)
778
+
779
+
780
+ @init_db
781
+ @metrics_lib.time_me
782
+ def get_requests_with_prefix(
783
+ request_id_prefix: str,
784
+ fields: Optional[List[str]] = None) -> Optional[List[Request]]:
785
+ """Get requests with a given request ID prefix."""
786
+ assert _DB is not None
787
+ if fields:
788
+ columns_str = ', '.join(fields)
789
+ else:
790
+ columns_str = ', '.join(REQUEST_COLUMNS)
791
+ with _DB.conn:
792
+ cursor = _DB.conn.cursor()
793
+ cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
794
+ 'WHERE request_id LIKE ?'), (request_id_prefix + '%',))
795
+ rows = cursor.fetchall()
796
+ if not rows:
797
+ return None
798
+ if fields:
799
+ rows = [_update_request_row_fields(row, fields) for row in rows]
800
+ return [Request.from_row(row) for row in rows]
801
+
802
+
803
+ @init_db_async
804
+ @metrics_lib.time_me_async
805
+ @asyncio_utils.shield
806
+ async def get_requests_async_with_prefix(
807
+ request_id_prefix: str,
808
+ fields: Optional[List[str]] = None) -> Optional[List[Request]]:
809
+ """Async version of get_requests_with_prefix."""
810
+ assert _DB is not None
811
+ if fields:
812
+ columns_str = ', '.join(fields)
813
+ else:
814
+ columns_str = ', '.join(REQUEST_COLUMNS)
815
+ async with _DB.execute_fetchall_async(
816
+ (f'SELECT {columns_str} FROM {REQUEST_TABLE} '
817
+ 'WHERE request_id LIKE ?'), (request_id_prefix + '%',)) as rows:
818
+ if not rows:
819
+ return None
820
+ if fields:
821
+ rows = [_update_request_row_fields(row, fields) for row in rows]
822
+ return [Request.from_row(row) for row in rows]
552
823
 
553
824
 
554
825
  class StatusWithMsg(NamedTuple):
@@ -585,26 +856,29 @@ async def get_request_status_async(
585
856
  return StatusWithMsg(status, status_msg)
586
857
 
587
858
 
588
- @init_db
589
- @metrics_lib.time_me
590
- def create_if_not_exists(request: Request) -> bool:
591
- """Create a SkyPilot API request if it does not exist."""
592
- with filelock.FileLock(request_lock_path(request.request_id)):
593
- if _get_request_no_lock(request.request_id) is not None:
594
- return False
595
- _add_or_update_request_no_lock(request)
596
- return True
597
-
598
-
599
859
  @init_db_async
600
860
  @metrics_lib.time_me_async
861
+ @asyncio_utils.shield
601
862
  async def create_if_not_exists_async(request: Request) -> bool:
602
- """Async version of create_if_not_exists."""
603
- async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
604
- if await _get_request_no_lock_async(request.request_id) is not None:
605
- return False
606
- await _add_or_update_request_no_lock_async(request)
607
- return True
863
+ """Create a request if it does not exist, otherwise do nothing.
864
+
865
+ Returns:
866
+ True if a new request is created, False if the request already exists.
867
+ """
868
+ assert _DB is not None
869
+ request_columns = ', '.join(REQUEST_COLUMNS)
870
+ values_str = ', '.join(['?'] * len(REQUEST_COLUMNS))
871
+ sql_statement = (
872
+ f'INSERT INTO {REQUEST_TABLE} '
873
+ f'({request_columns}) VALUES '
874
+ f'({values_str}) ON CONFLICT(request_id) DO NOTHING RETURNING ROWID')
875
+ request_row = request.to_row()
876
+ # Execute the SQL statement without getting the request lock.
877
+ # The request lock is used to prevent racing with cancellation codepath,
878
+ # but a request cannot be cancelled before it is created.
879
+ row = await _DB.execute_get_returning_value_async(sql_statement,
880
+ request_row)
881
+ return True if row else False
608
882
 
609
883
 
610
884
  @dataclasses.dataclass
@@ -622,6 +896,7 @@ class RequestTaskFilter:
622
896
  Mutually exclusive with exclude_request_names.
623
897
  finished_before: if provided, only include requests finished before this
624
898
  timestamp.
899
+ limit: the number of requests to show. If None, show all requests.
625
900
 
626
901
  Raises:
627
902
  ValueError: If both exclude_request_names and include_request_names are
@@ -633,6 +908,9 @@ class RequestTaskFilter:
633
908
  exclude_request_names: Optional[List[str]] = None
634
909
  include_request_names: Optional[List[str]] = None
635
910
  finished_before: Optional[float] = None
911
+ limit: Optional[int] = None
912
+ fields: Optional[List[str]] = None
913
+ sort: bool = False
636
914
 
637
915
  def __post_init__(self):
638
916
  if (self.exclude_request_names is not None and
@@ -653,6 +931,10 @@ class RequestTaskFilter:
653
931
  status_list_str = ','.join(
654
932
  repr(status.value) for status in self.status)
655
933
  filters.append(f'status IN ({status_list_str})')
934
+ if self.include_request_names is not None:
935
+ request_names_str = ','.join(
936
+ repr(name) for name in self.include_request_names)
937
+ filters.append(f'name IN ({request_names_str})')
656
938
  if self.exclude_request_names is not None:
657
939
  exclude_request_names_str = ','.join(
658
940
  repr(name) for name in self.exclude_request_names)
@@ -664,10 +946,6 @@ class RequestTaskFilter:
664
946
  if self.user_id is not None:
665
947
  filters.append(f'{COL_USER_ID} = ?')
666
948
  filter_params.append(self.user_id)
667
- if self.include_request_names is not None:
668
- request_names_str = ','.join(
669
- repr(name) for name in self.include_request_names)
670
- filters.append(f'name IN ({request_names_str})')
671
949
  if self.finished_before is not None:
672
950
  filters.append('finished_at < ?')
673
951
  filter_params.append(self.finished_before)
@@ -675,8 +953,16 @@ class RequestTaskFilter:
675
953
  if filter_str:
676
954
  filter_str = f' WHERE {filter_str}'
677
955
  columns_str = ', '.join(REQUEST_COLUMNS)
678
- return (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
679
- 'ORDER BY created_at DESC'), filter_params
956
+ if self.fields:
957
+ columns_str = ', '.join(self.fields)
958
+ sort_str = ''
959
+ if self.sort:
960
+ sort_str = ' ORDER BY created_at DESC'
961
+ query_str = (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str}'
962
+ f'{sort_str}')
963
+ if self.limit is not None:
964
+ query_str += f' LIMIT {self.limit}'
965
+ return query_str, filter_params
680
966
 
681
967
 
682
968
  @init_db
@@ -695,6 +981,10 @@ def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
695
981
  rows = cursor.fetchall()
696
982
  if rows is None:
697
983
  return []
984
+ if req_filter.fields:
985
+ rows = [
986
+ _update_request_row_fields(row, req_filter.fields) for row in rows
987
+ ]
698
988
  return [Request.from_row(row) for row in rows]
699
989
 
700
990
 
@@ -707,6 +997,10 @@ async def get_request_tasks_async(
707
997
  async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
708
998
  if not rows:
709
999
  return []
1000
+ if req_filter.fields:
1001
+ rows = [
1002
+ _update_request_row_fields(row, req_filter.fields) for row in rows
1003
+ ]
710
1004
  return [Request.from_row(row) for row in rows]
711
1005
 
712
1006
 
@@ -752,11 +1046,15 @@ async def _add_or_update_request_no_lock_async(request: Request):
752
1046
  request.to_row())
753
1047
 
754
1048
 
755
- def set_request_failed(request_id: str, e: BaseException) -> None:
756
- """Set a request to failed and populate the error message."""
1049
+ def set_exception_stacktrace(e: BaseException) -> None:
757
1050
  with ux_utils.enable_traceback():
758
1051
  stacktrace = traceback.format_exc()
759
1052
  setattr(e, 'stacktrace', stacktrace)
1053
+
1054
+
1055
+ def set_request_failed(request_id: str, e: BaseException) -> None:
1056
+ """Set a request to failed and populate the error message."""
1057
+ set_exception_stacktrace(e)
760
1058
  with update_request(request_id) as request_task:
761
1059
  assert request_task is not None, request_id
762
1060
  request_task.status = RequestStatus.FAILED
@@ -764,6 +1062,21 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
764
1062
  request_task.set_error(e)
765
1063
 
766
1064
 
1065
+ @init_db_async
1066
+ @metrics_lib.time_me_async
1067
+ @asyncio_utils.shield
1068
+ async def set_request_failed_async(request_id: str, e: BaseException) -> None:
1069
+ """Set a request to failed and populate the error message."""
1070
+ set_exception_stacktrace(e)
1071
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
1072
+ request_task = await _get_request_no_lock_async(request_id)
1073
+ assert request_task is not None, request_id
1074
+ request_task.status = RequestStatus.FAILED
1075
+ request_task.finished_at = time.time()
1076
+ request_task.set_error(e)
1077
+ await _add_or_update_request_no_lock_async(request_task)
1078
+
1079
+
767
1080
  def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
768
1081
  """Set a request to succeeded and populate the result."""
769
1082
  with update_request(request_id) as request_task:
@@ -774,25 +1087,50 @@ def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
774
1087
  request_task.set_return_value(result)
775
1088
 
776
1089
 
777
- def set_request_cancelled(request_id: str) -> None:
778
- """Set a request to cancelled."""
779
- with update_request(request_id) as request_task:
1090
+ @init_db_async
1091
+ @metrics_lib.time_me_async
1092
+ @asyncio_utils.shield
1093
+ async def set_request_succeeded_async(request_id: str,
1094
+ result: Optional[Any]) -> None:
1095
+ """Set a request to succeeded and populate the result."""
1096
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
1097
+ request_task = await _get_request_no_lock_async(request_id)
780
1098
  assert request_task is not None, request_id
1099
+ request_task.status = RequestStatus.SUCCEEDED
1100
+ request_task.finished_at = time.time()
1101
+ if result is not None:
1102
+ request_task.set_return_value(result)
1103
+ await _add_or_update_request_no_lock_async(request_task)
1104
+
1105
+
1106
+ @init_db_async
1107
+ @metrics_lib.time_me_async
1108
+ @asyncio_utils.shield
1109
+ async def set_request_cancelled_async(request_id: str) -> None:
1110
+ """Set a pending or running request to cancelled."""
1111
+ async with filelock.AsyncFileLock(request_lock_path(request_id)):
1112
+ request_task = await _get_request_no_lock_async(request_id)
1113
+ assert request_task is not None, request_id
1114
+ # Already finished or cancelled.
1115
+ if request_task.status > RequestStatus.RUNNING:
1116
+ return
781
1117
  request_task.finished_at = time.time()
782
1118
  request_task.status = RequestStatus.CANCELLED
1119
+ await _add_or_update_request_no_lock_async(request_task)
783
1120
 
784
1121
 
785
1122
  @init_db
786
1123
  @metrics_lib.time_me
787
- async def _delete_requests(requests: List[Request]):
1124
+ async def _delete_requests(request_ids: List[str]):
788
1125
  """Clean up requests by their IDs."""
789
- id_list_str = ','.join(repr(req.request_id) for req in requests)
1126
+ id_list_str = ','.join(repr(request_id) for request_id in request_ids)
790
1127
  assert _DB is not None
791
1128
  await _DB.execute_and_commit_async(
792
1129
  f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
793
1130
 
794
1131
 
795
- async def clean_finished_requests_with_retention(retention_seconds: int):
1132
+ async def clean_finished_requests_with_retention(retention_seconds: int,
1133
+ batch_size: int = 1000):
796
1134
  """Clean up finished requests older than the retention period.
797
1135
 
798
1136
  This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
@@ -801,24 +1139,40 @@ async def clean_finished_requests_with_retention(retention_seconds: int):
801
1139
  Args:
802
1140
  retention_seconds: Requests older than this many seconds will be
803
1141
  deleted.
1142
+ batch_size: batch delete 'batch_size' requests at a time to
1143
+ avoid using too much memory at once and to let each
1144
+ db query complete in a reasonable time. All stale
1145
+ requests older than the retention period will be deleted
1146
+ regardless of the batch size.
804
1147
  """
805
- reqs = await get_request_tasks_async(
806
- req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
807
- finished_before=time.time() -
808
- retention_seconds))
809
-
810
- futs = []
811
- for req in reqs:
812
- futs.append(
813
- asyncio.create_task(
814
- anyio.Path(req.log_path.absolute()).unlink(missing_ok=True)))
815
- await asyncio.gather(*futs)
816
-
817
- await _delete_requests(reqs)
1148
+ total_deleted = 0
1149
+ while True:
1150
+ reqs = await get_request_tasks_async(
1151
+ req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
1152
+ finished_before=time.time() -
1153
+ retention_seconds,
1154
+ limit=batch_size,
1155
+ fields=['request_id']))
1156
+ if len(reqs) == 0:
1157
+ break
1158
+ futs = []
1159
+ for req in reqs:
1160
+ # req.log_path is derived from request_id,
1161
+ # so it's ok to just grab the request_id in the above query.
1162
+ futs.append(
1163
+ asyncio.create_task(
1164
+ anyio.Path(
1165
+ req.log_path.absolute()).unlink(missing_ok=True)))
1166
+ await asyncio.gather(*futs)
1167
+
1168
+ await _delete_requests([req.request_id for req in reqs])
1169
+ total_deleted += len(reqs)
1170
+ if len(reqs) < batch_size:
1171
+ break
818
1172
 
819
1173
  # To avoid leakage of the log file, logs must be deleted before the
820
1174
  # request task in the database.
821
- logger.info(f'Cleaned up {len(reqs)} finished requests '
1175
+ logger.info(f'Cleaned up {total_deleted} finished requests '
822
1176
  f'older than {retention_seconds} seconds')
823
1177
 
824
1178