skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -18,6 +18,8 @@ The number of the workers is determined by the system resources.
18
18
 
19
19
  See the [README.md](../README.md) for detailed architecture of the executor.
20
20
  """
21
+ import asyncio
22
+ import concurrent.futures
21
23
  import contextlib
22
24
  import multiprocessing
23
25
  import os
@@ -29,26 +31,38 @@ import time
29
31
  import typing
30
32
  from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
31
33
 
34
+ import psutil
32
35
  import setproctitle
33
36
 
37
+ from sky import exceptions
34
38
  from sky import global_user_state
35
39
  from sky import models
36
40
  from sky import sky_logging
37
41
  from sky import skypilot_config
42
+ from sky.metrics import utils as metrics_utils
38
43
  from sky.server import common as server_common
39
44
  from sky.server import config as server_config
40
45
  from sky.server import constants as server_constants
46
+ from sky.server import metrics as metrics_lib
41
47
  from sky.server.requests import payloads
42
48
  from sky.server.requests import preconditions
43
49
  from sky.server.requests import process
50
+ from sky.server.requests import request_names
44
51
  from sky.server.requests import requests as api_requests
52
+ from sky.server.requests import threads
45
53
  from sky.server.requests.queues import local_queue
46
54
  from sky.server.requests.queues import mp_queue
47
55
  from sky.skylet import constants
48
56
  from sky.utils import annotations
49
57
  from sky.utils import common_utils
58
+ from sky.utils import context
59
+ from sky.utils import context_utils
50
60
  from sky.utils import subprocess_utils
61
+ from sky.utils import tempstore
51
62
  from sky.utils import timeline
63
+ from sky.utils import yaml_utils
64
+ from sky.utils.db import db_utils
65
+ from sky.workspaces import core as workspaces_core
52
66
 
53
67
  if typing.TYPE_CHECKING:
54
68
  import types
@@ -60,7 +74,6 @@ else:
60
74
  from typing_extensions import ParamSpec
61
75
 
62
76
  P = ParamSpec('P')
63
-
64
77
  logger = sky_logging.init_logger(__name__)
65
78
 
66
79
  # On macOS, the default start method for multiprocessing is 'fork', which
@@ -70,6 +83,31 @@ logger = sky_logging.init_logger(__name__)
70
83
  # platforms, including macOS.
71
84
  multiprocessing.set_start_method('spawn', force=True)
72
85
 
86
+ # An upper limit of max threads for request execution per server process that
87
+ # unlikely to be reached to allow higher concurrency while still prevent the
88
+ # server process become overloaded.
89
+ _REQUEST_THREADS_LIMIT = 128
90
+
91
+ _REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
92
+ # A dedicated thread pool executor for synced requests execution in coroutine to
93
+ # avoid:
94
+ # 1. blocking the event loop;
95
+ # 2. exhausting the default thread pool executor of event loop;
96
+ _REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
97
+
98
+
99
+ def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
100
+ """Lazy init and return the request thread executor for current process."""
101
+ global _REQUEST_THREAD_EXECUTOR
102
+ if _REQUEST_THREAD_EXECUTOR is not None:
103
+ return _REQUEST_THREAD_EXECUTOR
104
+ with _REQUEST_THREAD_EXECUTOR_LOCK:
105
+ if _REQUEST_THREAD_EXECUTOR is None:
106
+ _REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
107
+ name='request_thread_executor',
108
+ max_workers=_REQUEST_THREADS_LIMIT)
109
+ return _REQUEST_THREAD_EXECUTOR
110
+
73
111
 
74
112
  class RequestQueue:
75
113
  """The queue for the requests, either redis or multiprocessing.
@@ -89,21 +127,21 @@ class RequestQueue:
89
127
  else:
90
128
  raise RuntimeError(f'Invalid queue backend: {backend}')
91
129
 
92
- def put(self, request: Tuple[str, bool]) -> None:
130
+ def put(self, request: Tuple[str, bool, bool]) -> None:
93
131
  """Put and request to the queue.
94
132
 
95
133
  Args:
96
- request: A tuple of request_id and ignore_return_value.
134
+ request: A tuple of request_id, ignore_return_value, and retryable.
97
135
  """
98
136
  self.queue.put(request) # type: ignore
99
137
 
100
- def get(self) -> Optional[Tuple[str, bool]]:
138
+ def get(self) -> Optional[Tuple[str, bool, bool]]:
101
139
  """Get a request from the queue.
102
140
 
103
141
  It is non-blocking if the queue is empty, and returns None.
104
142
 
105
143
  Returns:
106
- A tuple of request_id and ignore_return_value.
144
+ A tuple of request_id, ignore_return_value, and retryable.
107
145
  """
108
146
  try:
109
147
  return self.queue.get(block=False)
@@ -121,6 +159,10 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
121
159
  def executor_initializer(proc_group: str):
122
160
  setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
123
161
  f'{multiprocessing.current_process().pid}')
162
+ # Executor never stops, unless the whole process is killed.
163
+ threading.Thread(target=metrics_lib.process_monitor,
164
+ args=(f'worker:{proc_group}', threading.Event()),
165
+ daemon=True).start()
124
166
 
125
167
 
126
168
  class RequestWorker:
@@ -144,10 +186,27 @@ class RequestWorker:
144
186
  self.schedule_type = schedule_type
145
187
  self.garanteed_parallelism = config.garanteed_parallelism
146
188
  self.burstable_parallelism = config.burstable_parallelism
189
+ self.num_db_connections_per_worker = (
190
+ config.num_db_connections_per_worker)
191
+ self._thread: Optional[threading.Thread] = None
192
+ self._cancel_event = threading.Event()
147
193
 
148
194
  def __str__(self) -> str:
149
195
  return f'Worker(schedule_type={self.schedule_type.value})'
150
196
 
197
+ def run_in_background(self) -> None:
198
+ # Thread dispatcher is sufficient for current scale, refer to
199
+ # tests/load_tests/test_queue_dispatcher.py for more details.
200
+ # Use daemon thread for automatic cleanup.
201
+ thread = threading.Thread(target=self.run, daemon=True)
202
+ thread.start()
203
+ self._thread = thread
204
+
205
+ def cancel(self) -> None:
206
+ if self._thread is not None:
207
+ self._cancel_event.set()
208
+ self._thread.join()
209
+
151
210
  def process_request(self, executor: process.BurstableExecutor,
152
211
  queue: RequestQueue) -> None:
153
212
  try:
@@ -155,11 +214,12 @@ class RequestWorker:
155
214
  if request_element is None:
156
215
  time.sleep(0.1)
157
216
  return
158
- request_id, ignore_return_value = request_element
159
- request = api_requests.get_request(request_id)
217
+ request_id, ignore_return_value, _ = request_element
218
+ request = api_requests.get_request(request_id, fields=['status'])
160
219
  assert request is not None, f'Request with ID {request_id} is None'
161
220
  if request.status == api_requests.RequestStatus.CANCELLED:
162
221
  return
222
+ del request
163
223
  logger.info(f'[{self}] Submitting request: {request_id}')
164
224
  # Start additional process to run the request, so that it can be
165
225
  # cancelled when requested by a user.
@@ -167,8 +227,19 @@ class RequestWorker:
167
227
  # multiple requests can share the same process pid, which may cause
168
228
  # issues with SkyPilot core functions if they rely on the exit of
169
229
  # the process, such as subprocess_daemon.py.
170
- executor.submit_until_success(_request_execution_wrapper,
171
- request_id, ignore_return_value)
230
+ fut = executor.submit_until_success(
231
+ _request_execution_wrapper, request_id, ignore_return_value,
232
+ self.num_db_connections_per_worker)
233
+ # Decrement the free executor count when a request starts
234
+ if metrics_utils.METRICS_ENABLED:
235
+ if self.schedule_type == api_requests.ScheduleType.LONG:
236
+ metrics_utils.SKY_APISERVER_LONG_EXECUTORS.dec()
237
+ elif self.schedule_type == api_requests.ScheduleType.SHORT:
238
+ metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.dec()
239
+ # Monitor the result of the request execution.
240
+ threading.Thread(target=self.handle_task_result,
241
+ args=(fut, request_element),
242
+ daemon=True).start()
172
243
 
173
244
  logger.info(f'[{self}] Submitted request: {request_id}')
174
245
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
@@ -178,6 +249,45 @@ class RequestWorker:
178
249
  f'{request_id if "request_id" in locals() else ""} '
179
250
  f'{common_utils.format_exception(e, use_bracket=True)}')
180
251
 
252
+ def handle_task_result(self, fut: concurrent.futures.Future,
253
+ request_element: Tuple[str, bool, bool]) -> None:
254
+ try:
255
+ fut.result()
256
+ except concurrent.futures.process.BrokenProcessPool as e:
257
+ # Happens when the worker process dies unexpectedly, e.g. OOM
258
+ # killed.
259
+ request_id, _, retryable = request_element
260
+ # Ensure the request status.
261
+ api_requests.set_request_failed(request_id, e)
262
+ logger.error(
263
+ f'Request {request_id} failed to get processed '
264
+ f'{common_utils.format_exception(e, use_bracket=True)}')
265
+ if retryable:
266
+ # If the request is retryable and disrupted by broken
267
+ # process pool, reschedule it immediately to get it
268
+ # retried in the new process pool.
269
+ queue = _get_queue(self.schedule_type)
270
+ queue.put(request_element)
271
+ except exceptions.ExecutionRetryableError as e:
272
+ time.sleep(e.retry_wait_seconds)
273
+ # Reset the request status to PENDING so it can be picked up again.
274
+ # Assume retryable since the error is ExecutionRetryableError.
275
+ request_id, _, _ = request_element
276
+ with api_requests.update_request(request_id) as request_task:
277
+ assert request_task is not None, request_id
278
+ request_task.status = api_requests.RequestStatus.PENDING
279
+ # Reschedule the request.
280
+ queue = _get_queue(self.schedule_type)
281
+ queue.put(request_element)
282
+ logger.info(f'Rescheduled request {request_id} for retry')
283
+ finally:
284
+ # Increment the free executor count when a request finishes
285
+ if metrics_utils.METRICS_ENABLED:
286
+ if self.schedule_type == api_requests.ScheduleType.LONG:
287
+ metrics_utils.SKY_APISERVER_LONG_EXECUTORS.inc()
288
+ elif self.schedule_type == api_requests.ScheduleType.SHORT:
289
+ metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.inc()
290
+
181
291
  def run(self) -> None:
182
292
  # Handle the SIGTERM signal to abort the executor process gracefully.
183
293
  proc_group = f'{self.schedule_type.value}'
@@ -198,7 +308,17 @@ class RequestWorker:
198
308
  burst_workers=self.burstable_parallelism,
199
309
  initializer=executor_initializer,
200
310
  initargs=(proc_group,))
201
- while True:
311
+ # Initialize the appropriate gauge for the number of free executors
312
+ total_executors = (self.garanteed_parallelism +
313
+ self.burstable_parallelism)
314
+ if metrics_utils.METRICS_ENABLED:
315
+ if self.schedule_type == api_requests.ScheduleType.LONG:
316
+ metrics_utils.SKY_APISERVER_LONG_EXECUTORS.set(
317
+ total_executors)
318
+ elif self.schedule_type == api_requests.ScheduleType.SHORT:
319
+ metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.set(
320
+ total_executors)
321
+ while not self._cancel_event.is_set():
202
322
  self.process_request(executor, queue)
203
323
  # TODO(aylei): better to distinguish between KeyboardInterrupt and SIGTERM.
204
324
  except KeyboardInterrupt:
@@ -221,22 +341,56 @@ def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
221
341
 
222
342
  @contextlib.contextmanager
223
343
  def override_request_env_and_config(
224
- request_body: payloads.RequestBody) -> Generator[None, None, None]:
344
+ request_body: payloads.RequestBody, request_id: str,
345
+ request_name: str) -> Generator[None, None, None]:
225
346
  """Override the environment and SkyPilot config for a request."""
226
347
  original_env = os.environ.copy()
227
- os.environ.update(request_body.env_vars)
228
- user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
229
- name=request_body.env_vars[constants.USER_ENV_VAR])
230
- global_user_state.add_or_update_user(user)
231
- # Force color to be enabled.
232
- os.environ['CLICOLOR_FORCE'] = '1'
233
- server_common.reload_for_new_request(
234
- client_entrypoint=request_body.entrypoint,
235
- client_command=request_body.entrypoint_command,
236
- using_remote_api_server=request_body.using_remote_api_server)
237
348
  try:
349
+ # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API
350
+ # server affecting client requests. If set on the client side, it will
351
+ # be overridden by the request body.
352
+ os.environ.pop('SKYPILOT_DEBUG', None)
353
+ # Remove the db connection uri from client supplied env vars, as the
354
+ # client should not set the db string on server side.
355
+ request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
356
+ os.environ.update(request_body.env_vars)
357
+ # Note: may be overridden by AuthProxyMiddleware.
358
+ # TODO(zhwu): we need to make the entire request a context available to
359
+ # the entire request execution, so that we can access info like user
360
+ # through the execution.
361
+ user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
362
+ name=request_body.env_vars[constants.USER_ENV_VAR])
363
+ _, user = global_user_state.add_or_update_user(user, return_user=True)
364
+
365
+ # Force color to be enabled.
366
+ os.environ['CLICOLOR_FORCE'] = '1'
367
+ server_common.reload_for_new_request(
368
+ client_entrypoint=request_body.entrypoint,
369
+ client_command=request_body.entrypoint_command,
370
+ using_remote_api_server=request_body.using_remote_api_server,
371
+ user=user,
372
+ request_id=request_id)
373
+ logger.debug(
374
+ f'override path: {request_body.override_skypilot_config_path}')
238
375
  with skypilot_config.override_skypilot_config(
239
- request_body.override_skypilot_config):
376
+ request_body.override_skypilot_config,
377
+ request_body.override_skypilot_config_path):
378
+ # Skip permission check for sky.workspaces.get request
379
+ # as it is used to determine which workspaces the user
380
+ # has access to.
381
+ if request_name != 'sky.workspaces.get':
382
+ try:
383
+ # Reject requests that the user does not have permission
384
+ # to access.
385
+ workspaces_core.reject_request_for_unauthorized_workspace(
386
+ user)
387
+ except exceptions.PermissionDeniedError as e:
388
+ logger.debug(
389
+ f'{request_id} permission denied to workspace: '
390
+ f'{skypilot_config.get_active_workspace()}: {e}')
391
+ raise e
392
+ logger.debug(
393
+ f'{request_id} permission granted to {request_name} request')
240
394
  yield
241
395
  finally:
242
396
  # We need to call the save_timeline() since atexit will not be
@@ -250,35 +404,13 @@ def override_request_env_and_config(
250
404
  os.environ.update(original_env)
251
405
 
252
406
 
253
- def _redirect_output(file: TextIO) -> Tuple[int, int]:
254
- """Redirect stdout and stderr to the log file."""
255
- fd = file.fileno() # Get the file descriptor from the file object
256
- # Store copies of the original stdout and stderr file descriptors
257
- original_stdout = os.dup(sys.stdout.fileno())
258
- original_stderr = os.dup(sys.stderr.fileno())
259
-
260
- # Copy this fd to stdout and stderr
261
- os.dup2(fd, sys.stdout.fileno())
262
- os.dup2(fd, sys.stderr.fileno())
263
- return original_stdout, original_stderr
264
-
265
-
266
- def _restore_output(original_stdout: int, original_stderr: int) -> None:
267
- """Restore stdout and stderr to their original file descriptors."""
268
- os.dup2(original_stdout, sys.stdout.fileno())
269
- os.dup2(original_stderr, sys.stderr.fileno())
270
-
271
- # Close the duplicate file descriptors
272
- os.close(original_stdout)
273
- os.close(original_stderr)
274
-
275
-
276
407
  def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
277
408
  raise KeyboardInterrupt
278
409
 
279
410
 
280
411
  def _request_execution_wrapper(request_id: str,
281
- ignore_return_value: bool) -> None:
412
+ ignore_return_value: bool,
413
+ num_db_connections_per_worker: int = 0) -> None:
282
414
  """Wrapper for a request execution.
283
415
 
284
416
  It wraps the execution of a request to:
@@ -287,71 +419,353 @@ def _request_execution_wrapper(request_id: str,
287
419
  2. Update the request status based on the execution result;
288
420
  3. Redirect the stdout and stderr of the execution to log file;
289
421
  4. Handle the SIGTERM signal to abort the request gracefully.
422
+ 5. Maintain the lifecycle of the temp dir used by the request.
290
423
  """
424
+ pid = multiprocessing.current_process().pid
425
+ proc = psutil.Process(pid)
426
+ rss_begin = proc.memory_info().rss
427
+ db_utils.set_max_connections(num_db_connections_per_worker)
291
428
  # Handle the SIGTERM signal to abort the request processing gracefully.
292
- signal.signal(signal.SIGTERM, _sigterm_handler)
429
+ # Only set up signal handlers in the main thread, as signal.signal() raises
430
+ # ValueError if called from a non-main thread (e.g., in tests).
431
+ if threading.current_thread() is threading.main_thread():
432
+ signal.signal(signal.SIGTERM, _sigterm_handler)
293
433
 
294
- pid = multiprocessing.current_process().pid
295
434
  logger.info(f'Running request {request_id} with pid {pid}')
296
- with api_requests.update_request(request_id) as request_task:
297
- assert request_task is not None, request_id
298
- log_path = request_task.log_path
299
- request_task.pid = pid
300
- request_task.status = api_requests.RequestStatus.RUNNING
301
- func = request_task.entrypoint
302
- request_body = request_task.request_body
303
-
304
- with log_path.open('w', encoding='utf-8') as f:
435
+
436
+ original_stdout = original_stderr = None
437
+
438
+ def _save_current_output() -> None:
439
+ """Save the current stdout and stderr file descriptors."""
440
+ nonlocal original_stdout, original_stderr
441
+ original_stdout = os.dup(sys.stdout.fileno())
442
+ original_stderr = os.dup(sys.stderr.fileno())
443
+
444
+ def _redirect_output(file: TextIO) -> None:
445
+ """Redirect stdout and stderr to the log file."""
446
+ # Get the file descriptor from the file object
447
+ fd = file.fileno()
448
+ # Copy this fd to stdout and stderr
449
+ os.dup2(fd, sys.stdout.fileno())
450
+ os.dup2(fd, sys.stderr.fileno())
451
+
452
+ def _restore_output() -> None:
453
+ """Restore stdout and stderr to their original file descriptors."""
454
+ nonlocal original_stdout, original_stderr
455
+ if original_stdout is not None:
456
+ os.dup2(original_stdout, sys.stdout.fileno())
457
+ os.close(original_stdout)
458
+ original_stdout = None
459
+
460
+ if original_stderr is not None:
461
+ os.dup2(original_stderr, sys.stderr.fileno())
462
+ os.close(original_stderr)
463
+ original_stderr = None
464
+
465
+ request_name = None
466
+ try:
467
+ # As soon as the request is updated with the executor PID, we can
468
+ # receive SIGTERM from cancellation. So, we update the request inside
469
+ # the try block to ensure we have the KeyboardInterrupt handling.
470
+ with api_requests.update_request(request_id) as request_task:
471
+ assert request_task is not None, request_id
472
+ if request_task.status != api_requests.RequestStatus.PENDING:
473
+ logger.debug(f'Request is already {request_task.status.value}, '
474
+ f'skipping execution')
475
+ return
476
+ log_path = request_task.log_path
477
+ request_task.pid = pid
478
+ request_task.status = api_requests.RequestStatus.RUNNING
479
+ func = request_task.entrypoint
480
+ request_body = request_task.request_body
481
+ request_name = request_task.name
482
+
305
483
  # Store copies of the original stdout and stderr file descriptors
306
- original_stdout, original_stderr = _redirect_output(f)
307
- # Redirect the stdout/stderr before overriding the environment and
308
- # config, as there can be some logs during override that needs to be
309
- # captured in the log file.
310
- try:
311
- with override_request_env_and_config(request_body):
484
+ # We do this in two steps because we should make sure to restore the
485
+ # original values even if we are cancelled or fail during the redirect.
486
+ _save_current_output()
487
+
488
+ # Append to the log file instead of overwriting it since there might be
489
+ # logs from previous retries.
490
+ with log_path.open('a', encoding='utf-8') as f:
491
+ # Redirect the stdout/stderr before overriding the environment and
492
+ # config, as there can be some logs during override that need to be
493
+ # captured in the log file.
494
+ _redirect_output(f)
495
+
496
+ with sky_logging.add_debug_log_handler(request_id), \
497
+ override_request_env_and_config(
498
+ request_body, request_id, request_name), \
499
+ tempstore.tempdir():
312
500
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
313
501
  config = skypilot_config.to_dict()
314
502
  logger.debug(f'request config: \n'
315
- f'{common_utils.dump_yaml_str(dict(config))}')
316
- return_value = func(**request_body.to_kwargs())
503
+ f'{yaml_utils.dump_yaml_str(dict(config))}')
504
+ (metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
505
+ labels(request=request_name, pid=pid).inc())
506
+ with metrics_utils.time_it(name=request_name,
507
+ group='request_execution'):
508
+ return_value = func(**request_body.to_kwargs())
317
509
  f.flush()
318
- except KeyboardInterrupt:
319
- logger.info(f'Request {request_id} cancelled by user')
320
- # Kill all children processes related to this request.
321
- # Each executor handles a single request, so we can safely kill all
322
- # children processes related to this request.
323
- # This is required as python does not pass the KeyboardInterrupt
324
- # to the threads that are not main thread.
325
- subprocess_utils.kill_children_processes()
326
- _restore_output(original_stdout, original_stderr)
327
- return
328
- except (Exception, SystemExit) as e: # pylint: disable=broad-except
329
- api_requests.set_request_failed(request_id, e)
330
- _restore_output(original_stdout, original_stderr)
331
- logger.info(f'Request {request_id} failed due to '
332
- f'{common_utils.format_exception(e)}')
333
- return
334
- else:
335
- with api_requests.update_request(request_id) as request_task:
336
- assert request_task is not None, request_id
337
- request_task.status = api_requests.RequestStatus.SUCCEEDED
338
- if not ignore_return_value:
339
- request_task.set_return_value(return_value)
340
- _restore_output(original_stdout, original_stderr)
341
- logger.info(f'Request {request_id} finished')
342
-
343
-
344
- def schedule_request(
345
- request_id: str,
346
- request_name: str,
347
- request_body: payloads.RequestBody,
348
- func: Callable[P, Any],
349
- request_cluster_name: Optional[str] = None,
350
- ignore_return_value: bool = False,
351
- schedule_type: api_requests.ScheduleType = (
352
- api_requests.ScheduleType.LONG),
353
- is_skypilot_system: bool = False,
354
- precondition: Optional[preconditions.Precondition] = None) -> None:
510
+ except KeyboardInterrupt:
511
+ logger.info(f'Request {request_id} cancelled by user')
512
+ # Kill all children processes related to this request.
513
+ # Each executor handles a single request, so we can safely kill all
514
+ # children processes related to this request.
515
+ # This is required as python does not pass the KeyboardInterrupt to the
516
+ # threads that are not main thread.
517
+ subprocess_utils.kill_children_processes()
518
+ return
519
+ except exceptions.ExecutionRetryableError as e:
520
+ logger.error(e)
521
+ logger.info(e.hint)
522
+ with api_requests.update_request(request_id) as request_task:
523
+ assert request_task is not None, request_id
524
+ # Retried request will undergo rescheduling and a new execution,
525
+ # clear the pid of the request.
526
+ request_task.pid = None
527
+ # Yield control to the scheduler for uniform handling of retries.
528
+ _restore_output()
529
+ raise
530
+ except (Exception, SystemExit) as e: # pylint: disable=broad-except
531
+ api_requests.set_request_failed(request_id, e)
532
+ # Manually reset the original stdout and stderr file descriptors early
533
+ # so that the "Request xxxx failed due to ..." log message will be
534
+ # written to the original stdout and stderr file descriptors.
535
+ _restore_output()
536
+ logger.info(f'Request {request_id} failed due to '
537
+ f'{common_utils.format_exception(e)}')
538
+ return
539
+ else:
540
+ api_requests.set_request_succeeded(
541
+ request_id, return_value if not ignore_return_value else None)
542
+ # Manually reset the original stdout and stderr file descriptors early
543
+ # so that the "Request xxxx finished" log message will be
544
+ # written to the original stdout and stderr file descriptors.
545
+ _restore_output()
546
+ logger.info(f'Request {request_id} finished')
547
+ finally:
548
+ _restore_output()
549
+ try:
550
+ # Capture the peak RSS before GC.
551
+ peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
552
+ # Clear request level cache to release all memory used by the
553
+ # request.
554
+ annotations.clear_request_level_cache()
555
+ with metrics_utils.time_it(name='release_memory', group='internal'):
556
+ common_utils.release_memory()
557
+ if request_name is not None:
558
+ _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
559
+ except Exception as e: # pylint: disable=broad-except
560
+ logger.error(f'Failed to record memory metrics: '
561
+ f'{common_utils.format_exception(e)}')
562
+
563
+
564
+ _first_request = True
565
+
566
+
567
+ def _record_memory_metrics(request_name: str, proc: psutil.Process,
568
+ rss_begin: int, peak_rss: int) -> None:
569
+ """Record the memory metrics for a request."""
570
+ # Do not record full memory delta for the first request as it
571
+ # loads the sky core modules and makes the memory usage
572
+ # estimation inaccurate.
573
+ global _first_request
574
+ if _first_request:
575
+ _first_request = False
576
+ return
577
+ rss_end = proc.memory_info().rss
578
+
579
+ # Answer "how much RSS this request contributed?"
580
+ metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
581
+ name=request_name).observe(max(rss_end - rss_begin, 0))
582
+ # Estimate the memory usage by the request by capturing the
583
+ # peak memory delta during the request execution.
584
+ metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
585
+ name=request_name).observe(max(peak_rss - rss_begin, 0))
586
+
587
+
588
+ class CoroutineTask:
589
+ """Wrapper of a background task runs in coroutine"""
590
+
591
+ def __init__(self, task: asyncio.Task):
592
+ self.task = task
593
+
594
+ async def cancel(self):
595
+ try:
596
+ self.task.cancel()
597
+ await self.task
598
+ except asyncio.CancelledError:
599
+ pass
600
+
601
+
602
+ def check_request_thread_executor_available() -> None:
603
+ """Check if the request thread executor is available.
604
+
605
+ This is a best effort check to hint the client to retry other server
606
+ processes when there is no available thread worker in the current one. But
607
+ a request may pass this check and still cannot get worker on execution
608
+ time due to race condition. In this case, the client will see a failed
609
+ request instead of retry.
610
+
611
+ TODO(aylei): this can be refined with a refactor of our coroutine
612
+ execution flow.
613
+ """
614
+ get_request_thread_executor().check_available()
615
+
616
+
617
+ def execute_request_in_coroutine(
618
+ request: api_requests.Request) -> CoroutineTask:
619
+ """Execute a request in current event loop.
620
+
621
+ Args:
622
+ request: The request to execute.
623
+
624
+ Returns:
625
+ A CoroutineTask handle to operate the background task.
626
+ """
627
+ task = asyncio.create_task(_execute_request_coroutine(request))
628
+ return CoroutineTask(task)
629
+
630
+
631
+ def _execute_with_config_override(func: Callable,
632
+ request_body: payloads.RequestBody,
633
+ request_id: str, request_name: str,
634
+ **kwargs) -> Any:
635
+ """Execute a function with env and config override inside a thread."""
636
+ # Override the environment and config within this thread's context,
637
+ # which gets copied when we call to_thread.
638
+ with override_request_env_and_config(request_body, request_id,
639
+ request_name):
640
+ return func(**kwargs)
641
+
642
+
643
+ async def _execute_request_coroutine(request: api_requests.Request):
644
+ """Execute a request in current event loop.
645
+
646
+ Similar to _request_execution_wrapper, but executed as coroutine in current
647
+ event loop. This is designed for executing tasks that are not CPU
648
+ intensive, e.g. sky logs.
649
+ """
650
+ context.initialize()
651
+ ctx = context.get()
652
+ assert ctx is not None, 'Context is not initialized'
653
+ logger.info(f'Executing request {request.request_id} in coroutine')
654
+ func = request.entrypoint
655
+ request_body = request.request_body
656
+ await api_requests.update_status_async(request.request_id,
657
+ api_requests.RequestStatus.RUNNING)
658
+ # Redirect stdout and stderr to the request log path.
659
+ original_output = ctx.redirect_log(request.log_path)
660
+ try:
661
+ fut: asyncio.Future = context_utils.to_thread_with_executor(
662
+ get_request_thread_executor(), _execute_with_config_override, func,
663
+ request_body, request.request_id, request.name,
664
+ **request_body.to_kwargs())
665
+ except Exception as e: # pylint: disable=broad-except
666
+ ctx.redirect_log(original_output)
667
+ await api_requests.set_request_failed_async(request.request_id, e)
668
+ logger.error(f'Failed to run request {request.request_id} due to '
669
+ f'{common_utils.format_exception(e)}')
670
+ return
671
+
672
+ async def poll_task(request_id: str) -> bool:
673
+ req_status = await api_requests.get_request_status_async(request_id)
674
+ if req_status is None:
675
+ raise RuntimeError('Request not found')
676
+
677
+ if req_status.status == api_requests.RequestStatus.CANCELLED:
678
+ ctx.cancel()
679
+ return True
680
+
681
+ if fut.done():
682
+ try:
683
+ result = await fut
684
+ await api_requests.set_request_succeeded_async(
685
+ request_id, result)
686
+ except asyncio.CancelledError:
687
+ # The task is cancelled by ctx.cancel(), where the status
688
+ # should already be set to CANCELLED.
689
+ pass
690
+ except Exception as e: # pylint: disable=broad-except
691
+ ctx.redirect_log(original_output)
692
+ await api_requests.set_request_failed_async(request_id, e)
693
+ logger.error(f'Request {request_id} failed due to '
694
+ f'{common_utils.format_exception(e)}')
695
+ return True
696
+ return False
697
+
698
+ try:
699
+ while True:
700
+ res = await poll_task(request.request_id)
701
+ if res:
702
+ break
703
+ await asyncio.sleep(0.5)
704
+ except asyncio.CancelledError:
705
+ # Current coroutine is cancelled due to client disconnect, set the
706
+ # request status for consistency.
707
+ await api_requests.set_request_cancelled_async(request.request_id)
708
+ pass
709
+ # pylint: disable=broad-except
710
+ except (Exception, KeyboardInterrupt, SystemExit) as e:
711
+ # Handle any other error
712
+ ctx.redirect_log(original_output)
713
+ await api_requests.set_request_failed_async(request.request_id, e)
714
+ logger.error(f'Request {request.request_id} interrupted due to '
715
+ f'unhandled exception: {common_utils.format_exception(e)}')
716
+ raise
717
+ finally:
718
+ # Always cancel the context to kill potentially running background
719
+ # routine.
720
+ ctx.cancel()
721
+
722
+
723
+ async def prepare_request_async(
724
+ request_id: str,
725
+ request_name: request_names.RequestName,
726
+ request_body: payloads.RequestBody,
727
+ func: Callable[P, Any],
728
+ request_cluster_name: Optional[str] = None,
729
+ schedule_type: api_requests.ScheduleType = (api_requests.ScheduleType.LONG),
730
+ is_skypilot_system: bool = False,
731
+ ) -> api_requests.Request:
732
+ """Prepare a request for execution."""
733
+ user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
734
+ if is_skypilot_system:
735
+ user_id = constants.SKYPILOT_SYSTEM_USER_ID
736
+ global_user_state.add_or_update_user(
737
+ models.User(id=user_id, name=user_id))
738
+ request = api_requests.Request(request_id=request_id,
739
+ name=server_constants.REQUEST_NAME_PREFIX +
740
+ request_name,
741
+ entrypoint=func,
742
+ request_body=request_body,
743
+ status=api_requests.RequestStatus.PENDING,
744
+ created_at=time.time(),
745
+ schedule_type=schedule_type,
746
+ user_id=user_id,
747
+ cluster_name=request_cluster_name)
748
+
749
+ if not await api_requests.create_if_not_exists_async(request):
750
+ raise exceptions.RequestAlreadyExistsError(
751
+ f'Request {request_id} already exists.')
752
+
753
+ request.log_path.touch()
754
+ return request
755
+
756
+
757
+ async def schedule_request_async(request_id: str,
758
+ request_name: request_names.RequestName,
759
+ request_body: payloads.RequestBody,
760
+ func: Callable[P, Any],
761
+ request_cluster_name: Optional[str] = None,
762
+ ignore_return_value: bool = False,
763
+ schedule_type: api_requests.ScheduleType = (
764
+ api_requests.ScheduleType.LONG),
765
+ is_skypilot_system: bool = False,
766
+ precondition: Optional[
767
+ preconditions.Precondition] = None,
768
+ retryable: bool = False) -> None:
355
769
  """Enqueue a request to the request queue.
356
770
 
357
771
  Args:
@@ -372,32 +786,37 @@ def schedule_request(
372
786
  The precondition is waited asynchronously and does not block the
373
787
  caller.
374
788
  """
375
- user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
376
- if is_skypilot_system:
377
- user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
378
- global_user_state.add_or_update_user(
379
- models.User(id=user_id, name=user_id))
380
- request = api_requests.Request(request_id=request_id,
381
- name=server_constants.REQUEST_NAME_PREFIX +
382
- request_name,
383
- entrypoint=func,
384
- request_body=request_body,
385
- status=api_requests.RequestStatus.PENDING,
386
- created_at=time.time(),
387
- schedule_type=schedule_type,
388
- user_id=user_id,
389
- cluster_name=request_cluster_name)
390
-
391
- if not api_requests.create_if_not_exists(request):
392
- logger.debug(f'Request {request_id} already exists.')
393
- return
789
+ request_task = await prepare_request_async(request_id, request_name,
790
+ request_body, func,
791
+ request_cluster_name,
792
+ schedule_type,
793
+ is_skypilot_system)
794
+ schedule_prepared_request(request_task, ignore_return_value, precondition,
795
+ retryable)
796
+
797
+
798
+ def schedule_prepared_request(request_task: api_requests.Request,
799
+ ignore_return_value: bool = False,
800
+ precondition: Optional[
801
+ preconditions.Precondition] = None,
802
+ retryable: bool = False) -> None:
803
+ """Enqueue a request to the request queue
394
804
 
395
- request.log_path.touch()
805
+ Args:
806
+ request_task: The prepared request task to schedule.
807
+ ignore_return_value: If True, the return value of the function will be
808
+ ignored.
809
+ precondition: If a precondition is provided, the request will only be
810
+ scheduled for execution when the precondition is met (returns True).
811
+ The precondition is waited asynchronously and does not block the
812
+ caller.
813
+ retryable: Whether the request should be retried if it fails.
814
+ """
396
815
 
397
816
  def enqueue():
398
- input_tuple = (request_id, ignore_return_value)
399
- logger.info(f'Queuing request: {request_id}')
400
- _get_queue(schedule_type).put(input_tuple)
817
+ input_tuple = (request_task.request_id, ignore_return_value, retryable)
818
+ logger.info(f'Queuing request: {request_task.request_id}')
819
+ _get_queue(request_task.schedule_type).put(input_tuple)
401
820
 
402
821
  if precondition is not None:
403
822
  # Wait async to avoid blocking caller.
@@ -406,15 +825,21 @@ def schedule_request(
406
825
  enqueue()
407
826
 
408
827
 
409
- def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
828
+ def start(
829
+ config: server_config.ServerConfig
830
+ ) -> Tuple[Optional[multiprocessing.Process], List[RequestWorker]]:
410
831
  """Start the request workers.
411
832
 
412
833
  Request workers run in background, schedule the requests and delegate the
413
834
  request execution to executor processes.
835
+
836
+ Returns:
837
+ A tuple of the queue server process and the list of request worker
838
+ threads.
414
839
  """
415
840
  global queue_backend
416
841
  queue_backend = config.queue_backend
417
- sub_procs = []
842
+ queue_server = None
418
843
  # Setup the queues.
419
844
  if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
420
845
  logger.info('Creating shared request queues')
@@ -431,7 +856,6 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
431
856
  queue_server = multiprocessing.Process(
432
857
  target=mp_queue.start_queue_manager, args=(queue_names, port))
433
858
  queue_server.start()
434
- sub_procs.append(queue_server)
435
859
  mp_queue.wait_for_queues_to_be_ready(queue_names,
436
860
  queue_server,
437
861
  port=port)
@@ -444,20 +868,16 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
444
868
 
445
869
  logger.info('Request queues created')
446
870
 
447
- def run_worker_in_background(worker: RequestWorker):
448
- # Thread dispatcher is sufficient for current scale, refer to
449
- # tests/load_tests/test_queue_dispatcher.py for more details.
450
- # Use daemon thread for automatic cleanup.
451
- thread = threading.Thread(target=worker.run, daemon=True)
452
- thread.start()
453
-
871
+ workers = []
454
872
  # Start a worker for long requests.
455
873
  long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
456
874
  config=config.long_worker_config)
457
- run_worker_in_background(long_worker)
875
+ long_worker.run_in_background()
876
+ workers.append(long_worker)
458
877
 
459
878
  # Start a worker for short requests.
460
879
  short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
461
880
  config=config.short_worker_config)
462
- run_worker_in_background(short_worker)
463
- return sub_procs
881
+ short_worker.run_in_background()
882
+ workers.append(short_worker)
883
+ return queue_server, workers