skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -18,6 +18,8 @@ The number of the workers is determined by the system resources.
18
18
 
19
19
  See the [README.md](../README.md) for detailed architecture of the executor.
20
20
  """
21
+ import asyncio
22
+ import concurrent.futures
21
23
  import contextlib
22
24
  import multiprocessing
23
25
  import os
@@ -29,26 +31,38 @@ import time
29
31
  import typing
30
32
  from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
31
33
 
34
+ import psutil
32
35
  import setproctitle
33
36
 
37
+ from sky import exceptions
34
38
  from sky import global_user_state
35
39
  from sky import models
36
40
  from sky import sky_logging
37
41
  from sky import skypilot_config
42
+ from sky.metrics import utils as metrics_utils
38
43
  from sky.server import common as server_common
39
44
  from sky.server import config as server_config
40
45
  from sky.server import constants as server_constants
46
+ from sky.server import metrics as metrics_lib
41
47
  from sky.server.requests import payloads
42
48
  from sky.server.requests import preconditions
43
49
  from sky.server.requests import process
50
+ from sky.server.requests import request_names
44
51
  from sky.server.requests import requests as api_requests
52
+ from sky.server.requests import threads
45
53
  from sky.server.requests.queues import local_queue
46
54
  from sky.server.requests.queues import mp_queue
47
55
  from sky.skylet import constants
48
56
  from sky.utils import annotations
49
57
  from sky.utils import common_utils
58
+ from sky.utils import context
59
+ from sky.utils import context_utils
50
60
  from sky.utils import subprocess_utils
61
+ from sky.utils import tempstore
51
62
  from sky.utils import timeline
63
+ from sky.utils import yaml_utils
64
+ from sky.utils.db import db_utils
65
+ from sky.workspaces import core as workspaces_core
52
66
 
53
67
  if typing.TYPE_CHECKING:
54
68
  import types
@@ -60,7 +74,6 @@ else:
60
74
  from typing_extensions import ParamSpec
61
75
 
62
76
  P = ParamSpec('P')
63
-
64
77
  logger = sky_logging.init_logger(__name__)
65
78
 
66
79
  # On macOS, the default start method for multiprocessing is 'fork', which
@@ -70,6 +83,31 @@ logger = sky_logging.init_logger(__name__)
70
83
  # platforms, including macOS.
71
84
  multiprocessing.set_start_method('spawn', force=True)
72
85
 
86
+ # An upper limit on request-execution threads per server process. It is
87
+ # unlikely to be reached, so it allows higher concurrency while still
88
+ # preventing the server process from becoming overloaded.
89
+ _REQUEST_THREADS_LIMIT = 128
90
+
91
+ _REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
92
+ # A dedicated thread pool executor for executing synchronous requests inside
93
+ # coroutines, to avoid:
94
+ # 1. blocking the event loop;
95
+ # 2. exhausting the default thread pool executor of the event loop.
96
+ _REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
97
+
98
+
99
+ def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
100
+ """Lazy init and return the request thread executor for current process."""
101
+ global _REQUEST_THREAD_EXECUTOR
102
+ if _REQUEST_THREAD_EXECUTOR is not None:
103
+ return _REQUEST_THREAD_EXECUTOR
104
+ with _REQUEST_THREAD_EXECUTOR_LOCK:
105
+ if _REQUEST_THREAD_EXECUTOR is None:
106
+ _REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
107
+ name='request_thread_executor',
108
+ max_workers=_REQUEST_THREADS_LIMIT)
109
+ return _REQUEST_THREAD_EXECUTOR
110
+
73
111
 
74
112
  class RequestQueue:
75
113
  """The queue for the requests, either redis or multiprocessing.
@@ -89,21 +127,21 @@ class RequestQueue:
89
127
  else:
90
128
  raise RuntimeError(f'Invalid queue backend: {backend}')
91
129
 
92
- def put(self, request: Tuple[str, bool]) -> None:
130
+ def put(self, request: Tuple[str, bool, bool]) -> None:
93
131
  """Put and request to the queue.
94
132
 
95
133
  Args:
96
- request: A tuple of request_id and ignore_return_value.
134
+ request: A tuple of request_id, ignore_return_value, and retryable.
97
135
  """
98
136
  self.queue.put(request) # type: ignore
99
137
 
100
- def get(self) -> Optional[Tuple[str, bool]]:
138
+ def get(self) -> Optional[Tuple[str, bool, bool]]:
101
139
  """Get a request from the queue.
102
140
 
103
141
  It is non-blocking if the queue is empty, and returns None.
104
142
 
105
143
  Returns:
106
- A tuple of request_id and ignore_return_value.
144
+ A tuple of request_id, ignore_return_value, and retryable.
107
145
  """
108
146
  try:
109
147
  return self.queue.get(block=False)
@@ -121,6 +159,10 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
121
159
  def executor_initializer(proc_group: str):
122
160
  setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
123
161
  f'{multiprocessing.current_process().pid}')
162
+ # Executor never stops, unless the whole process is killed.
163
+ threading.Thread(target=metrics_lib.process_monitor,
164
+ args=(f'worker:{proc_group}', threading.Event()),
165
+ daemon=True).start()
124
166
 
125
167
 
126
168
  class RequestWorker:
@@ -144,10 +186,27 @@ class RequestWorker:
144
186
  self.schedule_type = schedule_type
145
187
  self.garanteed_parallelism = config.garanteed_parallelism
146
188
  self.burstable_parallelism = config.burstable_parallelism
189
+ self.num_db_connections_per_worker = (
190
+ config.num_db_connections_per_worker)
191
+ self._thread: Optional[threading.Thread] = None
192
+ self._cancel_event = threading.Event()
147
193
 
148
194
  def __str__(self) -> str:
149
195
  return f'Worker(schedule_type={self.schedule_type.value})'
150
196
 
197
+ def run_in_background(self) -> None:
198
+ # Thread dispatcher is sufficient for current scale, refer to
199
+ # tests/load_tests/test_queue_dispatcher.py for more details.
200
+ # Use daemon thread for automatic cleanup.
201
+ thread = threading.Thread(target=self.run, daemon=True)
202
+ thread.start()
203
+ self._thread = thread
204
+
205
+ def cancel(self) -> None:
206
+ if self._thread is not None:
207
+ self._cancel_event.set()
208
+ self._thread.join()
209
+
151
210
  def process_request(self, executor: process.BurstableExecutor,
152
211
  queue: RequestQueue) -> None:
153
212
  try:
@@ -155,11 +214,12 @@ class RequestWorker:
155
214
  if request_element is None:
156
215
  time.sleep(0.1)
157
216
  return
158
- request_id, ignore_return_value = request_element
159
- request = api_requests.get_request(request_id)
217
+ request_id, ignore_return_value, _ = request_element
218
+ request = api_requests.get_request(request_id, fields=['status'])
160
219
  assert request is not None, f'Request with ID {request_id} is None'
161
220
  if request.status == api_requests.RequestStatus.CANCELLED:
162
221
  return
222
+ del request
163
223
  logger.info(f'[{self}] Submitting request: {request_id}')
164
224
  # Start additional process to run the request, so that it can be
165
225
  # cancelled when requested by a user.
@@ -167,8 +227,13 @@ class RequestWorker:
167
227
  # multiple requests can share the same process pid, which may cause
168
228
  # issues with SkyPilot core functions if they rely on the exit of
169
229
  # the process, such as subprocess_daemon.py.
170
- executor.submit_until_success(_request_execution_wrapper,
171
- request_id, ignore_return_value)
230
+ fut = executor.submit_until_success(
231
+ _request_execution_wrapper, request_id, ignore_return_value,
232
+ self.num_db_connections_per_worker)
233
+ # Monitor the result of the request execution.
234
+ threading.Thread(target=self.handle_task_result,
235
+ args=(fut, request_element),
236
+ daemon=True).start()
172
237
 
173
238
  logger.info(f'[{self}] Submitted request: {request_id}')
174
239
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
@@ -178,6 +243,31 @@ class RequestWorker:
178
243
  f'{request_id if "request_id" in locals() else ""} '
179
244
  f'{common_utils.format_exception(e, use_bracket=True)}')
180
245
 
246
+ def handle_task_result(self, fut: concurrent.futures.Future,
247
+ request_element: Tuple[str, bool, bool]) -> None:
248
+ try:
249
+ fut.result()
250
+ except concurrent.futures.process.BrokenProcessPool as e:
251
+ # Happens when the worker process dies unexpectedly, e.g. OOM
252
+ # killed.
253
+ request_id, _, retryable = request_element
254
+ # Ensure the request status.
255
+ api_requests.set_request_failed(request_id, e)
256
+ logger.error(
257
+ f'Request {request_id} failed to get processed '
258
+ f'{common_utils.format_exception(e, use_bracket=True)}')
259
+ if retryable:
260
+ # If the request is retryable and disrupted by broken
261
+ # process pool, reschedule it immediately to get it
262
+ # retried in the new process pool.
263
+ queue = _get_queue(self.schedule_type)
264
+ queue.put(request_element)
265
+ except exceptions.ExecutionRetryableError as e:
266
+ time.sleep(e.retry_wait_seconds)
267
+ # Reschedule the request.
268
+ queue = _get_queue(self.schedule_type)
269
+ queue.put(request_element)
270
+
181
271
  def run(self) -> None:
182
272
  # Handle the SIGTERM signal to abort the executor process gracefully.
183
273
  proc_group = f'{self.schedule_type.value}'
@@ -198,7 +288,7 @@ class RequestWorker:
198
288
  burst_workers=self.burstable_parallelism,
199
289
  initializer=executor_initializer,
200
290
  initargs=(proc_group,))
201
- while True:
291
+ while not self._cancel_event.is_set():
202
292
  self.process_request(executor, queue)
203
293
  # TODO(aylei): better to distinguish between KeyboardInterrupt and SIGTERM.
204
294
  except KeyboardInterrupt:
@@ -221,22 +311,56 @@ def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
221
311
 
222
312
  @contextlib.contextmanager
223
313
  def override_request_env_and_config(
224
- request_body: payloads.RequestBody) -> Generator[None, None, None]:
314
+ request_body: payloads.RequestBody, request_id: str,
315
+ request_name: str) -> Generator[None, None, None]:
225
316
  """Override the environment and SkyPilot config for a request."""
226
317
  original_env = os.environ.copy()
227
- os.environ.update(request_body.env_vars)
228
- user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
229
- name=request_body.env_vars[constants.USER_ENV_VAR])
230
- global_user_state.add_or_update_user(user)
231
- # Force color to be enabled.
232
- os.environ['CLICOLOR_FORCE'] = '1'
233
- server_common.reload_for_new_request(
234
- client_entrypoint=request_body.entrypoint,
235
- client_command=request_body.entrypoint_command,
236
- using_remote_api_server=request_body.using_remote_api_server)
237
318
  try:
319
+ # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API
320
+ # server affecting client requests. If set on the client side, it will
321
+ # be overridden by the request body.
322
+ os.environ.pop('SKYPILOT_DEBUG', None)
323
+ # Remove the db connection uri from client supplied env vars, as the
324
+ # client should not set the db string on server side.
325
+ request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
326
+ os.environ.update(request_body.env_vars)
327
+ # Note: may be overridden by AuthProxyMiddleware.
328
+ # TODO(zhwu): we need to make the entire request a context available to
329
+ # the entire request execution, so that we can access info like user
330
+ # through the execution.
331
+ user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
332
+ name=request_body.env_vars[constants.USER_ENV_VAR])
333
+ _, user = global_user_state.add_or_update_user(user, return_user=True)
334
+
335
+ # Force color to be enabled.
336
+ os.environ['CLICOLOR_FORCE'] = '1'
337
+ server_common.reload_for_new_request(
338
+ client_entrypoint=request_body.entrypoint,
339
+ client_command=request_body.entrypoint_command,
340
+ using_remote_api_server=request_body.using_remote_api_server,
341
+ user=user,
342
+ request_id=request_id)
343
+ logger.debug(
344
+ f'override path: {request_body.override_skypilot_config_path}')
238
345
  with skypilot_config.override_skypilot_config(
239
- request_body.override_skypilot_config):
346
+ request_body.override_skypilot_config,
347
+ request_body.override_skypilot_config_path):
348
+ # Skip permission check for sky.workspaces.get request
349
+ # as it is used to determine which workspaces the user
350
+ # has access to.
351
+ if request_name != 'sky.workspaces.get':
352
+ try:
353
+ # Reject requests that the user does not have permission
354
+ # to access.
355
+ workspaces_core.reject_request_for_unauthorized_workspace(
356
+ user)
357
+ except exceptions.PermissionDeniedError as e:
358
+ logger.debug(
359
+ f'{request_id} permission denied to workspace: '
360
+ f'{skypilot_config.get_active_workspace()}: {e}')
361
+ raise e
362
+ logger.debug(
363
+ f'{request_id} permission granted to {request_name} request')
240
364
  yield
241
365
  finally:
242
366
  # We need to call the save_timeline() since atexit will not be
@@ -250,35 +374,13 @@ def override_request_env_and_config(
250
374
  os.environ.update(original_env)
251
375
 
252
376
 
253
- def _redirect_output(file: TextIO) -> Tuple[int, int]:
254
- """Redirect stdout and stderr to the log file."""
255
- fd = file.fileno() # Get the file descriptor from the file object
256
- # Store copies of the original stdout and stderr file descriptors
257
- original_stdout = os.dup(sys.stdout.fileno())
258
- original_stderr = os.dup(sys.stderr.fileno())
259
-
260
- # Copy this fd to stdout and stderr
261
- os.dup2(fd, sys.stdout.fileno())
262
- os.dup2(fd, sys.stderr.fileno())
263
- return original_stdout, original_stderr
264
-
265
-
266
- def _restore_output(original_stdout: int, original_stderr: int) -> None:
267
- """Restore stdout and stderr to their original file descriptors."""
268
- os.dup2(original_stdout, sys.stdout.fileno())
269
- os.dup2(original_stderr, sys.stderr.fileno())
270
-
271
- # Close the duplicate file descriptors
272
- os.close(original_stdout)
273
- os.close(original_stderr)
274
-
275
-
276
377
  def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
277
378
  raise KeyboardInterrupt
278
379
 
279
380
 
280
381
  def _request_execution_wrapper(request_id: str,
281
- ignore_return_value: bool) -> None:
382
+ ignore_return_value: bool,
383
+ num_db_connections_per_worker: int = 0) -> None:
282
384
  """Wrapper for a request execution.
283
385
 
284
386
  It wraps the execution of a request to:
@@ -287,71 +389,353 @@ def _request_execution_wrapper(request_id: str,
287
389
  2. Update the request status based on the execution result;
288
390
  3. Redirect the stdout and stderr of the execution to log file;
289
391
  4. Handle the SIGTERM signal to abort the request gracefully.
392
+ 5. Maintain the lifecycle of the temp dir used by the request.
290
393
  """
394
+ pid = multiprocessing.current_process().pid
395
+ proc = psutil.Process(pid)
396
+ rss_begin = proc.memory_info().rss
397
+ db_utils.set_max_connections(num_db_connections_per_worker)
291
398
  # Handle the SIGTERM signal to abort the request processing gracefully.
292
- signal.signal(signal.SIGTERM, _sigterm_handler)
399
+ # Only set up signal handlers in the main thread, as signal.signal() raises
400
+ # ValueError if called from a non-main thread (e.g., in tests).
401
+ if threading.current_thread() is threading.main_thread():
402
+ signal.signal(signal.SIGTERM, _sigterm_handler)
293
403
 
294
- pid = multiprocessing.current_process().pid
295
404
  logger.info(f'Running request {request_id} with pid {pid}')
296
- with api_requests.update_request(request_id) as request_task:
297
- assert request_task is not None, request_id
298
- log_path = request_task.log_path
299
- request_task.pid = pid
300
- request_task.status = api_requests.RequestStatus.RUNNING
301
- func = request_task.entrypoint
302
- request_body = request_task.request_body
303
-
304
- with log_path.open('w', encoding='utf-8') as f:
405
+
406
+ original_stdout = original_stderr = None
407
+
408
+ def _save_current_output() -> None:
409
+ """Save the current stdout and stderr file descriptors."""
410
+ nonlocal original_stdout, original_stderr
411
+ original_stdout = os.dup(sys.stdout.fileno())
412
+ original_stderr = os.dup(sys.stderr.fileno())
413
+
414
+ def _redirect_output(file: TextIO) -> None:
415
+ """Redirect stdout and stderr to the log file."""
416
+ # Get the file descriptor from the file object
417
+ fd = file.fileno()
418
+ # Copy this fd to stdout and stderr
419
+ os.dup2(fd, sys.stdout.fileno())
420
+ os.dup2(fd, sys.stderr.fileno())
421
+
422
+ def _restore_output() -> None:
423
+ """Restore stdout and stderr to their original file descriptors."""
424
+ nonlocal original_stdout, original_stderr
425
+ if original_stdout is not None:
426
+ os.dup2(original_stdout, sys.stdout.fileno())
427
+ os.close(original_stdout)
428
+ original_stdout = None
429
+
430
+ if original_stderr is not None:
431
+ os.dup2(original_stderr, sys.stderr.fileno())
432
+ os.close(original_stderr)
433
+ original_stderr = None
434
+
435
+ request_name = None
436
+ try:
437
+ # As soon as the request is updated with the executor PID, we can
438
+ # receive SIGTERM from cancellation. So, we update the request inside
439
+ # the try block to ensure we have the KeyboardInterrupt handling.
440
+ with api_requests.update_request(request_id) as request_task:
441
+ assert request_task is not None, request_id
442
+ if request_task.status != api_requests.RequestStatus.PENDING:
443
+ logger.debug(f'Request is already {request_task.status.value}, '
444
+ f'skipping execution')
445
+ return
446
+ log_path = request_task.log_path
447
+ request_task.pid = pid
448
+ request_task.status = api_requests.RequestStatus.RUNNING
449
+ func = request_task.entrypoint
450
+ request_body = request_task.request_body
451
+ request_name = request_task.name
452
+
305
453
  # Store copies of the original stdout and stderr file descriptors
306
- original_stdout, original_stderr = _redirect_output(f)
307
- # Redirect the stdout/stderr before overriding the environment and
308
- # config, as there can be some logs during override that needs to be
309
- # captured in the log file.
310
- try:
311
- with override_request_env_and_config(request_body):
454
+ # We do this in two steps because we should make sure to restore the
455
+ # original values even if we are cancelled or fail during the redirect.
456
+ _save_current_output()
457
+
458
+ # Append to the log file instead of overwriting it since there might be
459
+ # logs from previous retries.
460
+ with log_path.open('a', encoding='utf-8') as f:
461
+ # Redirect the stdout/stderr before overriding the environment and
462
+ # config, as there can be some logs during override that need to be
463
+ # captured in the log file.
464
+ _redirect_output(f)
465
+
466
+ with sky_logging.add_debug_log_handler(request_id), \
467
+ override_request_env_and_config(
468
+ request_body, request_id, request_name), \
469
+ tempstore.tempdir():
312
470
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
313
471
  config = skypilot_config.to_dict()
314
472
  logger.debug(f'request config: \n'
315
- f'{common_utils.dump_yaml_str(dict(config))}')
316
- return_value = func(**request_body.to_kwargs())
473
+ f'{yaml_utils.dump_yaml_str(dict(config))}')
474
+ (metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
475
+ labels(request=request_name, pid=pid).inc())
476
+ with metrics_utils.time_it(name=request_name,
477
+ group='request_execution'):
478
+ return_value = func(**request_body.to_kwargs())
317
479
  f.flush()
318
- except KeyboardInterrupt:
319
- logger.info(f'Request {request_id} cancelled by user')
320
- # Kill all children processes related to this request.
321
- # Each executor handles a single request, so we can safely kill all
322
- # children processes related to this request.
323
- # This is required as python does not pass the KeyboardInterrupt
324
- # to the threads that are not main thread.
325
- subprocess_utils.kill_children_processes()
326
- _restore_output(original_stdout, original_stderr)
327
- return
328
- except (Exception, SystemExit) as e: # pylint: disable=broad-except
329
- api_requests.set_request_failed(request_id, e)
330
- _restore_output(original_stdout, original_stderr)
331
- logger.info(f'Request {request_id} failed due to '
332
- f'{common_utils.format_exception(e)}')
333
- return
334
- else:
335
- with api_requests.update_request(request_id) as request_task:
336
- assert request_task is not None, request_id
337
- request_task.status = api_requests.RequestStatus.SUCCEEDED
338
- if not ignore_return_value:
339
- request_task.set_return_value(return_value)
340
- _restore_output(original_stdout, original_stderr)
341
- logger.info(f'Request {request_id} finished')
342
-
343
-
344
- def schedule_request(
345
- request_id: str,
346
- request_name: str,
347
- request_body: payloads.RequestBody,
348
- func: Callable[P, Any],
349
- request_cluster_name: Optional[str] = None,
350
- ignore_return_value: bool = False,
351
- schedule_type: api_requests.ScheduleType = (
352
- api_requests.ScheduleType.LONG),
353
- is_skypilot_system: bool = False,
354
- precondition: Optional[preconditions.Precondition] = None) -> None:
480
+ except KeyboardInterrupt:
481
+ logger.info(f'Request {request_id} cancelled by user')
482
+ # Kill all children processes related to this request.
483
+ # Each executor handles a single request, so we can safely kill all
484
+ # children processes related to this request.
485
+ # This is required as python does not pass the KeyboardInterrupt to the
486
+ # threads that are not main thread.
487
+ subprocess_utils.kill_children_processes()
488
+ return
489
+ except exceptions.ExecutionRetryableError as e:
490
+ logger.error(e)
491
+ logger.info(e.hint)
492
+ with api_requests.update_request(request_id) as request_task:
493
+ assert request_task is not None, request_id
494
+ # Retried request will undergo rescheduling and a new execution,
495
+ # clear the pid of the request.
496
+ request_task.pid = None
497
+ # Yield control to the scheduler for uniform handling of retries.
498
+ _restore_output()
499
+ raise
500
+ except (Exception, SystemExit) as e: # pylint: disable=broad-except
501
+ api_requests.set_request_failed(request_id, e)
502
+ # Manually reset the original stdout and stderr file descriptors early
503
+ # so that the "Request xxxx failed due to ..." log message will be
504
+ # written to the original stdout and stderr file descriptors.
505
+ _restore_output()
506
+ logger.info(f'Request {request_id} failed due to '
507
+ f'{common_utils.format_exception(e)}')
508
+ return
509
+ else:
510
+ api_requests.set_request_succeeded(
511
+ request_id, return_value if not ignore_return_value else None)
512
+ # Manually reset the original stdout and stderr file descriptors early
513
+ # so that the "Request xxxx finished" log message will be
514
+ # written to the original stdout and stderr file descriptors.
515
+ _restore_output()
516
+ logger.info(f'Request {request_id} finished')
517
+ finally:
518
+ _restore_output()
519
+ try:
520
+ # Capture the peak RSS before GC.
521
+ peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
522
+ # Clear request level cache to release all memory used by the
523
+ # request.
524
+ annotations.clear_request_level_cache()
525
+ with metrics_utils.time_it(name='release_memory', group='internal'):
526
+ common_utils.release_memory()
527
+ if request_name is not None:
528
+ _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
529
+ except Exception as e: # pylint: disable=broad-except
530
+ logger.error(f'Failed to record memory metrics: '
531
+ f'{common_utils.format_exception(e)}')
532
+
533
+
534
+ _first_request = True
535
+
536
+
537
+ def _record_memory_metrics(request_name: str, proc: psutil.Process,
538
+ rss_begin: int, peak_rss: int) -> None:
539
+ """Record the memory metrics for a request."""
540
+ # Do not record full memory delta for the first request as it
541
+ # will load the sky core modules and make the memory usage
542
+ # estimation inaccurate.
543
+ global _first_request
544
+ if _first_request:
545
+ _first_request = False
546
+ return
547
+ rss_end = proc.memory_info().rss
548
+
549
+ # Answer "how much RSS this request contributed?"
550
+ metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
551
+ name=request_name).observe(max(rss_end - rss_begin, 0))
552
+ # Estimate the memory usage by the request by capturing the
553
+ # peak memory delta during the request execution.
554
+ metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
555
+ name=request_name).observe(max(peak_rss - rss_begin, 0))
556
+
557
+
558
+ class CoroutineTask:
559
+ """Wrapper of a background task that runs in a coroutine."""
560
+
561
+ def __init__(self, task: asyncio.Task):
562
+ self.task = task
563
+
564
+ async def cancel(self):
565
+ try:
566
+ self.task.cancel()
567
+ await self.task
568
+ except asyncio.CancelledError:
569
+ pass
570
+
571
+
572
+ def check_request_thread_executor_available() -> None:
573
+ """Check if the request thread executor is available.
574
+
575
+ This is a best effort check to hint the client to retry other server
576
+ processes when there is no available thread worker in current one. But
577
+ a request may pass this check and still cannot get worker on execution
578
+ time due to race condition. In this case, the client will see a failed
579
+ request instead of retry.
580
+
581
+ TODO(aylei): this can be refined with a refactor of our coroutine
582
+ execution flow.
583
+ """
584
+ get_request_thread_executor().check_available()
585
+
586
+
587
+ def execute_request_in_coroutine(
588
+ request: api_requests.Request) -> CoroutineTask:
589
+ """Execute a request in current event loop.
590
+
591
+ Args:
592
+ request: The request to execute.
593
+
594
+ Returns:
595
+ A CoroutineTask handle to operate the background task.
596
+ """
597
+ task = asyncio.create_task(_execute_request_coroutine(request))
598
+ return CoroutineTask(task)
599
+
600
+
601
+ def _execute_with_config_override(func: Callable,
602
+ request_body: payloads.RequestBody,
603
+ request_id: str, request_name: str,
604
+ **kwargs) -> Any:
605
+ """Execute a function with env and config override inside a thread."""
606
+ # Override the environment and config within this thread's context,
607
+ # which gets copied when we call to_thread.
608
+ with override_request_env_and_config(request_body, request_id,
609
+ request_name):
610
+ return func(**kwargs)
611
+
612
+
613
+ async def _execute_request_coroutine(request: api_requests.Request):
614
+ """Execute a request in current event loop.
615
+
616
+ Similar to _request_execution_wrapper, but executed as coroutine in current
617
+ event loop. This is designed for executing tasks that are not CPU
618
+ intensive, e.g. sky logs.
619
+ """
620
+ context.initialize()
621
+ ctx = context.get()
622
+ assert ctx is not None, 'Context is not initialized'
623
+ logger.info(f'Executing request {request.request_id} in coroutine')
624
+ func = request.entrypoint
625
+ request_body = request.request_body
626
+ await api_requests.update_status_async(request.request_id,
627
+ api_requests.RequestStatus.RUNNING)
628
+ # Redirect stdout and stderr to the request log path.
629
+ original_output = ctx.redirect_log(request.log_path)
630
+ try:
631
+ fut: asyncio.Future = context_utils.to_thread_with_executor(
632
+ get_request_thread_executor(), _execute_with_config_override, func,
633
+ request_body, request.request_id, request.name,
634
+ **request_body.to_kwargs())
635
+ except Exception as e: # pylint: disable=broad-except
636
+ ctx.redirect_log(original_output)
637
+ await api_requests.set_request_failed_async(request.request_id, e)
638
+ logger.error(f'Failed to run request {request.request_id} due to '
639
+ f'{common_utils.format_exception(e)}')
640
+ return
641
+
642
+ async def poll_task(request_id: str) -> bool:
643
+ req_status = await api_requests.get_request_status_async(request_id)
644
+ if req_status is None:
645
+ raise RuntimeError('Request not found')
646
+
647
+ if req_status.status == api_requests.RequestStatus.CANCELLED:
648
+ ctx.cancel()
649
+ return True
650
+
651
+ if fut.done():
652
+ try:
653
+ result = await fut
654
+ await api_requests.set_request_succeeded_async(
655
+ request_id, result)
656
+ except asyncio.CancelledError:
657
+ # The task is cancelled by ctx.cancel(), where the status
658
+ # should already be set to CANCELLED.
659
+ pass
660
+ except Exception as e: # pylint: disable=broad-except
661
+ ctx.redirect_log(original_output)
662
+ await api_requests.set_request_failed_async(request_id, e)
663
+ logger.error(f'Request {request_id} failed due to '
664
+ f'{common_utils.format_exception(e)}')
665
+ return True
666
+ return False
667
+
668
+ try:
669
+ while True:
670
+ res = await poll_task(request.request_id)
671
+ if res:
672
+ break
673
+ await asyncio.sleep(0.5)
674
+ except asyncio.CancelledError:
675
+ # Current coroutine is cancelled due to client disconnect, set the
676
+ # request status for consistency.
677
+ await api_requests.set_request_cancelled_async(request.request_id)
678
+ pass
679
+ # pylint: disable=broad-except
680
+ except (Exception, KeyboardInterrupt, SystemExit) as e:
681
+ # Handle any other error
682
+ ctx.redirect_log(original_output)
683
+ await api_requests.set_request_failed_async(request.request_id, e)
684
+ logger.error(f'Request {request.request_id} interrupted due to '
685
+ f'unhandled exception: {common_utils.format_exception(e)}')
686
+ raise
687
+ finally:
688
+ # Always cancel the context to kill potentially running background
689
+ # routine.
690
+ ctx.cancel()
691
+
692
+
693
+ async def prepare_request_async(
694
+ request_id: str,
695
+ request_name: request_names.RequestName,
696
+ request_body: payloads.RequestBody,
697
+ func: Callable[P, Any],
698
+ request_cluster_name: Optional[str] = None,
699
+ schedule_type: api_requests.ScheduleType = (api_requests.ScheduleType.LONG),
700
+ is_skypilot_system: bool = False,
701
+ ) -> api_requests.Request:
702
+ """Prepare a request for execution."""
703
+ user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
704
+ if is_skypilot_system:
705
+ user_id = constants.SKYPILOT_SYSTEM_USER_ID
706
+ global_user_state.add_or_update_user(
707
+ models.User(id=user_id, name=user_id))
708
+ request = api_requests.Request(request_id=request_id,
709
+ name=server_constants.REQUEST_NAME_PREFIX +
710
+ request_name,
711
+ entrypoint=func,
712
+ request_body=request_body,
713
+ status=api_requests.RequestStatus.PENDING,
714
+ created_at=time.time(),
715
+ schedule_type=schedule_type,
716
+ user_id=user_id,
717
+ cluster_name=request_cluster_name)
718
+
719
+ if not await api_requests.create_if_not_exists_async(request):
720
+ raise exceptions.RequestAlreadyExistsError(
721
+ f'Request {request_id} already exists.')
722
+
723
+ request.log_path.touch()
724
+ return request
725
+
726
+
727
+ async def schedule_request_async(request_id: str,
728
+ request_name: request_names.RequestName,
729
+ request_body: payloads.RequestBody,
730
+ func: Callable[P, Any],
731
+ request_cluster_name: Optional[str] = None,
732
+ ignore_return_value: bool = False,
733
+ schedule_type: api_requests.ScheduleType = (
734
+ api_requests.ScheduleType.LONG),
735
+ is_skypilot_system: bool = False,
736
+ precondition: Optional[
737
+ preconditions.Precondition] = None,
738
+ retryable: bool = False) -> None:
355
739
  """Enqueue a request to the request queue.
356
740
 
357
741
  Args:
@@ -372,32 +756,37 @@ def schedule_request(
372
756
  The precondition is waited asynchronously and does not block the
373
757
  caller.
374
758
  """
375
- user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
376
- if is_skypilot_system:
377
- user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
378
- global_user_state.add_or_update_user(
379
- models.User(id=user_id, name=user_id))
380
- request = api_requests.Request(request_id=request_id,
381
- name=server_constants.REQUEST_NAME_PREFIX +
382
- request_name,
383
- entrypoint=func,
384
- request_body=request_body,
385
- status=api_requests.RequestStatus.PENDING,
386
- created_at=time.time(),
387
- schedule_type=schedule_type,
388
- user_id=user_id,
389
- cluster_name=request_cluster_name)
759
+ request_task = await prepare_request_async(request_id, request_name,
760
+ request_body, func,
761
+ request_cluster_name,
762
+ schedule_type,
763
+ is_skypilot_system)
764
+ schedule_prepared_request(request_task, ignore_return_value, precondition,
765
+ retryable)
766
+
767
+
768
+ def schedule_prepared_request(request_task: api_requests.Request,
769
+ ignore_return_value: bool = False,
770
+ precondition: Optional[
771
+ preconditions.Precondition] = None,
772
+ retryable: bool = False) -> None:
773
+ """Enqueue a request to the request queue
390
774
 
391
- if not api_requests.create_if_not_exists(request):
392
- logger.debug(f'Request {request_id} already exists.')
393
- return
394
-
395
- request.log_path.touch()
775
+ Args:
776
+ request_task: The prepared request task to schedule.
777
+ ignore_return_value: If True, the return value of the function will be
778
+ ignored.
779
+ precondition: If a precondition is provided, the request will only be
780
+ scheduled for execution when the precondition is met (returns True).
781
+ The precondition is waited asynchronously and does not block the
782
+ caller.
783
+ retryable: Whether to re-queue the request if its executor process dies unexpectedly.
784
+ """
396
785
 
397
786
  def enqueue():
398
- input_tuple = (request_id, ignore_return_value)
399
- logger.info(f'Queuing request: {request_id}')
400
- _get_queue(schedule_type).put(input_tuple)
787
+ input_tuple = (request_task.request_id, ignore_return_value, retryable)
788
+ logger.info(f'Queuing request: {request_task.request_id}')
789
+ _get_queue(request_task.schedule_type).put(input_tuple)
401
790
 
402
791
  if precondition is not None:
403
792
  # Wait async to avoid blocking caller.
@@ -406,15 +795,21 @@ def schedule_request(
406
795
  enqueue()
407
796
 
408
797
 
409
- def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
798
+ def start(
799
+ config: server_config.ServerConfig
800
+ ) -> Tuple[Optional[multiprocessing.Process], List[RequestWorker]]:
410
801
  """Start the request workers.
411
802
 
412
803
  Request workers run in background, schedule the requests and delegate the
413
804
  request execution to executor processes.
805
+
806
+ Returns:
807
+ A tuple of the queue server process and the list of request
808
+ workers, each already running in a background thread.
414
809
  """
415
810
  global queue_backend
416
811
  queue_backend = config.queue_backend
417
- sub_procs = []
812
+ queue_server = None
418
813
  # Setup the queues.
419
814
  if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
420
815
  logger.info('Creating shared request queues')
@@ -431,7 +826,6 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
431
826
  queue_server = multiprocessing.Process(
432
827
  target=mp_queue.start_queue_manager, args=(queue_names, port))
433
828
  queue_server.start()
434
- sub_procs.append(queue_server)
435
829
  mp_queue.wait_for_queues_to_be_ready(queue_names,
436
830
  queue_server,
437
831
  port=port)
@@ -444,20 +838,16 @@ def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
444
838
 
445
839
  logger.info('Request queues created')
446
840
 
447
- def run_worker_in_background(worker: RequestWorker):
448
- # Thread dispatcher is sufficient for current scale, refer to
449
- # tests/load_tests/test_queue_dispatcher.py for more details.
450
- # Use daemon thread for automatic cleanup.
451
- thread = threading.Thread(target=worker.run, daemon=True)
452
- thread.start()
453
-
841
+ workers = []
454
842
  # Start a worker for long requests.
455
843
  long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
456
844
  config=config.long_worker_config)
457
- run_worker_in_background(long_worker)
845
+ long_worker.run_in_background()
846
+ workers.append(long_worker)
458
847
 
459
848
  # Start a worker for short requests.
460
849
  short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
461
850
  config=config.short_worker_config)
462
- run_worker_in_background(short_worker)
463
- return sub_procs
851
+ short_worker.run_in_background()
852
+ workers.append(short_worker)
853
+ return queue_server, workers