skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,130 @@
1
+ """Async SDK for SkyServe."""
2
+ import typing
3
+ from typing import Any, Dict, List, Optional, Tuple, Union
4
+
5
+ from sky.client import sdk_async
6
+ from sky.serve.client import sdk
7
+ from sky.usage import usage_lib
8
+ from sky.utils import context_utils
9
+
10
+ if typing.TYPE_CHECKING:
11
+ import io
12
+
13
+ import sky
14
+ from sky.serve import serve_utils
15
+
16
+
17
+ @usage_lib.entrypoint
18
+ async def up(
19
+ task: Union['sky.Task', 'sky.Dag'],
20
+ service_name: str,
21
+ # Internal only:
22
+ # pylint: disable=invalid-name
23
+ _need_confirmation: bool = False,
24
+ stream_logs: Optional[
25
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
26
+ ) -> Tuple[str, str]:
27
+ """Async version of up() that spins up a service."""
28
+ request_id = await context_utils.to_thread(sdk.up, task, service_name,
29
+ _need_confirmation)
30
+ if stream_logs is not None:
31
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
32
+ else:
33
+ return await sdk_async.get(request_id)
34
+
35
+
36
+ @usage_lib.entrypoint
37
+ async def update(
38
+ task: Union['sky.Task', 'sky.Dag'],
39
+ service_name: str,
40
+ mode: 'serve_utils.UpdateMode',
41
+ # Internal only:
42
+ # pylint: disable=invalid-name
43
+ _need_confirmation: bool = False,
44
+ stream_logs: Optional[
45
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
46
+ ) -> None:
47
+ """Async version of update() that updates an existing service."""
48
+ request_id = await context_utils.to_thread(sdk.update, task, service_name,
49
+ mode, _need_confirmation)
50
+ if stream_logs is not None:
51
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
52
+ else:
53
+ return await sdk_async.get(request_id)
54
+
55
+
56
+ @usage_lib.entrypoint
57
+ async def down(
58
+ service_names: Optional[Union[str, List[str]]],
59
+ all: bool = False, # pylint: disable=redefined-builtin
60
+ purge: bool = False,
61
+ stream_logs: Optional[
62
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
63
+ ) -> None:
64
+ """Async version of down() that tears down a service."""
65
+ request_id = await context_utils.to_thread(sdk.down, service_names, all,
66
+ purge)
67
+ if stream_logs is not None:
68
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
69
+ else:
70
+ return await sdk_async.get(request_id)
71
+
72
+
73
+ @usage_lib.entrypoint
74
+ async def terminate_replica(
75
+ service_name: str,
76
+ replica_id: int,
77
+ purge: bool,
78
+ stream_logs: Optional[
79
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
80
+ ) -> None:
81
+ """Async version of terminate_replica() that tears down a specific
82
+ replica."""
83
+ request_id = await context_utils.to_thread(sdk.terminate_replica,
84
+ service_name, replica_id, purge)
85
+ if stream_logs is not None:
86
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
87
+ else:
88
+ return await sdk_async.get(request_id)
89
+
90
+
91
+ @usage_lib.entrypoint
92
+ async def status(
93
+ service_names: Optional[Union[str, List[str]]],
94
+ stream_logs: Optional[
95
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
96
+ ) -> List[Dict[str, Any]]:
97
+ """Async version of status() that gets service statuses."""
98
+ request_id = await context_utils.to_thread(sdk.status, service_names)
99
+ if stream_logs is not None:
100
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
101
+ else:
102
+ return await sdk_async.get(request_id)
103
+
104
+
105
+ @usage_lib.entrypoint
106
+ async def tail_logs(service_name: str,
107
+ target: Union[str, 'serve_utils.ServiceComponent'],
108
+ replica_id: Optional[int] = None,
109
+ follow: bool = True,
110
+ output_stream: Optional['io.TextIOBase'] = None) -> None:
111
+ """Async version of tail_logs() that tails logs for a service."""
112
+ return await context_utils.to_thread(sdk.tail_logs, service_name, target,
113
+ replica_id, follow, output_stream)
114
+
115
+
116
+ @usage_lib.entrypoint
117
+ async def sync_down_logs(service_name: str,
118
+ local_dir: str,
119
+ *,
120
+ targets: Optional[Union[
121
+ str, 'serve_utils.ServiceComponent', List[Union[
122
+ str, 'serve_utils.ServiceComponent']]]] = None,
123
+ replica_ids: Optional[List[int]] = None) -> None:
124
+ """Async version of sync_down_logs() that syncs down logs from service
125
+ components."""
126
+ return await context_utils.to_thread(sdk.sync_down_logs,
127
+ service_name,
128
+ local_dir,
129
+ targets=targets,
130
+ replica_ids=replica_ids)
sky/serve/constants.py CHANGED
@@ -65,7 +65,8 @@ AUTOSCALER_DEFAULT_DOWNSCALE_DELAY_SECONDS = 1200
65
65
  # TODO(tian): We might need to be careful that service logs can take a lot of
66
66
  # disk space. Maybe we could use a larger disk size, migrate to cloud storage or
67
67
  # do some log rotation.
68
- CONTROLLER_RESOURCES = {'cpus': '4+', 'disk_size': 200}
68
+ # Set default minimal memory to 8GB to allow at least one service to run.
69
+ CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8+', 'disk_size': 200}
69
70
  # Autostop config for the serve controller. These are the default values for
70
71
  # serve.controller.autostop in ~/.sky/config.yaml.
71
72
  CONTROLLER_AUTOSTOP = {
@@ -73,13 +74,6 @@ CONTROLLER_AUTOSTOP = {
73
74
  'down': False,
74
75
  }
75
76
 
76
- # Due to the CPU/memory usage of the controller process launched with a job on
77
- # controller VM (use ray job under the hood), we need to reserve some CPU/memory
78
- # for each serve controller process.
79
- # Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
80
- # services.
81
- CONTROLLER_MEMORY_USAGE_GB = 1.0
82
-
83
77
  # A period of time to initialize your service. Any readiness probe failures
84
78
  # during this period will be ignored.
85
79
  DEFAULT_INITIAL_DELAY_SECONDS = 1200
@@ -104,8 +98,17 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
104
98
  # Changelog:
105
99
  # v1.0 - Introduce rolling update.
106
100
  # v2.0 - Added template-replica feature.
107
- SERVE_VERSION = 2
101
+ # v3.0 - Added pool.
102
+ # v4.0 - Added pool argument to wait_service_registration.
103
+ # v5.0 - Added pool argument to stream_serve_process_logs & stream_replica_logs.
104
+ SERVE_VERSION = 5
108
105
 
109
106
  TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
110
107
  'The version of service is outdated and does not support manually '
111
108
  'terminating replicas. Please terminate the service and spin up again.')
109
+
110
+ # Dummy run command for pool.
111
+ POOL_DUMMY_RUN_COMMAND = 'echo "setup done"'
112
+
113
+ # Error message for max number of services reached.
114
+ MAX_NUMBER_OF_SERVICES_REACHED_ERROR = 'Max number of services reached.'
sky/serve/controller.py CHANGED
@@ -4,6 +4,7 @@ Responsible for autoscaling and replica management.
4
4
  """
5
5
  import contextlib
6
6
  import logging
7
+ import os
7
8
  import threading
8
9
  import time
9
10
  import traceback
@@ -20,17 +21,20 @@ from sky.serve import autoscalers
20
21
  from sky.serve import replica_managers
21
22
  from sky.serve import serve_state
22
23
  from sky.serve import serve_utils
24
+ from sky.skylet import constants
23
25
  from sky.utils import common_utils
26
+ from sky.utils import context_utils
24
27
  from sky.utils import ux_utils
25
28
 
26
29
  logger = sky_logging.init_logger(__name__)
27
30
 
28
31
 
29
- class SuppressSuccessGetAccessLogsFilter(logging.Filter):
32
+ class AutoscalerInfoFilter(logging.Filter):
30
33
 
31
34
  def filter(self, record: logging.LogRecord) -> bool:
32
35
  message = record.getMessage()
33
- return not ('GET' in message and '200' in message)
36
+ return not ('GET' in message and '200' in message and
37
+ '/autoscaler/info' in message)
34
38
 
35
39
 
36
40
  class SkyServeController:
@@ -42,12 +46,12 @@ class SkyServeController:
42
46
  """
43
47
 
44
48
  def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec,
45
- task_yaml: str, host: str, port: int) -> None:
49
+ version: int, host: str, port: int) -> None:
46
50
  self._service_name = service_name
47
51
  self._replica_manager: replica_managers.ReplicaManager = (
48
52
  replica_managers.SkyPilotReplicaManager(service_name=service_name,
49
53
  spec=service_spec,
50
- task_yaml_path=task_yaml))
54
+ version=version))
51
55
  self._autoscaler: autoscalers.Autoscaler = (
52
56
  autoscalers.Autoscaler.from_spec(service_name, service_spec))
53
57
  self._host = host
@@ -59,6 +63,7 @@ class SkyServeController:
59
63
  uvicorn_access_logger = logging.getLogger('uvicorn.access')
60
64
  for handler in uvicorn_access_logger.handlers:
61
65
  handler.setFormatter(sky_logging.FORMATTER)
66
+ handler.addFilter(AutoscalerInfoFilter())
62
67
  yield
63
68
 
64
69
  def _run_autoscaler(self):
@@ -74,7 +79,11 @@ class SkyServeController:
74
79
  assert record is not None, ('No service record found for '
75
80
  f'{self._service_name}')
76
81
  active_versions = record['active_versions']
77
- logger.info(f'All replica info: {replica_infos}')
82
+ logger.info(f'All replica info for autoscaler: {replica_infos}')
83
+
84
+ # Autoscaler now extracts GPU type info directly from
85
+ # replica_infos in generate_scaling_decisions method
86
+ # for better decoupling.
78
87
  scaling_options = self._autoscaler.generate_scaling_decisions(
79
88
  replica_infos, active_versions)
80
89
  for scaling_option in scaling_options:
@@ -99,6 +108,11 @@ class SkyServeController:
99
108
 
100
109
  def run(self) -> None:
101
110
 
111
+ @self._app.get('/autoscaler/info')
112
+ async def get_autoscaler_info() -> fastapi.Response:
113
+ return responses.JSONResponse(content=self._autoscaler.info(),
114
+ status_code=200)
115
+
102
116
  @self._app.post('/controller/load_balancer_sync')
103
117
  async def load_balancer_sync(
104
118
  request: fastapi.Request) -> fastapi.Response:
@@ -109,11 +123,37 @@ class SkyServeController:
109
123
  timestamps: List[int] = request_aggregator.get('timestamps', [])
110
124
  logger.info(f'Received {len(timestamps)} inflight requests.')
111
125
  self._autoscaler.collect_request_information(request_aggregator)
112
- return responses.JSONResponse(content={
113
- 'ready_replica_urls':
114
- self._replica_manager.get_active_replica_urls()
115
- },
116
- status_code=200)
126
+
127
+ # Get replica information for instance-aware load balancing
128
+ replica_infos = serve_state.get_replica_infos(self._service_name)
129
+ ready_replica_urls = self._replica_manager.get_active_replica_urls()
130
+
131
+ # Use URL-to-info mapping to avoid duplication
132
+ replica_info = {}
133
+ for info in replica_infos:
134
+ if info.url in ready_replica_urls:
135
+ # Get GPU type from handle.launched_resources.accelerators
136
+ gpu_type = 'unknown'
137
+ handle = info.handle()
138
+ if handle is not None:
139
+ accelerators = handle.launched_resources.accelerators
140
+ if accelerators and len(accelerators) > 0:
141
+ # Get the first accelerator type
142
+ gpu_type = list(accelerators.keys())[0]
143
+
144
+ replica_info[info.url] = {'gpu_type': gpu_type}
145
+
146
+ # Check that all ready replica URLs are included in replica_info
147
+ missing_urls = set(ready_replica_urls) - set(replica_info.keys())
148
+ if missing_urls:
149
+ logger.warning(f'Ready replica URLs missing from replica_info: '
150
+ f'{missing_urls}')
151
+ # fallback: add missing URLs with unknown GPU type
152
+ for url in missing_urls:
153
+ replica_info[url] = {'gpu_type': 'unknown'}
154
+
155
+ return responses.JSONResponse(
156
+ content={'replica_info': replica_info}, status_code=200)
117
157
 
118
158
  @self._app.post('/controller/update_service')
119
159
  async def update_service(request: fastapi.Request) -> fastapi.Response:
@@ -133,7 +173,11 @@ class SkyServeController:
133
173
  # See sky/serve/core.py::update
134
174
  latest_task_yaml = serve_utils.generate_task_yaml_file_name(
135
175
  self._service_name, version)
136
- service = serve.SkyServiceSpec.from_yaml(latest_task_yaml)
176
+ with open(latest_task_yaml, 'r', encoding='utf-8') as f:
177
+ yaml_content = f.read()
178
+ service = serve.SkyServiceSpec.from_yaml_str(yaml_content)
179
+ serve_state.add_or_update_version(self._service_name, version,
180
+ service, yaml_content)
137
181
  logger.info(
138
182
  f'Update to new version version {version}: {service}')
139
183
 
@@ -155,9 +199,13 @@ class SkyServeController:
155
199
  return responses.JSONResponse(content={'message': 'Success'},
156
200
  status_code=200)
157
201
  except Exception as e: # pylint: disable=broad-except
158
- logger.error(f'Error in update_service: '
159
- f'{common_utils.format_exception(e)}')
160
- return responses.JSONResponse(content={'message': 'Error'},
202
+ exception_str = common_utils.format_exception(e)
203
+ logger.error(f'Error in update_service: {exception_str}')
204
+ return responses.JSONResponse(content={
205
+ 'message': 'Error',
206
+ 'exception': exception_str,
207
+ 'traceback': traceback.format_exc()
208
+ },
161
209
  status_code=500)
162
210
 
163
211
  @self._app.post('/controller/terminate_replica')
@@ -232,7 +280,7 @@ class SkyServeController:
232
280
  threading.Thread(target=self._run_autoscaler).start()
233
281
 
234
282
  logger.info('SkyServe Controller started on '
235
- f'http://{self._host}:{self._port}')
283
+ f'http://{self._host}:{self._port}. PID: {os.getpid()}')
236
284
 
237
285
  uvicorn.run(self._app, host=self._host, port=self._port)
238
286
 
@@ -240,7 +288,10 @@ class SkyServeController:
240
288
  # TODO(tian): Probably we should support service that will stop the VM in
241
289
  # specific time period.
242
290
  def run_controller(service_name: str, service_spec: serve.SkyServiceSpec,
243
- task_yaml: str, controller_host: str, controller_port: int):
244
- controller = SkyServeController(service_name, service_spec, task_yaml,
291
+ version: int, controller_host: str, controller_port: int):
292
+ os.environ[constants.OVERRIDE_CONSOLIDATION_MODE] = 'true'
293
+ # Hijack sys.stdout/stderr to be context aware.
294
+ context_utils.hijack_sys_attrs()
295
+ controller = SkyServeController(service_name, service_spec, version,
245
296
  controller_host, controller_port)
246
297
  controller.run()
@@ -1,8 +1,10 @@
1
1
  """LoadBalancer: Distribute any incoming request to all ready replicas."""
2
2
  import asyncio
3
3
  import logging
4
+ import os
4
5
  import threading
5
- from typing import Dict, Optional, Union
6
+ import traceback
7
+ from typing import Dict, List, Optional, Union
6
8
 
7
9
  import aiohttp
8
10
  import fastapi
@@ -28,11 +30,13 @@ class SkyServeLoadBalancer:
28
30
  """
29
31
 
30
32
  def __init__(
31
- self,
32
- controller_url: str,
33
- load_balancer_port: int,
34
- load_balancing_policy_name: Optional[str] = None,
35
- tls_credential: Optional[serve_utils.TLSCredential] = None) -> None:
33
+ self,
34
+ controller_url: str,
35
+ load_balancer_port: int,
36
+ load_balancing_policy_name: Optional[str] = None,
37
+ tls_credential: Optional[serve_utils.TLSCredential] = None,
38
+ target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None
39
+ ) -> None:
36
40
  """Initialize the load balancer.
37
41
 
38
42
  Args:
@@ -42,6 +46,9 @@ class SkyServeLoadBalancer:
42
46
  to use. Defaults to None.
43
47
  tls_credential: The TLS credentials for HTTPS endpoint. Defaults
44
48
  to None.
49
+ target_qps_per_replica: Target QPS per replica for instance-aware
50
+ load balancing. Can be a float or dict mapping GPU types to QPS.
51
+ Defaults to None.
45
52
  """
46
53
  self._app = fastapi.FastAPI()
47
54
  self._controller_url: str = controller_url
@@ -49,6 +56,15 @@ class SkyServeLoadBalancer:
49
56
  # Use the registry to create the load balancing policy
50
57
  self._load_balancing_policy = lb_policies.LoadBalancingPolicy.make(
51
58
  load_balancing_policy_name)
59
+
60
+ # Set accelerator QPS for instance-aware policies
61
+ if (target_qps_per_replica and
62
+ isinstance(target_qps_per_replica, dict) and
63
+ isinstance(self._load_balancing_policy,
64
+ lb_policies.InstanceAwareLeastLoadPolicy)):
65
+ self._load_balancing_policy.set_target_qps_per_accelerator(
66
+ target_qps_per_replica)
67
+
52
68
  logger.info('Starting load balancer with policy '
53
69
  f'{load_balancing_policy_name}.')
54
70
  self._request_aggregator: serve_utils.RequestsAggregator = (
@@ -69,6 +85,56 @@ class SkyServeLoadBalancer:
69
85
  # updating it from _sync_with_controller.
70
86
  self._client_pool_lock: threading.Lock = threading.Lock()
71
87
 
88
+ async def _sync_with_controller_once(self) -> List[asyncio.Task]:
89
+ close_client_tasks = []
90
+ ready_replica_urls = []
91
+ replica_info = {}
92
+
93
+ async with aiohttp.ClientSession() as session:
94
+ try:
95
+ # Send request information
96
+ async with session.post(
97
+ self._controller_url + '/controller/load_balancer_sync',
98
+ json={
99
+ 'request_aggregator':
100
+ self._request_aggregator.to_dict()
101
+ },
102
+ timeout=aiohttp.ClientTimeout(5),
103
+ ) as response:
104
+ # Clean up after reporting request info to avoid OOM.
105
+ self._request_aggregator.clear()
106
+ response.raise_for_status()
107
+ response_json = await response.json()
108
+ replica_info = response_json.get('replica_info', {})
109
+ ready_replica_urls = list(replica_info.keys())
110
+ except (aiohttp.ClientError, asyncio.TimeoutError) as e:
111
+ logger.error(f'An error occurred when syncing with '
112
+ f'the controller: {e}'
113
+ f'\nTraceback: {traceback.format_exc()}')
114
+ else:
115
+ logger.info(f'Available Replica URLs: {ready_replica_urls}')
116
+ with self._client_pool_lock:
117
+ self._load_balancing_policy.set_ready_replicas(
118
+ ready_replica_urls)
119
+ # Set replica info for instance-aware policies
120
+ if isinstance(self._load_balancing_policy,
121
+ lb_policies.InstanceAwareLeastLoadPolicy):
122
+ self._load_balancing_policy.set_replica_info(
123
+ replica_info)
124
+ for replica_url in ready_replica_urls:
125
+ if replica_url not in self._client_pool:
126
+ self._client_pool[replica_url] = httpx.AsyncClient(
127
+ base_url=replica_url)
128
+ urls_to_close = set(
129
+ self._client_pool.keys()) - set(ready_replica_urls)
130
+ client_to_close = []
131
+ for replica_url in urls_to_close:
132
+ client_to_close.append(
133
+ self._client_pool.pop(replica_url))
134
+ for client in client_to_close:
135
+ close_client_tasks.append(client.aclose())
136
+ return close_client_tasks
137
+
72
138
  async def _sync_with_controller(self):
73
139
  """Sync with controller periodically.
74
140
 
@@ -82,49 +148,16 @@ class SkyServeLoadBalancer:
82
148
  await asyncio.sleep(5)
83
149
 
84
150
  while True:
85
- close_client_tasks = []
86
- async with aiohttp.ClientSession() as session:
87
- try:
88
- # Send request information
89
- async with session.post(
90
- self._controller_url +
91
- '/controller/load_balancer_sync',
92
- json={
93
- 'request_aggregator':
94
- self._request_aggregator.to_dict()
95
- },
96
- timeout=aiohttp.ClientTimeout(5),
97
- ) as response:
98
- # Clean up after reporting request info to avoid OOM.
99
- self._request_aggregator.clear()
100
- response.raise_for_status()
101
- response_json = await response.json()
102
- ready_replica_urls = response_json.get(
103
- 'ready_replica_urls', [])
104
- except aiohttp.ClientError as e:
105
- logger.error('An error occurred when syncing with '
106
- f'the controller: {e}')
107
- else:
108
- logger.info(f'Available Replica URLs: {ready_replica_urls}')
109
- with self._client_pool_lock:
110
- self._load_balancing_policy.set_ready_replicas(
111
- ready_replica_urls)
112
- for replica_url in ready_replica_urls:
113
- if replica_url not in self._client_pool:
114
- self._client_pool[replica_url] = (
115
- httpx.AsyncClient(base_url=replica_url))
116
- urls_to_close = set(
117
- self._client_pool.keys()) - set(ready_replica_urls)
118
- client_to_close = []
119
- for replica_url in urls_to_close:
120
- client_to_close.append(
121
- self._client_pool.pop(replica_url))
122
- for client in client_to_close:
123
- close_client_tasks.append(client.aclose())
124
-
125
- await asyncio.sleep(constants.LB_CONTROLLER_SYNC_INTERVAL_SECONDS)
126
- # Await those tasks after the interval to avoid blocking.
127
- await asyncio.gather(*close_client_tasks)
151
+ try:
152
+ close_client_tasks = await self._sync_with_controller_once()
153
+ await asyncio.sleep(
154
+ constants.LB_CONTROLLER_SYNC_INTERVAL_SECONDS)
155
+ # Await those tasks after the interval to avoid blocking.
156
+ await asyncio.gather(*close_client_tasks)
157
+ except Exception as e: # pylint: disable=broad-except
158
+ logger.error(f'An error occurred when syncing with '
159
+ f'the controller: {e}'
160
+ f'\nTraceback: {traceback.format_exc()}')
128
161
 
129
162
  async def _proxy_request_to(
130
163
  self, url: str, request: fastapi.Request
@@ -168,7 +201,8 @@ class SkyServeLoadBalancer:
168
201
  background=background.BackgroundTask(background_func))
169
202
  except (httpx.RequestError, httpx.HTTPStatusError) as e:
170
203
  logger.error(f'Error when proxy request to {url}: '
171
- f'{common_utils.format_exception(e)}')
204
+ f'{common_utils.format_exception(e)}'
205
+ f'\nTraceback: {traceback.format_exc()}')
172
206
  return e
173
207
 
174
208
  async def _proxy_with_retries(
@@ -243,7 +277,8 @@ class SkyServeLoadBalancer:
243
277
  protocol = 'https' if self._tls_credential is not None else 'http'
244
278
 
245
279
  logger.info('SkyServe Load Balancer started on '
246
- f'{protocol}://0.0.0.0:{self._load_balancer_port}')
280
+ f'{protocol}://0.0.0.0:{self._load_balancer_port}. '
281
+ f'PID: {os.getpid()}')
247
282
 
248
283
  uvicorn.run(self._app,
249
284
  host='0.0.0.0',
@@ -252,23 +287,31 @@ class SkyServeLoadBalancer:
252
287
 
253
288
 
254
289
  def run_load_balancer(
255
- controller_addr: str,
256
- load_balancer_port: int,
257
- load_balancing_policy_name: Optional[str] = None,
258
- tls_credential: Optional[serve_utils.TLSCredential] = None) -> None:
290
+ controller_addr: str,
291
+ load_balancer_port: int,
292
+ load_balancing_policy_name: Optional[str] = None,
293
+ tls_credential: Optional[serve_utils.TLSCredential] = None,
294
+ target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None
295
+ ) -> None:
259
296
  """ Run the load balancer.
260
297
 
261
298
  Args:
262
299
  controller_addr: The address of the controller.
263
300
  load_balancer_port: The port where the load balancer listens to.
264
- policy_name: The name of the load balancing policy to use. Defaults to
265
- None.
301
+ load_balancing_policy_name: The name of the load balancing policy to use.
302
+ Defaults to None.
303
+ tls_credential:
304
+ The TLS credentials for HTTPS endpoint. Defaults to None.
305
+ target_qps_per_replica: Target QPS per replica for instance-aware
306
+ load balancing. Can be a float or dict mapping GPU types to QPS.
307
+ Defaults to None.
266
308
  """
267
309
  load_balancer = SkyServeLoadBalancer(
268
310
  controller_url=controller_addr,
269
311
  load_balancer_port=load_balancer_port,
270
312
  load_balancing_policy_name=load_balancing_policy_name,
271
- tls_credential=tls_credential)
313
+ tls_credential=tls_credential,
314
+ target_qps_per_replica=target_qps_per_replica)
272
315
  load_balancer.run()
273
316
 
274
317
 
@@ -292,5 +335,8 @@ if __name__ == '__main__':
292
335
  help=f'The load balancing policy to use. Available policies: '
293
336
  f'{", ".join(available_policies)}.')
294
337
  args = parser.parse_args()
295
- run_load_balancer(args.controller_addr, args.load_balancer_port,
296
- args.load_balancing_policy)
338
+ run_load_balancer(args.controller_addr,
339
+ args.load_balancer_port,
340
+ args.load_balancing_policy,
341
+ tls_credential=None,
342
+ target_qps_per_replica=None)