skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,130 @@
1
+ """Async SDK for SkyServe."""
2
+ import typing
3
+ from typing import Any, Dict, List, Optional, Tuple, Union
4
+
5
+ from sky.client import sdk_async
6
+ from sky.serve.client import sdk
7
+ from sky.usage import usage_lib
8
+ from sky.utils import context_utils
9
+
10
+ if typing.TYPE_CHECKING:
11
+ import io
12
+
13
+ import sky
14
+ from sky.serve import serve_utils
15
+
16
+
17
+ @usage_lib.entrypoint
18
+ async def up(
19
+ task: Union['sky.Task', 'sky.Dag'],
20
+ service_name: str,
21
+ # Internal only:
22
+ # pylint: disable=invalid-name
23
+ _need_confirmation: bool = False,
24
+ stream_logs: Optional[
25
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
26
+ ) -> Tuple[str, str]:
27
+ """Async version of up() that spins up a service."""
28
+ request_id = await context_utils.to_thread(sdk.up, task, service_name,
29
+ _need_confirmation)
30
+ if stream_logs is not None:
31
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
32
+ else:
33
+ return await sdk_async.get(request_id)
34
+
35
+
36
+ @usage_lib.entrypoint
37
+ async def update(
38
+ task: Union['sky.Task', 'sky.Dag'],
39
+ service_name: str,
40
+ mode: 'serve_utils.UpdateMode',
41
+ # Internal only:
42
+ # pylint: disable=invalid-name
43
+ _need_confirmation: bool = False,
44
+ stream_logs: Optional[
45
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
46
+ ) -> None:
47
+ """Async version of update() that updates an existing service."""
48
+ request_id = await context_utils.to_thread(sdk.update, task, service_name,
49
+ mode, _need_confirmation)
50
+ if stream_logs is not None:
51
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
52
+ else:
53
+ return await sdk_async.get(request_id)
54
+
55
+
56
+ @usage_lib.entrypoint
57
+ async def down(
58
+ service_names: Optional[Union[str, List[str]]],
59
+ all: bool = False, # pylint: disable=redefined-builtin
60
+ purge: bool = False,
61
+ stream_logs: Optional[
62
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
63
+ ) -> None:
64
+ """Async version of down() that tears down a service."""
65
+ request_id = await context_utils.to_thread(sdk.down, service_names, all,
66
+ purge)
67
+ if stream_logs is not None:
68
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
69
+ else:
70
+ return await sdk_async.get(request_id)
71
+
72
+
73
+ @usage_lib.entrypoint
74
+ async def terminate_replica(
75
+ service_name: str,
76
+ replica_id: int,
77
+ purge: bool,
78
+ stream_logs: Optional[
79
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
80
+ ) -> None:
81
+ """Async version of terminate_replica() that tears down a specific
82
+ replica."""
83
+ request_id = await context_utils.to_thread(sdk.terminate_replica,
84
+ service_name, replica_id, purge)
85
+ if stream_logs is not None:
86
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
87
+ else:
88
+ return await sdk_async.get(request_id)
89
+
90
+
91
+ @usage_lib.entrypoint
92
+ async def status(
93
+ service_names: Optional[Union[str, List[str]]],
94
+ stream_logs: Optional[
95
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
96
+ ) -> List[Dict[str, Any]]:
97
+ """Async version of status() that gets service statuses."""
98
+ request_id = await context_utils.to_thread(sdk.status, service_names)
99
+ if stream_logs is not None:
100
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
101
+ else:
102
+ return await sdk_async.get(request_id)
103
+
104
+
105
+ @usage_lib.entrypoint
106
+ async def tail_logs(service_name: str,
107
+ target: Union[str, 'serve_utils.ServiceComponent'],
108
+ replica_id: Optional[int] = None,
109
+ follow: bool = True,
110
+ output_stream: Optional['io.TextIOBase'] = None) -> None:
111
+ """Async version of tail_logs() that tails logs for a service."""
112
+ return await context_utils.to_thread(sdk.tail_logs, service_name, target,
113
+ replica_id, follow, output_stream)
114
+
115
+
116
+ @usage_lib.entrypoint
117
+ async def sync_down_logs(service_name: str,
118
+ local_dir: str,
119
+ *,
120
+ targets: Optional[Union[
121
+ str, 'serve_utils.ServiceComponent', List[Union[
122
+ str, 'serve_utils.ServiceComponent']]]] = None,
123
+ replica_ids: Optional[List[int]] = None) -> None:
124
+ """Async version of sync_down_logs() that syncs down logs from service
125
+ components."""
126
+ return await context_utils.to_thread(sdk.sync_down_logs,
127
+ service_name,
128
+ local_dir,
129
+ targets=targets,
130
+ replica_ids=replica_ids)
sky/serve/constants.py CHANGED
@@ -73,13 +73,6 @@ CONTROLLER_AUTOSTOP = {
73
73
  'down': False,
74
74
  }
75
75
 
76
- # Due to the CPU/memory usage of the controller process launched with a job on
77
- # controller VM (use ray job under the hood), we need to reserve some CPU/memory
78
- # for each serve controller process.
79
- # Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
80
- # services.
81
- CONTROLLER_MEMORY_USAGE_GB = 1.0
82
-
83
76
  # A period of time to initialize your service. Any readiness probe failures
84
77
  # during this period will be ignored.
85
78
  DEFAULT_INITIAL_DELAY_SECONDS = 1200
@@ -104,8 +97,17 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
104
97
  # Changelog:
105
98
  # v1.0 - Introduce rolling update.
106
99
  # v2.0 - Added template-replica feature.
107
- SERVE_VERSION = 2
100
+ # v3.0 - Added cluster pool.
101
+ # v4.0 - Added pool argument to wait_service_registration.
102
+ # v5.0 - Added pool argument to stream_serve_process_logs & stream_replica_logs.
103
+ SERVE_VERSION = 5
108
104
 
109
105
  TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
110
106
  'The version of service is outdated and does not support manually '
111
107
  'terminating replicas. Please terminate the service and spin up again.')
108
+
109
+ # Dummy run command for cluster pool.
110
+ POOL_DUMMY_RUN_COMMAND = 'echo "setup done"'
111
+
112
+ # Error message for max number of services reached.
113
+ MAX_NUMBER_OF_SERVICES_REACHED_ERROR = 'Max number of services reached.'
sky/serve/controller.py CHANGED
@@ -4,6 +4,7 @@ Responsible for autoscaling and replica management.
4
4
  """
5
5
  import contextlib
6
6
  import logging
7
+ import os
7
8
  import threading
8
9
  import time
9
10
  import traceback
@@ -26,11 +27,12 @@ from sky.utils import ux_utils
26
27
  logger = sky_logging.init_logger(__name__)
27
28
 
28
29
 
29
- class SuppressSuccessGetAccessLogsFilter(logging.Filter):
30
class AutoscalerInfoFilter(logging.Filter):
    """Hide access-log records of successful ``GET /autoscaler/info`` calls.

    Every other record is kept, including non-200 responses from that
    endpoint.
    """

    def filter(self, record: logging.LogRecord) -> bool:
        """Decide whether ``record`` should be emitted.

        Args:
            record: The log record to inspect; only its formatted message
                is examined.

        Returns:
            False when the message describes a GET on ``/autoscaler/info``
            that was answered with status 200, True otherwise.
        """
        message = record.getMessage()
        if 'GET' not in message or '/autoscaler/info' not in message:
            return True
        # The status code is matched as a standalone token after the quoted
        # request line (uvicorn logs `<client> - "GET <path> HTTP/x" <code>`).
        # A plain substring test on the whole message would also match "200"
        # inside the client address (e.g. port 52000) and hide failures.
        # When there is no quote, rfind() yields -1 and the whole message is
        # tokenized instead.
        closing_quote = message.rfind('"')
        status_tokens = message[closing_quote + 1:].split()
        return '200' not in status_tokens
34
36
 
35
37
 
36
38
  class SkyServeController:
@@ -42,12 +44,13 @@ class SkyServeController:
42
44
  """
43
45
 
44
46
  def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec,
45
- task_yaml: str, host: str, port: int) -> None:
47
+ service_task_yaml: str, host: str, port: int) -> None:
46
48
  self._service_name = service_name
47
49
  self._replica_manager: replica_managers.ReplicaManager = (
48
- replica_managers.SkyPilotReplicaManager(service_name=service_name,
49
- spec=service_spec,
50
- task_yaml_path=task_yaml))
50
+ replica_managers.SkyPilotReplicaManager(
51
+ service_name=service_name,
52
+ spec=service_spec,
53
+ service_task_yaml_path=service_task_yaml))
51
54
  self._autoscaler: autoscalers.Autoscaler = (
52
55
  autoscalers.Autoscaler.from_spec(service_name, service_spec))
53
56
  self._host = host
@@ -59,6 +62,7 @@ class SkyServeController:
59
62
  uvicorn_access_logger = logging.getLogger('uvicorn.access')
60
63
  for handler in uvicorn_access_logger.handlers:
61
64
  handler.setFormatter(sky_logging.FORMATTER)
65
+ handler.addFilter(AutoscalerInfoFilter())
62
66
  yield
63
67
 
64
68
  def _run_autoscaler(self):
@@ -74,7 +78,11 @@ class SkyServeController:
74
78
  assert record is not None, ('No service record found for '
75
79
  f'{self._service_name}')
76
80
  active_versions = record['active_versions']
77
- logger.info(f'All replica info: {replica_infos}')
81
+ logger.info(f'All replica info for autoscaler: {replica_infos}')
82
+
83
+ # Autoscaler now extracts GPU type info directly from
84
+ # replica_infos in generate_scaling_decisions method
85
+ # for better decoupling.
78
86
  scaling_options = self._autoscaler.generate_scaling_decisions(
79
87
  replica_infos, active_versions)
80
88
  for scaling_option in scaling_options:
@@ -99,6 +107,11 @@ class SkyServeController:
99
107
 
100
108
  def run(self) -> None:
101
109
 
110
+ @self._app.get('/autoscaler/info')
111
+ async def get_autoscaler_info() -> fastapi.Response:
112
+ return responses.JSONResponse(content=self._autoscaler.info(),
113
+ status_code=200)
114
+
102
115
  @self._app.post('/controller/load_balancer_sync')
103
116
  async def load_balancer_sync(
104
117
  request: fastapi.Request) -> fastapi.Response:
@@ -109,11 +122,37 @@ class SkyServeController:
109
122
  timestamps: List[int] = request_aggregator.get('timestamps', [])
110
123
  logger.info(f'Received {len(timestamps)} inflight requests.')
111
124
  self._autoscaler.collect_request_information(request_aggregator)
112
- return responses.JSONResponse(content={
113
- 'ready_replica_urls':
114
- self._replica_manager.get_active_replica_urls()
115
- },
116
- status_code=200)
125
+
126
+ # Get replica information for instance-aware load balancing
127
+ replica_infos = serve_state.get_replica_infos(self._service_name)
128
+ ready_replica_urls = self._replica_manager.get_active_replica_urls()
129
+
130
+ # Use URL-to-info mapping to avoid duplication
131
+ replica_info = {}
132
+ for info in replica_infos:
133
+ if info.url in ready_replica_urls:
134
+ # Get GPU type from handle.launched_resources.accelerators
135
+ gpu_type = 'unknown'
136
+ handle = info.handle()
137
+ if handle is not None:
138
+ accelerators = handle.launched_resources.accelerators
139
+ if accelerators and len(accelerators) > 0:
140
+ # Get the first accelerator type
141
+ gpu_type = list(accelerators.keys())[0]
142
+
143
+ replica_info[info.url] = {'gpu_type': gpu_type}
144
+
145
+ # Check that all ready replica URLs are included in replica_info
146
+ missing_urls = set(ready_replica_urls) - set(replica_info.keys())
147
+ if missing_urls:
148
+ logger.warning(f'Ready replica URLs missing from replica_info: '
149
+ f'{missing_urls}')
150
+ # fallback: add missing URLs with unknown GPU type
151
+ for url in missing_urls:
152
+ replica_info[url] = {'gpu_type': 'unknown'}
153
+
154
+ return responses.JSONResponse(
155
+ content={'replica_info': replica_info}, status_code=200)
117
156
 
118
157
  @self._app.post('/controller/update_service')
119
158
  async def update_service(request: fastapi.Request) -> fastapi.Response:
@@ -155,9 +194,13 @@ class SkyServeController:
155
194
  return responses.JSONResponse(content={'message': 'Success'},
156
195
  status_code=200)
157
196
  except Exception as e: # pylint: disable=broad-except
158
- logger.error(f'Error in update_service: '
159
- f'{common_utils.format_exception(e)}')
160
- return responses.JSONResponse(content={'message': 'Error'},
197
+ exception_str = common_utils.format_exception(e)
198
+ logger.error(f'Error in update_service: {exception_str}')
199
+ return responses.JSONResponse(content={
200
+ 'message': 'Error',
201
+ 'exception': exception_str,
202
+ 'traceback': traceback.format_exc()
203
+ },
161
204
  status_code=500)
162
205
 
163
206
  @self._app.post('/controller/terminate_replica')
@@ -232,7 +275,7 @@ class SkyServeController:
232
275
  threading.Thread(target=self._run_autoscaler).start()
233
276
 
234
277
  logger.info('SkyServe Controller started on '
235
- f'http://{self._host}:{self._port}')
278
+ f'http://{self._host}:{self._port}. PID: {os.getpid()}')
236
279
 
237
280
  uvicorn.run(self._app, host=self._host, port=self._port)
238
281
 
@@ -240,7 +283,9 @@ class SkyServeController:
240
283
  # TODO(tian): Probably we should support service that will stop the VM in
241
284
  # specific time period.
242
285
def run_controller(service_name: str, service_spec: serve.SkyServiceSpec,
                   service_task_yaml: str, controller_host: str,
                   controller_port: int):
    """Create a SkyServeController from the arguments and run it.

    Args:
        service_name: Name of the service the controller manages.
        service_spec: Service specification handed to the controller.
        service_task_yaml: Service task YAML argument, forwarded to the
            controller as-is.
        controller_host: Host passed to the controller.
        controller_port: Port passed to the controller.
    """
    SkyServeController(service_name, service_spec, service_task_yaml,
                       controller_host, controller_port).run()
@@ -1,8 +1,10 @@
1
1
  """LoadBalancer: Distribute any incoming request to all ready replicas."""
2
2
  import asyncio
3
3
  import logging
4
+ import os
4
5
  import threading
5
- from typing import Dict, Optional, Union
6
+ import traceback
7
+ from typing import Dict, List, Optional, Union
6
8
 
7
9
  import aiohttp
8
10
  import fastapi
@@ -28,11 +30,13 @@ class SkyServeLoadBalancer:
28
30
  """
29
31
 
30
32
  def __init__(
31
- self,
32
- controller_url: str,
33
- load_balancer_port: int,
34
- load_balancing_policy_name: Optional[str] = None,
35
- tls_credential: Optional[serve_utils.TLSCredential] = None) -> None:
33
+ self,
34
+ controller_url: str,
35
+ load_balancer_port: int,
36
+ load_balancing_policy_name: Optional[str] = None,
37
+ tls_credential: Optional[serve_utils.TLSCredential] = None,
38
+ target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None
39
+ ) -> None:
36
40
  """Initialize the load balancer.
37
41
 
38
42
  Args:
@@ -42,6 +46,9 @@ class SkyServeLoadBalancer:
42
46
  to use. Defaults to None.
43
47
  tls_credential: The TLS credentials for HTTPS endpoint. Defaults
44
48
  to None.
49
+ target_qps_per_replica: Target QPS per replica for instance-aware
50
+ load balancing. Can be a float or dict mapping GPU types to QPS.
51
+ Defaults to None.
45
52
  """
46
53
  self._app = fastapi.FastAPI()
47
54
  self._controller_url: str = controller_url
@@ -49,6 +56,15 @@ class SkyServeLoadBalancer:
49
56
  # Use the registry to create the load balancing policy
50
57
  self._load_balancing_policy = lb_policies.LoadBalancingPolicy.make(
51
58
  load_balancing_policy_name)
59
+
60
+ # Set accelerator QPS for instance-aware policies
61
+ if (target_qps_per_replica and
62
+ isinstance(target_qps_per_replica, dict) and
63
+ isinstance(self._load_balancing_policy,
64
+ lb_policies.InstanceAwareLeastLoadPolicy)):
65
+ self._load_balancing_policy.set_target_qps_per_accelerator(
66
+ target_qps_per_replica)
67
+
52
68
  logger.info('Starting load balancer with policy '
53
69
  f'{load_balancing_policy_name}.')
54
70
  self._request_aggregator: serve_utils.RequestsAggregator = (
@@ -69,6 +85,56 @@ class SkyServeLoadBalancer:
69
85
  # updating it from _sync_with_controller.
70
86
  self._client_pool_lock: threading.Lock = threading.Lock()
71
87
 
88
async def _sync_with_controller_once(self) -> List[asyncio.Task]:
    """Run a single sync round with the controller.

    Posts the aggregated request information to the controller's
    ``/controller/load_balancer_sync`` endpoint, then uses the
    ``replica_info`` mapping in the response to refresh the load balancing
    policy and the per-replica HTTP client pool.

    Returns:
        The un-awaited ``aclose()`` calls of clients whose replica URL is no
        longer reported, so the caller decides when to close them. The list
        is empty when the sync request fails with a client error or a
        timeout.
    """
    pending_closes = []

    async with aiohttp.ClientSession() as session:
        sync_url = self._controller_url + '/controller/load_balancer_sync'
        try:
            # Send request information
            async with session.post(
                    sync_url,
                    json={
                        'request_aggregator':
                            self._request_aggregator.to_dict()
                    },
                    timeout=aiohttp.ClientTimeout(5),
            ) as response:
                # Clean up after reporting request info to avoid OOM.
                self._request_aggregator.clear()
                response.raise_for_status()
                payload = await response.json()
                replica_info = payload.get('replica_info', {})
                ready_replica_urls = list(replica_info.keys())
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            logger.error(f'An error occurred when syncing with '
                         f'the controller: {e}'
                         f'\nTraceback: {traceback.format_exc()}')
            return pending_closes

        logger.info(f'Available Replica URLs: {ready_replica_urls}')
        with self._client_pool_lock:
            policy = self._load_balancing_policy
            policy.set_ready_replicas(ready_replica_urls)
            # Set replica info for instance-aware policies
            if isinstance(policy, lb_policies.InstanceAwareLeastLoadPolicy):
                policy.set_replica_info(replica_info)
            # Create a client for every replica that does not have one yet.
            for url in ready_replica_urls:
                if url not in self._client_pool:
                    self._client_pool[url] = httpx.AsyncClient(base_url=url)
            # Drop clients of replicas that are no longer reported; closing
            # them is left to the caller.
            stale_urls = set(self._client_pool.keys()) - set(ready_replica_urls)
            stale_clients = [self._client_pool.pop(url) for url in stale_urls]
            pending_closes.extend(
                client.aclose() for client in stale_clients)
    return pending_closes
137
+
72
138
  async def _sync_with_controller(self):
73
139
  """Sync with controller periodically.
74
140
 
@@ -82,49 +148,16 @@ class SkyServeLoadBalancer:
82
148
  await asyncio.sleep(5)
83
149
 
84
150
  while True:
85
- close_client_tasks = []
86
- async with aiohttp.ClientSession() as session:
87
- try:
88
- # Send request information
89
- async with session.post(
90
- self._controller_url +
91
- '/controller/load_balancer_sync',
92
- json={
93
- 'request_aggregator':
94
- self._request_aggregator.to_dict()
95
- },
96
- timeout=aiohttp.ClientTimeout(5),
97
- ) as response:
98
- # Clean up after reporting request info to avoid OOM.
99
- self._request_aggregator.clear()
100
- response.raise_for_status()
101
- response_json = await response.json()
102
- ready_replica_urls = response_json.get(
103
- 'ready_replica_urls', [])
104
- except aiohttp.ClientError as e:
105
- logger.error('An error occurred when syncing with '
106
- f'the controller: {e}')
107
- else:
108
- logger.info(f'Available Replica URLs: {ready_replica_urls}')
109
- with self._client_pool_lock:
110
- self._load_balancing_policy.set_ready_replicas(
111
- ready_replica_urls)
112
- for replica_url in ready_replica_urls:
113
- if replica_url not in self._client_pool:
114
- self._client_pool[replica_url] = (
115
- httpx.AsyncClient(base_url=replica_url))
116
- urls_to_close = set(
117
- self._client_pool.keys()) - set(ready_replica_urls)
118
- client_to_close = []
119
- for replica_url in urls_to_close:
120
- client_to_close.append(
121
- self._client_pool.pop(replica_url))
122
- for client in client_to_close:
123
- close_client_tasks.append(client.aclose())
124
-
125
- await asyncio.sleep(constants.LB_CONTROLLER_SYNC_INTERVAL_SECONDS)
126
- # Await those tasks after the interval to avoid blocking.
127
- await asyncio.gather(*close_client_tasks)
151
+ try:
152
+ close_client_tasks = await self._sync_with_controller_once()
153
+ await asyncio.sleep(
154
+ constants.LB_CONTROLLER_SYNC_INTERVAL_SECONDS)
155
+ # Await those tasks after the interval to avoid blocking.
156
+ await asyncio.gather(*close_client_tasks)
157
+ except Exception as e: # pylint: disable=broad-except
158
+ logger.error(f'An error occurred when syncing with '
159
+ f'the controller: {e}'
160
+ f'\nTraceback: {traceback.format_exc()}')
128
161
 
129
162
  async def _proxy_request_to(
130
163
  self, url: str, request: fastapi.Request
@@ -168,7 +201,8 @@ class SkyServeLoadBalancer:
168
201
  background=background.BackgroundTask(background_func))
169
202
  except (httpx.RequestError, httpx.HTTPStatusError) as e:
170
203
  logger.error(f'Error when proxy request to {url}: '
171
- f'{common_utils.format_exception(e)}')
204
+ f'{common_utils.format_exception(e)}'
205
+ f'\nTraceback: {traceback.format_exc()}')
172
206
  return e
173
207
 
174
208
  async def _proxy_with_retries(
@@ -243,7 +277,8 @@ class SkyServeLoadBalancer:
243
277
  protocol = 'https' if self._tls_credential is not None else 'http'
244
278
 
245
279
  logger.info('SkyServe Load Balancer started on '
246
- f'{protocol}://0.0.0.0:{self._load_balancer_port}')
280
+ f'{protocol}://0.0.0.0:{self._load_balancer_port}. '
281
+ f'PID: {os.getpid()}')
247
282
 
248
283
  uvicorn.run(self._app,
249
284
  host='0.0.0.0',
@@ -252,23 +287,31 @@ class SkyServeLoadBalancer:
252
287
 
253
288
 
254
289
def run_load_balancer(
    controller_addr: str,
    load_balancer_port: int,
    load_balancing_policy_name: Optional[str] = None,
    tls_credential: Optional[serve_utils.TLSCredential] = None,
    target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None
) -> None:
    """Create a SkyServeLoadBalancer from the arguments and run it.

    Args:
        controller_addr: The address of the controller; passed to the load
            balancer as ``controller_url``.
        load_balancer_port: The port where the load balancer listens to.
        load_balancing_policy_name: The name of the load balancing policy
            to use. Defaults to None.
        tls_credential: The TLS credentials for HTTPS endpoint. Defaults to
            None.
        target_qps_per_replica: Target QPS per replica for instance-aware
            load balancing. Can be a float or dict mapping GPU types to QPS.
            Defaults to None.
    """
    balancer_kwargs = {
        'controller_url': controller_addr,
        'load_balancer_port': load_balancer_port,
        'load_balancing_policy_name': load_balancing_policy_name,
        'tls_credential': tls_credential,
        'target_qps_per_replica': target_qps_per_replica,
    }
    SkyServeLoadBalancer(**balancer_kwargs).run()
273
316
 
274
317
 
@@ -292,5 +335,8 @@ if __name__ == '__main__':
292
335
  help=f'The load balancing policy to use. Available policies: '
293
336
  f'{", ".join(available_policies)}.')
294
337
  args = parser.parse_args()
295
- run_load_balancer(args.controller_addr, args.load_balancer_port,
296
- args.load_balancing_policy)
338
+ run_load_balancer(args.controller_addr,
339
+ args.load_balancer_port,
340
+ args.load_balancing_policy,
341
+ tls_credential=None,
342
+ target_qps_per_replica=None)