skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1093 @@
1
+ """Implementation of the SkyServe core APIs."""
2
+ import pathlib
3
+ import re
4
+ import shlex
5
+ import signal
6
+ import tempfile
7
+ import threading
8
+ import typing
9
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
10
+ import uuid
11
+
12
+ import colorama
13
+ import filelock
14
+
15
+ from sky import backends
16
+ from sky import exceptions
17
+ from sky import execution
18
+ from sky import sky_logging
19
+ from sky import skypilot_config
20
+ from sky import task as task_lib
21
+ from sky.adaptors import common as adaptors_common
22
+ from sky.backends import backend_utils
23
+ from sky.catalog import common as service_catalog_common
24
+ from sky.data import storage as storage_lib
25
+ from sky.serve import constants as serve_constants
26
+ from sky.serve import serve_rpc_utils
27
+ from sky.serve import serve_state
28
+ from sky.serve import serve_utils
29
+ from sky.server.requests import request_names
30
+ from sky.skylet import constants
31
+ from sky.skylet import job_lib
32
+ from sky.utils import admin_policy_utils
33
+ from sky.utils import command_runner
34
+ from sky.utils import common
35
+ from sky.utils import common_utils
36
+ from sky.utils import controller_utils
37
+ from sky.utils import dag_utils
38
+ from sky.utils import rich_utils
39
+ from sky.utils import subprocess_utils
40
+ from sky.utils import ux_utils
41
+ from sky.utils import yaml_utils
42
+
43
+ if typing.TYPE_CHECKING:
44
+ import grpc
45
+ else:
46
+ grpc = adaptors_common.LazyImport('grpc')
47
+
48
+ logger = sky_logging.init_logger(__name__)
49
+
50
+
51
def _rewrite_tls_credential_paths_and_get_tls_env_vars(
        service_name: str, task: 'task_lib.Task') -> Dict[str, Any]:
    """Rewrite the paths of TLS credentials in the task.

    Swaps the local keyfile/certfile paths on the task's service spec for
    their controller-side counterparts, and returns the template variables
    needed to render the controller task YAML.

    Args:
        service_name: Name of the service.
        task: sky.Task to rewrite.

    Returns:
        The generated template variables for TLS.
    """
    spec = task.service
    # Already checked by validate_service_task
    assert spec is not None
    if spec.tls_credential is None:
        return {'use_tls': False}
    keyfile_on_controller = (
        serve_utils.generate_remote_tls_keyfile_name(service_name))
    certfile_on_controller = (
        serve_utils.generate_remote_tls_certfile_name(service_name))
    template_vars: Dict[str, Any] = {
        'use_tls': True,
        'remote_tls_keyfile': keyfile_on_controller,
        'remote_tls_certfile': certfile_on_controller,
        'local_tls_keyfile': spec.tls_credential.keyfile,
        'local_tls_certfile': spec.tls_credential.certfile,
    }
    # Point the task at the controller-side credential locations so the
    # controller reads the synced copies, not the user's local paths.
    spec.tls_credential = serve_utils.TLSCredential(keyfile_on_controller,
                                                    certfile_on_controller)
    return template_vars
81
+
82
+
83
def _get_service_record(
        service_name: str, pool: bool,
        handle: backends.CloudVmRayResourceHandle,
        backend: backends.CloudVmRayBackend) -> Optional[Dict[str, Any]]:
    """Get the service record.

    Queries the controller for the status of a single service or pool,
    preferring the gRPC path when the controller handle reports gRPC is
    enabled, and otherwise (or on an unimplemented-RPC error) falling back
    to running the legacy codegen on the controller head node.

    Args:
        service_name: Name of the service or pool to look up.
        pool: Whether the target is a pool (affects wording and codegen).
        handle: Resource handle of the controller cluster.
        backend: Backend used to run the legacy codegen on the head node.

    Returns:
        The status record for the service/pool, or None if it does not
        exist on the controller.

    Raises:
        RuntimeError: If the legacy status command fails on the controller.
    """
    noun = 'pool' if pool else 'service'

    assert isinstance(handle, backends.CloudVmRayResourceHandle)
    use_legacy = not handle.is_grpc_enabled_with_flag

    if not use_legacy:
        try:
            service_statuses = serve_rpc_utils.RpcRunner.get_service_status(
                handle, [service_name], pool)
        except exceptions.SkyletMethodNotImplementedError:
            # Controller runs an older skylet without this RPC; fall back
            # to the legacy codegen path below.
            use_legacy = True

    if use_legacy:
        code = serve_utils.ServeCodeGen.get_service_status([service_name],
                                                           pool=pool)
        returncode, serve_status_payload, stderr = backend.run_on_head(
            handle,
            code,
            require_outputs=True,
            stream_logs=False,
            separate_stderr=True)
        try:
            subprocess_utils.handle_returncode(returncode,
                                               code,
                                               f'Failed to get {noun} status',
                                               stderr,
                                               stream_logs=True)
        except exceptions.CommandError as e:
            raise RuntimeError(e.error_msg) from e

        service_statuses = serve_utils.load_service_status(serve_status_payload)

    # We queried for exactly one name, so at most one record is expected.
    assert len(service_statuses) <= 1, service_statuses
    if not service_statuses:
        return None
    return service_statuses[0]
124
+
125
+
126
def _maybe_display_run_warning(task: 'task_lib.Task') -> None:
    """Warn that a pool ignores the task's `run` section, if one is set."""
    # We do not block the user from creating a pool with a run section
    # in order to enable using the same yaml for pool creation
    # and job submission. But we want to make it clear that 'run' will not
    # be respected here.
    if task.run is None:
        return
    logger.warning(
        f'{colorama.Fore.YELLOW} Pool creation does not support the '
        '`run` section. Creating the pool while ignoring the '
        f'`run` section.{colorama.Style.RESET_ALL}')
136
+
137
+
138
def up(
    task: 'task_lib.Task',
    service_name: Optional[str] = None,
    pool: bool = False,
) -> Tuple[str, str]:
    """Spins up a service or a pool.

    Validates the task, applies the admin policy, renders and launches the
    controller task, then waits for the controller to register the new
    service/pool and resolves its endpoint.

    Args:
        task: sky.Task to serve up.
        service_name: Name for the service/pool; auto-generated if None.
        pool: Whether to spin up a pool instead of a service.

    Returns:
        A tuple of (service_name, endpoint).

    Raises:
        ValueError: If the service name does not match the cluster name
            regex.
        exceptions.NotSupportedError: If cloud-based file_mounts are given
            but no cloud storage is enabled.
        RuntimeError: If the controller fails to register the service
            (e.g. name conflict or max-services limit reached).
    """
    task.validate()
    serve_utils.validate_service_task(task, pool=pool)
    assert task.service is not None
    assert task.service.pool == pool, 'Inconsistent pool flag.'
    noun = 'pool' if pool else 'service'
    capnoun = noun.capitalize()
    if service_name is None:
        service_name = serve_utils.generate_service_name(pool)

    # The service name will be used as:
    # 1. controller cluster name: 'sky-serve-controller-<service_name>'
    # 2. replica cluster name: '<service_name>-<replica_id>'
    # In both cases, service name shares the same regex with cluster name.
    if re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, service_name) is None:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'{capnoun} name {service_name!r} is invalid: '
                             f'ensure it is fully matched by regex (e.g., '
                             'only contains lower letters, numbers and dash): '
                             f'{constants.CLUSTER_NAME_VALID_REGEX}')

    dag = dag_utils.convert_entrypoint_to_dag(task)
    # Always apply the policy again here, even though it might have been
    # applied in the CLI. This is to ensure that we apply the policy to the
    # final DAG and get the mutated config.
    dag, mutated_user_config = admin_policy_utils.apply(
        dag, request_name=request_names.AdminPolicyRequestName.SERVE_UP)
    dag.resolve_and_validate_volumes()
    dag.pre_mount_volumes()
    task = dag.tasks[0]
    assert task.service is not None
    if pool:
        _maybe_display_run_warning(task)
        # Use dummy run script for pool.
        task.run = serve_constants.POOL_DUMMY_RUN_COMMAND

    with rich_utils.safe_status(
            ux_utils.spinner_message(f'Initializing {noun}')):
        # Handle file mounts using two-hop approach when cloud storage
        # unavailable
        storage_clouds = (
            storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
        force_disable_cloud_bucket = skypilot_config.get_nested(
            ('serve', 'force_disable_cloud_bucket'), False)
        if storage_clouds and not force_disable_cloud_bucket:
            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
                task, task_type='serve')
            local_to_controller_file_mounts = {}
        else:
            # Fall back to two-hop file_mount uploading when no cloud storage
            if task.storage_mounts:
                raise exceptions.NotSupportedError(
                    'Cloud-based file_mounts are specified, but no cloud '
                    'storage is available. Please specify local '
                    'file_mounts only.')
            local_to_controller_file_mounts = (
                controller_utils.translate_local_file_mounts_to_two_hop(task))

        tls_template_vars = _rewrite_tls_credential_paths_and_get_tls_env_vars(
            service_name, task)

    with tempfile.NamedTemporaryFile(
            prefix=f'service-task-{service_name}-',
            mode='w',
    ) as service_file, tempfile.NamedTemporaryFile(
            prefix=f'controller-task-{service_name}-',
            mode='w',
    ) as controller_file:
        controller = controller_utils.get_controller_for_pool(pool)
        controller_name = controller.value.cluster_name
        task_config = task.to_yaml_config()
        yaml_utils.dump_yaml(service_file.name, task_config)
        remote_tmp_task_yaml_path = (
            serve_utils.generate_remote_tmp_task_yaml_file_name(service_name))
        remote_config_yaml_path = (
            serve_utils.generate_remote_config_yaml_file_name(service_name))
        controller_log_file = (
            serve_utils.generate_remote_controller_log_file_name(service_name))
        controller_resources = controller_utils.get_controller_resources(
            controller=controller, task_resources=task.resources)
        controller_job_id = None
        if serve_utils.is_consolidation_mode(pool):
            # We need a unique integer per sky.serve.up call to avoid name
            # conflict. Originally in non-consolidation mode, this is the ray
            # job id; now we use the request id hash instead. Here we also
            # make sure it is a 32-bit integer to avoid overflow on
            # sqlalchemy.
            rid = common_utils.get_current_request_id()
            controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFF

        # Template variables used to render the controller task YAML.
        vars_to_fill = {
            'remote_task_yaml_path': remote_tmp_task_yaml_path,
            'local_task_yaml_path': service_file.name,
            'service_name': service_name,
            'controller_log_file': controller_log_file,
            'remote_user_config_path': remote_config_yaml_path,
            'local_to_controller_file_mounts': local_to_controller_file_mounts,
            'modified_catalogs':
                service_catalog_common.get_modified_catalog_file_mounts(),
            'consolidation_mode_job_id': controller_job_id,
            'entrypoint': shlex.quote(common_utils.get_current_command()),
            **tls_template_vars,
            **controller_utils.shared_controller_vars_to_fill(
                controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
                remote_user_config_path=remote_config_yaml_path,
                local_user_config=mutated_user_config,
            ),
        }
        common_utils.fill_template(serve_constants.CONTROLLER_TEMPLATE,
                                   vars_to_fill,
                                   output_path=controller_file.name)
        controller_task = task_lib.Task.from_yaml(controller_file.name)
        # TODO(tian): Probably run another sky.launch after we get the load
        # balancer port from the controller? So we don't need to open so many
        # ports here. Or, we should have a nginx traffic control to refuse
        # any connection to the unregistered ports.
        if not pool:
            controller_resources = {
                r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
                for r in controller_resources
            }
        controller_task.set_resources(controller_resources)

        # Set service_name so the backend will know to modify default ray
        # task CPU usage to custom value instead of default 0.5 vCPU. We need
        # to set it to a smaller value to support a larger number of services.
        controller_task.service_name = service_name

        # We directly submit the request to the controller and let the
        # controller to check name conflict. Suppose we have multiple
        # sky.serve.up() with same service name, the first one will
        # successfully write its job id to controller service database;
        # and for all following sky.serve.up(), the controller will throw
        # an exception (name conflict detected) and exit. Therefore the
        # controller job id in database could be use as an indicator of
        # whether the service is already running. If the id is the same
        # with the current job id, we know the service is up and running
        # for the first time; otherwise it is a name conflict.
        # Since the controller may be shared among multiple users, launch the
        # controller with the API server's user hash.
        if not serve_utils.is_consolidation_mode(pool):
            print(f'{colorama.Fore.YELLOW}Launching controller for '
                  f'{service_name!r}...{colorama.Style.RESET_ALL}')
            with common.with_server_user():
                with skypilot_config.local_active_workspace_ctx(
                        constants.SKYPILOT_DEFAULT_WORKSPACE):
                    controller_job_id, controller_handle = execution.launch(
                        task=controller_task,
                        cluster_name=controller_name,
                        retry_until_up=True,
                        _request_name=request_names.AdminPolicyRequestName.
                        SERVE_LAUNCH_CONTROLLER,
                        _disable_controller_check=True,
                    )
        else:
            # Consolidation mode: no separate controller cluster launch; run
            # the controller task's script directly on the existing
            # controller.
            controller_type = controller_utils.get_controller_for_pool(pool)
            controller_handle = backend_utils.is_controller_accessible(
                controller=controller_type, stopped_message='')
            backend = backend_utils.get_backend_from_handle(controller_handle)
            assert isinstance(backend, backends.CloudVmRayBackend)
            backend.sync_file_mounts(
                handle=controller_handle,
                all_file_mounts=controller_task.file_mounts,
                storage_mounts=controller_task.storage_mounts)
            run_script = controller_task.run
            assert isinstance(run_script, str)
            # Manually add the env variables to the run script. Originally
            # this is done in ray jobs submission but now we have to do it
            # manually because there is no ray runtime on the API server.
            env_cmds = [
                f'export {k}={v!r}' for k, v in controller_task.envs.items()
            ]
            run_script = '\n'.join(env_cmds + [run_script])
            # Dump script for high availability recovery.
            serve_state.set_ha_recovery_script(service_name, run_script)
            backend.run_on_head(controller_handle, run_script)

        style = colorama.Style
        fore = colorama.Fore

        assert controller_job_id is not None and controller_handle is not None
        assert isinstance(controller_handle, backends.CloudVmRayResourceHandle)
        backend = backend_utils.get_backend_from_handle(controller_handle)
        assert isinstance(backend, backends.CloudVmRayBackend)
        # TODO(tian): Cache endpoint locally to speedup. Endpoint won't
        # change after the first time, so there is no consistency issue.
        try:
            with rich_utils.safe_status(
                    ux_utils.spinner_message(
                        f'Waiting for the {noun} to register')):
                # This function will check the controller job id in the
                # database and return the endpoint if the job id matches.
                # Otherwise it will return None.
                use_legacy = not controller_handle.is_grpc_enabled_with_flag

                if controller_handle.is_grpc_enabled_with_flag:
                    try:
                        lb_port = serve_rpc_utils.RpcRunner.wait_service_registration(  # pylint: disable=line-too-long
                            controller_handle, service_name, controller_job_id,
                            pool)
                    except exceptions.SkyletMethodNotImplementedError:
                        # Older skylet without the RPC; fall back to codegen.
                        use_legacy = True

                if use_legacy:
                    code = serve_utils.ServeCodeGen.wait_service_registration(
                        service_name, controller_job_id, pool)
                    returncode, lb_port_payload, _ = backend.run_on_head(
                        controller_handle,
                        code,
                        require_outputs=True,
                        stream_logs=False)
                    subprocess_utils.handle_returncode(
                        returncode, code,
                        f'Failed to wait for {noun} initialization',
                        lb_port_payload)
                    lb_port = serve_utils.load_service_initialization_result(
                        lb_port_payload)
        except (exceptions.CommandError, grpc.FutureTimeoutError,
                grpc.RpcError):
            if serve_utils.is_consolidation_mode(pool):
                with ux_utils.print_exception_no_traceback():
                    raise RuntimeError(
                        f'Failed to wait for {noun} initialization. '
                        'Please check the logs above for more details.'
                    ) from None
            # Inspect the controller job to give a more precise error.
            statuses = backend.get_job_status(controller_handle,
                                              [controller_job_id],
                                              stream_logs=False)
            controller_job_status = list(statuses.values())[0]
            if controller_job_status == job_lib.JobStatus.PENDING:
                # Max number of services reached due to vCPU constraint.
                # The controller job is pending due to ray job scheduling.
                # We manually cancel the job here.
                backend.cancel_jobs(controller_handle, [controller_job_id])
                with ux_utils.print_exception_no_traceback():
                    raise RuntimeError(
                        'Max number of services reached. '
                        'To spin up more services, please '
                        'tear down some existing services.') from None
            else:
                # Possible cases:
                # (1) name conflict;
                # (2) max number of services reached due to memory
                # constraint. The job will successfully run on the
                # controller, but there will be an error thrown due
                # to memory constraint check in the controller.
                # See sky/serve/service.py for more details.
                with ux_utils.print_exception_no_traceback():
                    raise RuntimeError(
                        'Failed to spin up the service. Please '
                        'check the logs above for more details.') from None
        else:
            # Registration succeeded: resolve the user-facing endpoint.
            if not serve_utils.is_consolidation_mode(pool) and not pool:
                socket_endpoint = backend_utils.get_endpoints(
                    controller_handle.cluster_name,
                    lb_port,
                    skip_status_check=True).get(lb_port)
            else:
                socket_endpoint = f'localhost:{lb_port}'
            assert socket_endpoint is not None, (
                'Did not get endpoint for controller.')
            # Already checked by validate_service_task
            assert task.service is not None
            protocol = ('http'
                        if task.service.tls_credential is None else 'https')
            socket_endpoint = socket_endpoint.replace('https://', '').replace(
                'http://', '')
            endpoint = f'{protocol}://{socket_endpoint}'

        if pool:
            logger.info(
                f'{fore.CYAN}Pool name: '
                f'{style.BRIGHT}{service_name}{style.RESET_ALL}'
                f'\n📋 Useful Commands'
                f'\n{ux_utils.INDENT_SYMBOL}To submit jobs to the pool:\t'
                f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
                f'<yaml_file>{ux_utils.RESET_BOLD}'
                f'\n{ux_utils.INDENT_SYMBOL}To submit multiple jobs:\t'
                f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
                f'--num-jobs 10 <yaml_file>{ux_utils.RESET_BOLD}'
                f'\n{ux_utils.INDENT_SYMBOL}To check the pool status:\t'
                f'{ux_utils.BOLD}sky jobs pool status {service_name}'
                f'{ux_utils.RESET_BOLD}'
                f'\n{ux_utils.INDENT_LAST_SYMBOL}To terminate the pool:\t'
                f'{ux_utils.BOLD}sky jobs pool down {service_name}'
                f'{ux_utils.RESET_BOLD}'
                f'\n{ux_utils.INDENT_SYMBOL}To update the number of workers:\t'
                f'{ux_utils.BOLD}sky jobs pool apply --pool {service_name} '
                f'--workers 5{ux_utils.RESET_BOLD}'
                '\n\n' + ux_utils.finishing_message('Successfully created pool '
                                                    f'{service_name!r}.'))
        else:
            logger.info(
                f'{fore.CYAN}Service name: '
                f'{style.BRIGHT}{service_name}{style.RESET_ALL}'
                f'\n{fore.CYAN}Endpoint URL: '
                f'{style.BRIGHT}{endpoint}{style.RESET_ALL}'
                f'\n📋 Useful Commands'
                f'\n{ux_utils.INDENT_SYMBOL}To check service status:\t'
                f'{ux_utils.BOLD}sky serve status {service_name} '
                f'[--endpoint]{ux_utils.RESET_BOLD}'
                f'\n{ux_utils.INDENT_SYMBOL}To teardown the service:\t'
                f'{ux_utils.BOLD}sky serve down {service_name}'
                f'{ux_utils.RESET_BOLD}'
                f'\n{ux_utils.INDENT_SYMBOL}To see replica logs:\t'
                f'{ux_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]'
                f'{ux_utils.RESET_BOLD}'
                f'\n{ux_utils.INDENT_SYMBOL}To see load balancer logs:\t'
                f'{ux_utils.BOLD}sky serve logs --load-balancer {service_name}'
                f'{ux_utils.RESET_BOLD}'
                f'\n{ux_utils.INDENT_SYMBOL}To see controller logs:\t'
                f'{ux_utils.BOLD}sky serve logs --controller {service_name}'
                f'{ux_utils.RESET_BOLD}'
                f'\n{ux_utils.INDENT_SYMBOL}To monitor the status:\t'
                f'{ux_utils.BOLD}watch -n10 sky serve status {service_name}'
                f'{ux_utils.RESET_BOLD}'
                f'\n{ux_utils.INDENT_LAST_SYMBOL}To send a test request:\t'
                f'{ux_utils.BOLD}curl {endpoint}'
                f'{ux_utils.RESET_BOLD}'
                '\n\n' + ux_utils.finishing_message(
                    'Service is spinning up and replicas '
                    'will be ready shortly.'))
        return service_name, endpoint
465
+
466
+
467
def update(
    task: Optional['task_lib.Task'],
    service_name: str,
    mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
    pool: bool = False,
    workers: Optional[int] = None,
) -> None:
    """Updates an existing service or pool.

    When `task` is None and `workers` is given (pool only), loads the
    existing task YAML from the service record and only updates the worker
    count. Otherwise applies the admin policy to the new task, registers a
    new version on the controller, syncs the new task YAML, and schedules
    the rolling update.

    Args:
        task: New task to update to, or None to only change worker count.
        service_name: Name of the existing service or pool.
        mode: Update mode (e.g. rolling vs. blue-green).
        pool: Whether the target is a pool.
        workers: New number of workers (pool only; requires task is None
            or is used to load the existing configuration).

    Raises:
        ValueError: If neither task nor workers is given, if workers is
            given for a non-pool service, or if the controller returns an
            unparsable version.
        RuntimeError: If the service does not exist, its controller is
            failed/initializing, or a controller command fails.
    """
    noun = 'pool' if pool else 'service'
    capnoun = noun.capitalize()

    controller_type = controller_utils.get_controller_for_pool(pool)
    handle = backend_utils.is_controller_accessible(
        controller=controller_type,
        stopped_message=
        'Service controller is stopped. There is no service to update. '
        f'To spin up a new service, use {ux_utils.BOLD}'
        f'sky serve up{ux_utils.RESET_BOLD}',
        non_existent_message='Service does not exist. '
        'To spin up a new service, '
        f'use {ux_utils.BOLD}sky serve up{ux_utils.RESET_BOLD}',
    )

    assert isinstance(handle, backends.CloudVmRayResourceHandle)
    backend = backend_utils.get_backend_from_handle(handle)
    assert isinstance(backend, backends.CloudVmRayBackend)

    service_record = _get_service_record(service_name, pool, handle, backend)

    if service_record is None:
        cmd = 'sky jobs pool up' if pool else 'sky serve up'
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(f'Cannot find {noun} {service_name!r}.'
                               f'To spin up a {noun}, use {ux_utils.BOLD}'
                               f'{cmd}{ux_utils.RESET_BOLD}')

    # If task is None and workers is specified, load existing configuration
    # and update replica count.
    if task is None:
        if workers is None:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'Cannot update {noun} without specifying '
                    f'task or workers. Please provide either a task '
                    f'or specify the number of workers.')

        if not pool:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    'Non-pool service, trying to update replicas to '
                    f'{workers} is not supported. Ignoring the update.')

        # Load the existing task configuration from the service's YAML file
        yaml_content = service_record['yaml_content']

        # Load the existing task configuration
        task = task_lib.Task.from_yaml_str(yaml_content)

        if task.service is None:
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError('No service configuration found in '
                                   f'existing {noun} {service_name!r}')
        # Only the worker (replica) count changes in this code path.
        task.set_service(task.service.copy(min_replicas=workers))

    task.validate()
    serve_utils.validate_service_task(task, pool=pool)

    # Now apply the policy and handle task-specific logic
    # Always apply the policy again here, even though it might have been
    # applied in the CLI. This is to ensure that we apply the policy to the
    # final DAG and get the mutated config.
    # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
    # will not apply the config.
    dag, _ = admin_policy_utils.apply(
        task, request_name=request_names.AdminPolicyRequestName.SERVE_UPDATE)
    task = dag.tasks[0]
    if pool:
        _maybe_display_run_warning(task)
        # Use dummy run script for pool.
        task.run = serve_constants.POOL_DUMMY_RUN_COMMAND

    assert task.service is not None
    if not pool and task.service.tls_credential is not None:
        logger.warning('Updating TLS keyfile and certfile is not supported. '
                       'Any updates to the keyfile and certfile will not take '
                       'effect. To update TLS keyfile and certfile, please '
                       'tear down the service and spin up a new one.')

    # Refuse to update while the controller is failed or still initializing.
    prompt = None
    if (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_FAILED
       ):
        prompt = (f'{capnoun} {service_name!r} has a failed controller. '
                  f'Please clean up the {noun} and try again.')
    elif (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT
         ):
        prompt = (f'{capnoun} {service_name!r} is still initializing '
                  'its controller. Please try again later.')
    if prompt is not None:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(prompt)

    if not pool:
        original_lb_policy = service_record['load_balancing_policy']
        assert task.service is not None, 'Service section not found.'
        if original_lb_policy != task.service.load_balancing_policy:
            logger.warning(
                f'{colorama.Fore.YELLOW}Current load balancing policy '
                f'{original_lb_policy!r} is different from the new policy '
                f'{task.service.load_balancing_policy!r}. Updating the load '
                'balancing policy is not supported yet and it will be ignored. '
                'The service will continue to use the current load balancing '
                f'policy.{colorama.Style.RESET_ALL}')

    with rich_utils.safe_status(
            ux_utils.spinner_message(f'Initializing {noun}')):
        controller_utils.maybe_translate_local_file_mounts_and_sync_up(
            task, task_type='serve')

    # Register a new version on the controller; prefer gRPC, fall back to
    # legacy codegen for controllers running an older skylet.
    use_legacy = not handle.is_grpc_enabled_with_flag

    if not use_legacy:
        try:
            current_version = serve_rpc_utils.RpcRunner.add_version(
                handle, service_name)
        except exceptions.SkyletMethodNotImplementedError:
            use_legacy = True

    if use_legacy:
        code = serve_utils.ServeCodeGen.add_version(service_name)
        returncode, version_string_payload, stderr = backend.run_on_head(
            handle,
            code,
            require_outputs=True,
            stream_logs=False,
            separate_stderr=True)
        try:
            subprocess_utils.handle_returncode(returncode,
                                               code,
                                               'Failed to add version',
                                               stderr,
                                               stream_logs=True)
        except exceptions.CommandError as e:
            raise RuntimeError(e.error_msg) from e

        version_string = serve_utils.load_version_string(version_string_payload)
        try:
            current_version = int(version_string)
        except ValueError as e:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(f'Failed to parse version: {version_string}; '
                                 f'Returncode: {returncode}') from e

    with tempfile.NamedTemporaryFile(
            prefix=f'{service_name}-v{current_version}',
            mode='w') as service_file:
        task_config = task.to_yaml_config()
        yaml_utils.dump_yaml(service_file.name, task_config)
        remote_task_yaml_path = serve_utils.generate_task_yaml_file_name(
            service_name, current_version, expand_user=False)

        with sky_logging.silent():
            backend.sync_file_mounts(handle,
                                     {remote_task_yaml_path: service_file.name},
                                     storage_mounts=None)

        # Schedule the actual update; same gRPC-first/legacy-fallback dance.
        use_legacy = not handle.is_grpc_enabled_with_flag

        if not use_legacy:
            try:
                serve_rpc_utils.RpcRunner.update_service(
                    handle, service_name, current_version, mode, pool)
            except exceptions.SkyletMethodNotImplementedError:
                use_legacy = True

        if use_legacy:
            code = serve_utils.ServeCodeGen.update_service(service_name,
                                                           current_version,
                                                           mode=mode.value,
                                                           pool=pool)
            returncode, _, stderr = backend.run_on_head(handle,
                                                        code,
                                                        require_outputs=True,
                                                        stream_logs=False,
                                                        separate_stderr=True)
            try:
                subprocess_utils.handle_returncode(returncode,
                                                   code,
                                                   f'Failed to update {noun}s',
                                                   stderr,
                                                   stream_logs=True)
            except exceptions.CommandError as e:
                raise RuntimeError(e.error_msg) from e

    cmd = 'sky jobs pool status' if pool else 'sky serve status'
    logger.info(
        f'{colorama.Fore.GREEN}{capnoun} {service_name!r} update scheduled.'
        f'{colorama.Style.RESET_ALL}\n'
        f'Please use {ux_utils.BOLD}{cmd} {service_name} '
        f'{ux_utils.RESET_BOLD}to check the latest status.')

    if pool:
        logs_cmd = f'`sky jobs pool logs {service_name} <worker_id>`'
        unit_noun = 'Workers'

    else:
        logs_cmd = f'`sky serve logs {service_name} <replica_id>`'
        unit_noun = 'Replicas'
    logger.info(
        ux_utils.finishing_message(
            f'Successfully updated {noun} {service_name!r} '
            f'to version {current_version}.',
            follow_up_message=
            f'\n{unit_noun} are updating, use {ux_utils.BOLD}{logs_cmd}'
            f'{ux_utils.RESET_BOLD} to check their status.'))
681
+
682
+
683
def apply(
    task: 'task_lib.Task',
    workers: Optional[int],
    service_name: str,
    mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
    pool: bool = False,
) -> None:
    """Applies the config to the service or pool."""
    # Serialize concurrent applies on the same service name.
    lock_path = serve_utils.get_service_filelock_path(service_name)
    with filelock.FileLock(lock_path):
        try:
            controller = controller_utils.get_controller_for_pool(pool)
            controller_handle = backend_utils.is_controller_accessible(
                controller=controller, stopped_message='')
            controller_backend = backend_utils.get_backend_from_handle(
                controller_handle)
            assert isinstance(controller_backend, backends.CloudVmRayBackend)
            record = _get_service_record(service_name, pool, controller_handle,
                                         controller_backend)
            if record is not None:
                # The service/pool already exists: update it in place.
                return update(task, service_name, mode, pool, workers)
        except exceptions.ClusterNotUpError:
            # Controller cluster is not up yet; fall through and create.
            pass
        up(task, service_name, pool)
705
+
706
+
707
def down(
    service_names: Optional[Union[str, List[str]]] = None,
    all: bool = False,  # pylint: disable=redefined-builtin
    purge: bool = False,
    pool: bool = False,
) -> None:
    """Tears down a service or pool.

    Args:
        service_names: A single name or a list of names to terminate.
            Mutually exclusive with `all`.
        all: Terminate all services/pools on the controller.
        purge: Forwarded to the controller's terminate call.
        pool: Whether the targets are pools.

    Raises:
        ValueError: If neither or both of service_names and all are given.
        RuntimeError: If the controller is unreachable, the terminate
            command fails, or the gRPC call errors/times out.
    """
    noun = 'pool' if pool else 'service'
    if service_names is None:
        service_names = []
    if isinstance(service_names, str):
        service_names = [service_names]
    controller_type = controller_utils.get_controller_for_pool(pool)
    handle = backend_utils.is_controller_accessible(
        controller=controller_type,
        stopped_message=f'All {noun}s should have terminated.')

    service_names_str = ','.join(service_names)
    # Exactly one of `service_names` / `all` must be provided.
    if sum([bool(service_names), all]) != 1:
        argument_str = (f'{noun}_names={service_names_str}'
                        if service_names else '')
        argument_str += ' all' if all else ''
        raise ValueError(f'Can only specify one of {noun}_names or all. '
                         f'Provided {argument_str!r}.')

    # None signals "terminate everything" to the controller.
    service_names = None if all else service_names

    try:
        assert isinstance(handle, backends.CloudVmRayResourceHandle)
        use_legacy = not handle.is_grpc_enabled_with_flag

        if not use_legacy:
            try:
                stdout = serve_rpc_utils.RpcRunner.terminate_services(
                    handle, service_names, purge, pool)
            except exceptions.SkyletMethodNotImplementedError:
                # Older skylet without the RPC; use legacy codegen below.
                use_legacy = True

        if use_legacy:
            backend = backend_utils.get_backend_from_handle(handle)
            assert isinstance(backend, backends.CloudVmRayBackend)
            code = serve_utils.ServeCodeGen.terminate_services(
                service_names, purge, pool)

            returncode, stdout, _ = backend.run_on_head(handle,
                                                        code,
                                                        require_outputs=True,
                                                        stream_logs=False)

            subprocess_utils.handle_returncode(returncode, code,
                                               f'Failed to terminate {noun}',
                                               stdout)
    except exceptions.FetchClusterInfoError as e:
        raise RuntimeError(
            'Failed to fetch controller IP. Please refresh controller status '
            f'by `sky status -r {controller_type.value.cluster_name}` and try '
            'again.') from e
    except exceptions.CommandError as e:
        raise RuntimeError(e.error_msg) from e
    except grpc.RpcError as e:
        raise RuntimeError(f'{e.details()} ({e.code()})') from e
    except grpc.FutureTimeoutError as e:
        raise RuntimeError('gRPC timed out') from e

    # Surface the controller's termination message to the user.
    logger.info(stdout)
772
+
773
+
774
def status(
    service_names: Optional[Union[str, List[str]]] = None,
    pool: bool = False,
) -> List[Dict[str, Any]]:
    """Query the controller for status records of services or pools.

    Args:
        service_names: A single name, a list of names, or None to fetch
            every service/pool registered on the controller.
        pool: If True, operate on pools instead of services.

    Returns:
        A list of status dicts. Each record additionally gets an
        'endpoint' key (None for pools, or when no endpoint resolves).

    Raises:
        RuntimeError: On network failure, or when the remote status
            query exits with a non-zero return code.
    """
    noun = 'pool' if pool else 'service'
    # Accept a bare string as a one-element list.
    if isinstance(service_names, str):
        service_names = [service_names]

    try:
        backend_utils.check_network_connection()
    except exceptions.NetworkError as e:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(f'Failed to refresh {noun}s status '
                               'due to network error.') from e

    controller_type = controller_utils.get_controller_for_pool(pool)
    handle = backend_utils.is_controller_accessible(
        controller=controller_type,
        stopped_message=controller_type.value.default_hint_if_non_existent.
        replace('service', noun))

    assert isinstance(handle, backends.CloudVmRayResourceHandle)
    # Prefer the gRPC path; fall back to the legacy codegen path when the
    # controller's skylet does not implement the RPC yet.
    use_legacy = not handle.is_grpc_enabled_with_flag
    if not use_legacy:
        try:
            service_records = serve_rpc_utils.RpcRunner.get_service_status(
                handle, service_names, pool)
        except exceptions.SkyletMethodNotImplementedError:
            use_legacy = True

    if use_legacy:
        backend = backend_utils.get_backend_from_handle(handle)
        assert isinstance(backend, backends.CloudVmRayBackend)
        code = serve_utils.ServeCodeGen.get_service_status(service_names,
                                                           pool=pool)
        returncode, serve_status_payload, stderr = backend.run_on_head(
            handle,
            code,
            require_outputs=True,
            stream_logs=False,
            separate_stderr=True)
        try:
            subprocess_utils.handle_returncode(returncode,
                                               code,
                                               f'Failed to fetch {noun}s',
                                               stderr,
                                               stream_logs=True)
        except exceptions.CommandError as e:
            raise RuntimeError(e.error_msg) from e
        service_records = serve_utils.load_service_status(serve_status_payload)

    # Attach an endpoint to every record. Pools never have one.
    for record in service_records:
        record['endpoint'] = None
        if pool:
            continue
        lb_port = record['load_balancer_port']
        if lb_port is None:
            continue
        try:
            if serve_utils.is_consolidation_mode(pool):
                endpoint = f'localhost:{lb_port}'
            else:
                endpoint = backend_utils.get_endpoints(
                    cluster=common.SKY_SERVE_CONTROLLER_NAME,
                    port=lb_port).get(lb_port, None)
        except exceptions.ClusterNotUpError:
            # Controller is down; leave the endpoint as None.
            pass
        else:
            protocol = 'https' if record['tls_encrypted'] else 'http'
            if endpoint is not None:
                # Strip any scheme the resolver already prepended.
                endpoint = endpoint.replace('https://',
                                            '').replace('http://', '')
            record['endpoint'] = f'{protocol}://{endpoint}'

    return service_records
857
+
858
+
859
# Accepted spelling for a service component argument: the enum itself or
# its string value (e.g. 'controller', 'load_balancer', 'replica').
ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
860
+
861
+
862
def tail_logs(
    service_name: str,
    *,
    target: ServiceComponentOrStr,
    replica_id: Optional[int] = None,
    follow: bool = True,
    tail: Optional[int] = None,
    pool: bool = False,
) -> None:
    """Stream the logs of one component of a service or pool.

    Args:
        service_name: Name of the service/pool.
        target: Which component to stream (controller, load balancer,
            or a replica).
        replica_id: Required iff ``target`` is REPLICA.
        follow: Keep streaming new log lines as they appear.
        tail: Only show the last ``tail`` lines, if given.
        pool: If True, operate on a pool (load balancer unsupported).

    Raises:
        ValueError: On an invalid target / replica_id combination.
    """
    if isinstance(target, str):
        target = serve_utils.ServiceComponent(target)

    if pool and target == serve_utils.ServiceComponent.LOAD_BALANCER:
        raise ValueError(f'Target {target} is not supported for pool.')

    wants_replica = target == serve_utils.ServiceComponent.REPLICA
    # `replica_id` must be present exactly when a replica is targeted.
    if wants_replica and replica_id is None:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                '`replica_id` must be specified when using target=REPLICA.')
    if not wants_replica and replica_id is not None:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('`replica_id` must be None when using '
                             'target=CONTROLLER/LOAD_BALANCER.')

    controller_type = controller_utils.get_controller_for_pool(pool)
    handle = backend_utils.is_controller_accessible(
        controller=controller_type,
        stopped_message=controller_type.value.default_hint_if_non_existent)

    backend = backend_utils.get_backend_from_handle(handle)
    assert isinstance(backend, backends.CloudVmRayBackend), backend

    if wants_replica:
        assert replica_id is not None, service_name
        code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
                                                            replica_id,
                                                            follow,
                                                            tail=tail,
                                                            pool=pool)
    else:
        code = serve_utils.ServeCodeGen.stream_serve_process_logs(
            service_name,
            stream_controller=(
                target == serve_utils.ServiceComponent.CONTROLLER),
            follow=follow,
            tail=tail,
            pool=pool)

    # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
    # kill the process, so we need to handle it manually here.
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)

    # Refer to the notes in
    # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
    backend.run_on_head(handle,
                        code,
                        stream_logs=True,
                        process_stream=False,
                        ssh_mode=command_runner.SshMode.INTERACTIVE)
926
+
927
+
928
def _get_all_replica_targets(
        service_name: str, backend: backends.CloudVmRayBackend,
        handle: backends.CloudVmRayResourceHandle,
        pool: bool) -> Set[serve_utils.ServiceComponentTarget]:
    """Return a ServiceComponentTarget for every live replica.

    Queries the controller (gRPC when available, legacy codegen
    otherwise) for the single named service/pool and builds one REPLICA
    target per entry in its 'replica_info'.

    Raises:
        ValueError: If no service with ``service_name`` exists.
        RuntimeError: If the legacy remote query fails.
    """
    assert isinstance(handle, backends.CloudVmRayResourceHandle)
    # gRPC first; drop to the codegen path if the skylet lacks the RPC.
    use_legacy = not handle.is_grpc_enabled_with_flag
    if not use_legacy:
        try:
            service_records = serve_rpc_utils.RpcRunner.get_service_status(
                handle, [service_name], pool)
        except exceptions.SkyletMethodNotImplementedError:
            use_legacy = True

    if use_legacy:
        code = serve_utils.ServeCodeGen.get_service_status([service_name],
                                                           pool=pool)
        returncode, serve_status_payload, stderr = backend.run_on_head(
            handle,
            code,
            require_outputs=True,
            stream_logs=False,
            separate_stderr=True)
        try:
            subprocess_utils.handle_returncode(returncode,
                                               code,
                                               'Failed to fetch services',
                                               stderr,
                                               stream_logs=True)
        except exceptions.CommandError as e:
            raise RuntimeError(e.error_msg) from e
        service_records = serve_utils.load_service_status(serve_status_payload)

    if not service_records:
        raise ValueError(f'Service {service_name!r} not found.')
    # A single-name query must yield exactly one record.
    assert len(service_records) == 1
    (record,) = service_records

    replica_kind = serve_utils.ServiceComponent.REPLICA
    return {
        serve_utils.ServiceComponentTarget(replica_kind, info['replica_id'])
        for info in record['replica_info']
    }
974
+
975
+
976
def sync_down_logs(
    service_name: str,
    *,
    local_dir: str,
    targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
                   None] = None,
    replica_ids: Optional[List[int]] = None,
    tail: Optional[int] = None,
    pool: bool = False,
) -> str:
    """Download logs of a service or pool into a local directory.

    Args:
        service_name: Name of the service/pool whose logs to download.
        local_dir: Destination directory; one ``<target>.log`` file is
            written per component.
        targets: Component(s) to fetch; None/empty means all components.
        replica_ids: Restrict replica logs to these IDs; None means all
            live replicas.
        tail: Only fetch the last ``tail`` lines of each log, if given.
        pool: If True, operate on a pool instead of a service.

    Returns:
        The ``local_dir`` the logs were written into.
    """
    noun = 'pool' if pool else 'service'
    repnoun = 'worker' if pool else 'replica'
    caprepnoun = repnoun.capitalize()

    # Step 0) get the controller handle
    with rich_utils.safe_status(
            ux_utils.spinner_message(f'Checking {noun} status...')):
        controller_type = controller_utils.get_controller_for_pool(pool)
        handle = backend_utils.is_controller_accessible(
            controller=controller_type,
            stopped_message=controller_type.value.default_hint_if_non_existent)
        backend: backends.CloudVmRayBackend = (
            backend_utils.get_backend_from_handle(handle))

    # Which component kinds were requested?
    components: Set[serve_utils.ServiceComponent]
    if not targets:
        # Nothing specified -> fetch every component kind.
        components = {
            serve_utils.ServiceComponent.CONTROLLER,
            serve_utils.ServiceComponent.LOAD_BALANCER,
            serve_utils.ServiceComponent.REPLICA
        }
    elif isinstance(targets, (str, serve_utils.ServiceComponent)):
        components = {serve_utils.ServiceComponent(targets)}
    else:  # list of components
        components = {serve_utils.ServiceComponent(t) for t in targets}

    # Expand component kinds into concrete (component, replica_id) targets.
    sync_targets: Set[serve_utils.ServiceComponentTarget] = set()
    for singleton in (serve_utils.ServiceComponent.CONTROLLER,
                      serve_utils.ServiceComponent.LOAD_BALANCER):
        if singleton in components:
            sync_targets.add(serve_utils.ServiceComponentTarget(singleton))
    if serve_utils.ServiceComponent.REPLICA in components:
        with rich_utils.safe_status(
                ux_utils.spinner_message(f'Getting live {repnoun} infos...')):
            live_targets = _get_all_replica_targets(service_name, backend,
                                                    handle, pool)
        if not replica_ids:
            # Replica logs requested without specific IDs -> take them all.
            sync_targets.update(live_targets)
        else:
            # Keep only the requested IDs that are actually live.
            for rid in replica_ids:
                candidate = serve_utils.ServiceComponentTarget(
                    serve_utils.ServiceComponent.REPLICA, rid)
                if candidate in live_targets:
                    sync_targets.add(candidate)
                else:
                    logger.warning(
                        f'{caprepnoun} ID {candidate.replica_id} not '
                        f'found for {service_name}. Skipping...')

    def _fetch_one(target: serve_utils.ServiceComponentTarget) -> None:
        """Stream one component's log from the controller into a file."""
        component = target.component
        # We need to set one side of the pipe to a logs stream, and the
        # other side to a file.
        log_path = str(pathlib.Path(local_dir) / f'{target}.log')

        if component == serve_utils.ServiceComponent.REPLICA:
            replica_id = target.replica_id
            assert replica_id is not None, service_name
            stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
                service_name, replica_id, follow=False, tail=tail, pool=pool)
        elif component in (serve_utils.ServiceComponent.CONTROLLER,
                           serve_utils.ServiceComponent.LOAD_BALANCER):
            stream_logs_code = (
                serve_utils.ServeCodeGen.stream_serve_process_logs(
                    service_name,
                    stream_controller=(
                        component == serve_utils.ServiceComponent.CONTROLLER),
                    follow=False,
                    tail=tail,
                    pool=pool))
        else:
            assert False, component

        # Refer to the notes in
        # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
        backend.run_on_head(handle,
                            stream_logs_code,
                            stream_logs=False,
                            process_stream=False,
                            ssh_mode=command_runner.SshMode.INTERACTIVE,
                            log_path=log_path)

    subprocess_utils.run_in_parallel(_fetch_one, list(sync_targets))

    return local_dir