skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -2,11 +2,10 @@
2
2
  import copy
3
3
  import dataclasses
4
4
  import enum
5
- import getpass
6
5
  import os
7
6
  import tempfile
8
7
  import typing
9
- from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
8
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Set
10
9
  import uuid
11
10
 
12
11
  import colorama
@@ -24,10 +23,14 @@ from sky.clouds import gcp
24
23
  from sky.data import data_utils
25
24
  from sky.data import storage as storage_lib
26
25
  from sky.jobs import constants as managed_job_constants
26
+ from sky.provision.kubernetes import constants as kubernetes_constants
27
27
  from sky.serve import constants as serve_constants
28
+ from sky.serve import serve_state
29
+ from sky.server import config as server_config
28
30
  from sky.setup_files import dependencies
29
31
  from sky.skylet import constants
30
32
  from sky.skylet import log_lib
33
+ from sky.utils import annotations
31
34
  from sky.utils import common
32
35
  from sky.utils import common_utils
33
36
  from sky.utils import config_utils
@@ -35,10 +38,16 @@ from sky.utils import env_options
35
38
  from sky.utils import registry
36
39
  from sky.utils import rich_utils
37
40
  from sky.utils import ux_utils
41
+ from sky.utils import yaml_utils
38
42
 
39
43
  if typing.TYPE_CHECKING:
44
+ import psutil
45
+
40
46
  from sky import task as task_lib
41
47
  from sky.backends import cloud_vm_ray_backend
48
+ else:
49
+ from sky.adaptors import common as adaptors_common
50
+ psutil = adaptors_common.LazyImport('psutil')
42
51
 
43
52
  logger = sky_logging.init_logger(__name__)
44
53
 
@@ -63,8 +72,9 @@ class _ControllerSpec:
63
72
  """Spec for skypilot controllers."""
64
73
  controller_type: str
65
74
  name: str
66
- cluster_name: str
67
- in_progress_hint: str
75
+ _cluster_name_func: Callable[[], str]
76
+ _cluster_name_from_server: Optional[str] # For client-side only
77
+ in_progress_hint: Callable[[bool], str]
68
78
  decline_cancel_hint: str
69
79
  _decline_down_when_failed_to_fetch_status_hint: str
70
80
  decline_down_for_dirty_controller_hint: str
@@ -84,6 +94,24 @@ class _ControllerSpec:
84
94
  return self._check_cluster_name_hint.format(
85
95
  cluster_name=self.cluster_name)
86
96
 
97
+ @property
98
+ def cluster_name(self) -> str:
99
+ """The cluster name of the controller.
100
+
101
+ On the server-side, the cluster name is the actual cluster name,
102
+ which is read from common.(JOB|SKY_SERVE)_CONTROLLER_NAME.
103
+
104
+ On the client-side, the cluster name may not be accurate,
105
+ as we may not know the exact name, because we are missing
106
+ the server-side common.SERVER_ID. We have to wait until
107
+ we get the actual cluster name from the server.
108
+ """
109
+ return (self._cluster_name_from_server if self._cluster_name_from_server
110
+ is not None else self._cluster_name_func())
111
+
112
+ def set_cluster_name_from_server(self, cluster_name: str) -> None:
113
+ self._cluster_name_from_server = cluster_name
114
+
87
115
 
88
116
  # TODO: refactor controller class to not be an enum.
89
117
  class Controllers(enum.Enum):
@@ -93,10 +121,11 @@ class Controllers(enum.Enum):
93
121
  JOBS_CONTROLLER = _ControllerSpec(
94
122
  controller_type='jobs',
95
123
  name='managed jobs controller',
96
- cluster_name=common.JOB_CONTROLLER_NAME,
97
- in_progress_hint=(
98
- '* {job_info}To see all managed jobs: '
99
- f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
124
+ _cluster_name_func=lambda: common.JOB_CONTROLLER_NAME,
125
+ _cluster_name_from_server=None,
126
+ in_progress_hint=lambda _:
127
+ ('* {job_info}To see all managed jobs: '
128
+ f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
100
129
  decline_cancel_hint=(
101
130
  'Cancelling the jobs controller\'s jobs is not allowed.\nTo cancel '
102
131
  f'managed jobs, use: {colorama.Style.BRIGHT}sky jobs cancel '
@@ -124,10 +153,14 @@ class Controllers(enum.Enum):
124
153
  SKY_SERVE_CONTROLLER = _ControllerSpec(
125
154
  controller_type='serve',
126
155
  name='serve controller',
127
- cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
156
+ _cluster_name_func=lambda: common.SKY_SERVE_CONTROLLER_NAME,
157
+ _cluster_name_from_server=None,
128
158
  in_progress_hint=(
129
- f'* To see detailed service status: {colorama.Style.BRIGHT}'
130
- f'sky serve status -v{colorama.Style.RESET_ALL}'),
159
+ lambda pool:
160
+ (f'* To see detailed pool status: {colorama.Style.BRIGHT}'
161
+ f'sky jobs pool status -v{colorama.Style.RESET_ALL}') if pool else
162
+ (f'* To see detailed service status: {colorama.Style.BRIGHT}'
163
+ f'sky serve status -v{colorama.Style.RESET_ALL}')),
131
164
  decline_cancel_hint=(
132
165
  'Cancelling the sky serve controller\'s jobs is not allowed.'),
133
166
  _decline_down_when_failed_to_fetch_status_hint=(
@@ -154,7 +187,9 @@ class Controllers(enum.Enum):
154
187
  default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
155
188
 
156
189
  @classmethod
157
- def from_name(cls, name: Optional[str]) -> Optional['Controllers']:
190
+ def from_name(cls,
191
+ name: Optional[str],
192
+ expect_exact_match: bool = True) -> Optional['Controllers']:
158
193
  """Check if the cluster name is a controller name.
159
194
 
160
195
  Returns:
@@ -168,15 +203,32 @@ class Controllers(enum.Enum):
168
203
  # we may not know the exact name, because we are missing the server-side
169
204
  # common.SERVER_ID. So, we will assume anything that matches the prefix
170
205
  # is a controller.
206
+ prefix = None
171
207
  if name.startswith(common.SKY_SERVE_CONTROLLER_PREFIX):
172
208
  controller = cls.SKY_SERVE_CONTROLLER
209
+ prefix = common.SKY_SERVE_CONTROLLER_PREFIX
173
210
  elif name.startswith(common.JOB_CONTROLLER_PREFIX):
174
211
  controller = cls.JOBS_CONTROLLER
175
- if controller is not None and name != controller.value.cluster_name:
212
+ prefix = common.JOB_CONTROLLER_PREFIX
213
+
214
+ if controller is not None and expect_exact_match:
215
+ assert name == controller.value.cluster_name, (
216
+ name, controller.value.cluster_name)
217
+ elif controller is not None and name != controller.value.cluster_name:
176
218
  # The client-side cluster_name is not accurate. Assume that `name`
177
219
  # is the actual cluster name, so need to set the controller's
178
220
  # cluster name to the input name.
179
- controller.value.cluster_name = name
221
+
222
+ # Assert that the cluster name is well-formed. It should be
223
+ # {prefix}{hash}, where prefix is set above, and hash is a valid
224
+ # user hash.
225
+ assert prefix is not None, prefix
226
+ assert name.startswith(prefix), name
227
+ assert common_utils.is_valid_user_hash(name[len(prefix):]), (name,
228
+ prefix)
229
+
230
+ # Update the cluster name.
231
+ controller.value.set_cluster_name_from_server(name)
180
232
  return controller
181
233
 
182
234
  @classmethod
@@ -193,27 +245,35 @@ class Controllers(enum.Enum):
193
245
  return None
194
246
 
195
247
 
196
- def high_availability_specified(cluster_name: Optional[str],
197
- skip_warning: bool = True) -> bool:
248
def get_controller_for_pool(pool: bool) -> Controllers:
    """Select the controller to use depending on the `pool` flag.

    Args:
        pool: If truthy, the jobs controller is selected; otherwise the
            serve controller is selected.

    Returns:
        Controllers.JOBS_CONTROLLER when `pool` is truthy, else
        Controllers.SKY_SERVE_CONTROLLER.
    """
    return (Controllers.JOBS_CONTROLLER
            if pool else Controllers.SKY_SERVE_CONTROLLER)
253
+
254
+
255
+ def high_availability_specified(cluster_name: Optional[str]) -> bool:
198
256
  """Check if the controller high availability is specified in user config.
199
257
  """
200
- controller = Controllers.from_name(cluster_name)
258
+ controller = Controllers.from_name(cluster_name, expect_exact_match=False)
201
259
  if controller is None:
202
260
  return False
203
261
 
262
+ if controller.value.controller_type == 'jobs':
263
+ # pylint: disable-next=import-outside-toplevel
264
+ from sky.jobs import utils as managed_job_utils
265
+ if managed_job_utils.is_consolidation_mode():
266
+ return True
267
+ elif controller.value.controller_type == 'serve':
268
+ # pylint: disable-next=import-outside-toplevel
269
+ from sky.serve import serve_utils
270
+ if serve_utils.is_consolidation_mode():
271
+ return True
272
+
204
273
  if skypilot_config.loaded():
205
- high_availability = skypilot_config.get_nested(
206
- (controller.value.controller_type, 'controller',
207
- 'high_availability'), False)
208
- if high_availability:
209
- if controller.value.controller_type != 'serve':
210
- if not skip_warning:
211
- print(f'{colorama.Fore.RED}High availability controller is'
212
- 'only supported for SkyServe controller. It cannot'
213
- f'be enabled for {controller.value.name}.'
214
- f'Skipping this flag.{colorama.Style.RESET_ALL}')
215
- else:
216
- return True
274
+ return skypilot_config.get_nested((controller.value.controller_type,
275
+ 'controller', 'high_availability'),
276
+ False)
217
277
  return False
218
278
 
219
279
 
@@ -250,6 +310,13 @@ def _get_cloud_dependencies_installation_commands(
250
310
  sky_check.get_cached_enabled_clouds_or_refresh(
251
311
  sky_cloud.CloudCapability.STORAGE))
252
312
  enabled_clouds = enabled_compute_clouds.union(enabled_storage_clouds)
313
+ enabled_k8s_and_ssh = [
314
+ repr(cloud)
315
+ for cloud in enabled_clouds
316
+ if isinstance(cloud, clouds.Kubernetes)
317
+ ]
318
+ k8s_and_ssh_label = ' and '.join(sorted(enabled_k8s_and_ssh))
319
+ k8s_dependencies_installed = False
253
320
 
254
321
  for cloud in enabled_clouds:
255
322
  cloud_python_dependencies: List[str] = copy.deepcopy(
@@ -269,10 +336,33 @@ def _get_cloud_dependencies_installation_commands(
269
336
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
270
337
  commands.append(f'echo -en "\\r{step_prefix}GCP SDK{empty_str}" &&'
271
338
  f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
272
- elif isinstance(cloud, clouds.Kubernetes):
339
+ if clouds.cloud_in_iterable(clouds.Kubernetes(), enabled_clouds):
340
+ # Install gke-gcloud-auth-plugin used for exec-auth with GKE.
341
+ # We install the plugin here instead of the next elif branch
342
+ # because gcloud is required to install the plugin, so the order
343
+ # of command execution is critical.
344
+
345
+ # We install plugin here regardless of whether exec-auth is
346
+ # actually used as exec-auth may be used in the future.
347
+ # TODO (kyuds): how to implement conservative installation?
348
+ commands.append(
349
+ '(command -v gke-gcloud-auth-plugin &>/dev/null || '
350
+ '(gcloud components install gke-gcloud-auth-plugin --quiet &>/dev/null))') # pylint: disable=line-too-long
351
+ elif isinstance(cloud, clouds.Nebius):
273
352
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
274
353
  commands.append(
275
- f'echo -en "\\r{step_prefix}Kubernetes{empty_str}" && '
354
+ f'echo -en "\\r{step_prefix}Nebius{empty_str}" && '
355
+ 'curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh ' # pylint: disable=line-too-long
356
+ '| sudo NEBIUS_INSTALL_FOLDER=/usr/local/bin bash &> /dev/null && '
357
+ 'nebius profile create --profile sky '
358
+ '--endpoint api.nebius.cloud '
359
+ '--service-account-file $HOME/.nebius/credentials.json '
360
+ '&> /dev/null || echo "Unable to create Nebius profile."')
361
+ elif (isinstance(cloud, clouds.Kubernetes) and
362
+ not k8s_dependencies_installed):
363
+ step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
364
+ commands.append(
365
+ f'echo -en "\\r{step_prefix}{k8s_and_ssh_label}{empty_str}" && '
276
366
  # Install k8s + skypilot dependencies
277
367
  'sudo bash -c "if '
278
368
  '! command -v curl &> /dev/null || '
@@ -292,7 +382,10 @@ def _get_cloud_dependencies_installation_commands(
292
382
  '(curl -s -LO "https://dl.k8s.io/release/v1.31.6'
293
383
  '/bin/linux/$ARCH/kubectl" && '
294
384
  'sudo install -o root -g root -m 0755 '
295
- 'kubectl /usr/local/bin/kubectl))')
385
+ 'kubectl /usr/local/bin/kubectl)) && '
386
+ f'echo -e \'#!/bin/bash\\nexport PATH="{kubernetes_constants.SKY_K8S_EXEC_AUTH_PATH}"\\nexec "$@"\' | sudo tee /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER} > /dev/null && ' # pylint: disable=line-too-long
387
+ f'sudo chmod +x /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER}') # pylint: disable=line-too-long
388
+ k8s_dependencies_installed = True
296
389
  elif isinstance(cloud, clouds.Cudo):
297
390
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
298
391
  commands.append(
@@ -345,7 +438,7 @@ def check_cluster_name_not_controller(
345
438
  Returns:
346
439
  None, if the cluster name is not a controller name.
347
440
  """
348
- controller = Controllers.from_name(cluster_name)
441
+ controller = Controllers.from_name(cluster_name, expect_exact_match=False)
349
442
  if controller is not None:
350
443
  msg = controller.value.check_cluster_name_hint
351
444
  if operation_str is not None:
@@ -355,10 +448,11 @@ def check_cluster_name_not_controller(
355
448
 
356
449
 
357
450
  # Internal only:
358
- def download_and_stream_latest_job_log(
451
+ def download_and_stream_job_log(
359
452
  backend: 'cloud_vm_ray_backend.CloudVmRayBackend',
360
453
  handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
361
- local_dir: str) -> Optional[str]:
454
+ local_dir: str,
455
+ job_ids: Optional[List[str]] = None) -> Optional[str]:
362
456
  """Downloads and streams the latest job log.
363
457
 
364
458
  This function is only used by jobs controller and sky serve controller.
@@ -376,7 +470,7 @@ def download_and_stream_latest_job_log(
376
470
  # multi-node cluster is preempted, and we recover the managed job
377
471
  # on the existing cluster, which leads to a larger job_id. Those
378
472
  # job_ids all represent the same logical managed job.
379
- job_ids=None,
473
+ job_ids=job_ids,
380
474
  local_dir=local_dir)
381
475
  except Exception as e: # pylint: disable=broad-except
382
476
  # We want to avoid crashing the controller. sync_down_logs() is pretty
@@ -394,7 +488,7 @@ def download_and_stream_latest_job_log(
394
488
  return None
395
489
 
396
490
  log_dir = list(log_dirs.values())[0]
397
- log_file = os.path.join(log_dir, 'run.log')
491
+ log_file = os.path.expanduser(os.path.join(log_dir, 'run.log'))
398
492
 
399
493
  # Print the logs to the console.
400
494
  # TODO(zhwu): refactor this into log_utils, along with the refactoring for
@@ -439,10 +533,13 @@ def shared_controller_vars_to_fill(
439
533
  # before popping allowed_contexts. If it is not on Kubernetes,
440
534
  # we may be able to use allowed_contexts.
441
535
  local_user_config.pop('allowed_contexts', None)
536
+ # Remove api_server config so that the controller does not try to use
537
+ # a remote API server.
538
+ local_user_config.pop('api_server', None)
442
539
  with tempfile.NamedTemporaryFile(
443
540
  delete=False,
444
541
  suffix=_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX) as temp_file:
445
- common_utils.dump_yaml(temp_file.name, dict(**local_user_config))
542
+ yaml_utils.dump_yaml(temp_file.name, dict(**local_user_config))
446
543
  local_user_config_path = temp_file.name
447
544
 
448
545
  vars_to_fill: Dict[str, Any] = {
@@ -461,7 +558,7 @@ def shared_controller_vars_to_fill(
461
558
  env_vars.update({
462
559
  # Should not use $USER here, as that env var can be empty when
463
560
  # running in a container.
464
- constants.USER_ENV_VAR: getpass.getuser(),
561
+ constants.USER_ENV_VAR: common_utils.get_current_user_name(),
465
562
  constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
466
563
  # Skip cloud identity check to avoid the overhead.
467
564
  env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.env_key: '1',
@@ -472,7 +569,15 @@ def shared_controller_vars_to_fill(
472
569
  # with a remote API server.
473
570
  constants.USING_REMOTE_API_SERVER_ENV_VAR: str(
474
571
  common_utils.get_using_remote_api_server()),
572
+ constants.IS_SKYPILOT_SERVE_CONTROLLER:
573
+ ('true'
574
+ if controller == Controllers.SKY_SERVE_CONTROLLER else 'false'),
475
575
  })
576
+ override_concurrent_launches = os.environ.get(
577
+ constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES, None)
578
+ if override_concurrent_launches is not None:
579
+ env_vars[constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES] = str(
580
+ int(override_concurrent_launches))
476
581
  if skypilot_config.loaded():
477
582
  # Only set the SKYPILOT_CONFIG env var if the user has a config file.
478
583
  env_vars[
@@ -504,6 +609,30 @@ def get_controller_resources(
504
609
  if custom_controller_resources_config is not None:
505
610
  controller_resources_config_copied.update(
506
611
  custom_controller_resources_config)
612
+ # Compatibility with the old way of specifying the controller autostop
613
+ # config. TODO(cooperc): Remove this before 0.12.0.
614
+ custom_controller_autostop_config = skypilot_config.get_nested(
615
+ (controller.value.controller_type, 'controller', 'autostop'), None)
616
+ if custom_controller_autostop_config is not None:
617
+ logger.warning(
618
+ f'{colorama.Fore.YELLOW}Warning: Config value '
619
+ f'`{controller.value.controller_type}.controller.autostop` '
620
+ 'is deprecated. Please use '
621
+ f'`{controller.value.controller_type}.controller.resources.'
622
+ f'autostop` instead.{colorama.Style.RESET_ALL}')
623
+ # Only set the autostop config if it is not already specified.
624
+ if controller_resources_config_copied.get('autostop') is None:
625
+ controller_resources_config_copied['autostop'] = (
626
+ custom_controller_autostop_config)
627
+ else:
628
+ logger.warning(f'{colorama.Fore.YELLOW}Ignoring the old '
629
+ 'config, since it is already specified in '
630
+ f'resources.{colorama.Style.RESET_ALL}')
631
+ # Set the default autostop config for the controller, if not already
632
+ # specified.
633
+ if controller_resources_config_copied.get('autostop') is None:
634
+ controller_resources_config_copied['autostop'] = (
635
+ controller.value.default_autostop_config)
507
636
 
508
637
  try:
509
638
  controller_resources = resources.Resources.from_yaml_config(
@@ -529,12 +658,16 @@ def get_controller_resources(
529
658
  controller_resources_to_use: resources.Resources = list(
530
659
  controller_resources)[0]
531
660
 
532
- controller_record = global_user_state.get_cluster_from_name(
661
+ controller_handle = global_user_state.get_handle_from_cluster_name(
533
662
  controller.value.cluster_name)
534
- if controller_record is not None:
535
- handle = controller_record.get('handle', None)
536
- if handle is not None:
537
- controller_resources_to_use = handle.launched_resources
663
+ if controller_handle is not None:
664
+ if controller_handle is not None:
665
+ # Use the existing resources, but override the autostop config with
666
+ # the one currently specified in the config.
667
+ controller_resources_to_use = (
668
+ controller_handle.launched_resources.copy(
669
+ autostop=controller_resources_config_copied.get('autostop'))
670
+ )
538
671
 
539
672
  # If the controller and replicas are from the same cloud (and region/zone),
540
673
  # it should provide better connectivity. We will let the controller choose
@@ -595,8 +728,9 @@ def get_controller_resources(
595
728
  controller_zone = controller_resources_to_use.zone
596
729
 
597
730
  # Filter clouds if controller_resources_to_use.cloud is specified.
598
- filtered_clouds = ({controller_cloud} if controller_cloud is not None else
599
- requested_clouds_with_region_zone.keys())
731
+ filtered_clouds: Set[str] = {controller_cloud
732
+ } if controller_cloud is not None else set(
733
+ requested_clouds_with_region_zone.keys())
600
734
 
601
735
  # Filter regions and zones and construct the result.
602
736
  result: Set[resources.Resources] = set()
@@ -605,15 +739,17 @@ def get_controller_resources(
605
739
  {None: {None}})
606
740
 
607
741
  # Filter regions if controller_resources_to_use.region is specified.
608
- filtered_regions = ({controller_region} if controller_region is not None
609
- else regions.keys())
742
+ filtered_regions: Set[Optional[str]] = ({
743
+ controller_region
744
+ } if controller_region is not None else set(regions.keys()))
610
745
 
611
746
  for region in filtered_regions:
612
747
  zones = regions.get(region, {None})
613
748
 
614
749
  # Filter zones if controller_resources_to_use.zone is specified.
615
- filtered_zones = ({controller_zone}
616
- if controller_zone is not None else zones)
750
+ filtered_zones: Set[Optional[str]] = ({
751
+ controller_zone
752
+ } if controller_zone is not None else set(zones))
617
753
 
618
754
  # Create combinations of cloud, region, and zone.
619
755
  for zone in filtered_zones:
@@ -628,38 +764,15 @@ def get_controller_resources(
628
764
  return result
629
765
 
630
766
 
631
- def get_controller_autostop_config(
632
- controller: Controllers) -> Tuple[Optional[int], bool]:
633
- """Get the autostop config for the controller.
634
-
635
- Returns:
636
- A tuple of (idle_minutes_to_autostop, down), which correspond to the
637
- values passed to execution.launch().
638
- """
639
- controller_autostop_config_copied: Dict[str, Any] = copy.copy(
640
- controller.value.default_autostop_config)
641
- if skypilot_config.loaded():
642
- custom_controller_autostop_config = skypilot_config.get_nested(
643
- (controller.value.controller_type, 'controller', 'autostop'), None)
644
- if custom_controller_autostop_config is False:
645
- # Disabled with `autostop: false` in config.
646
- # To indicate autostop is disabled, we return None for
647
- # idle_minutes_to_autostop.
648
- return None, False
649
- elif custom_controller_autostop_config is True:
650
- # Enabled with default values. There is no change in behavior, but
651
- # this is included by for completeness, since `False` is valid.
652
- pass
653
- elif custom_controller_autostop_config is not None:
654
- # We have specific config values.
655
- # Override the controller autostop config with the ones specified in
656
- # the config.
657
- assert isinstance(custom_controller_autostop_config, dict)
658
- controller_autostop_config_copied.update(
659
- custom_controller_autostop_config)
660
-
661
- return (controller_autostop_config_copied['idle_minutes'],
662
- controller_autostop_config_copied['down'])
767
def get_controller_mem_size_gb() -> float:
    """Return the memory size to assume for the controller.

    The content of constants.CONTROLLER_K8S_MEMORY_FILE is parsed as a float
    and returned as-is when that file exists and holds a valid number.
    Otherwise the value of common_utils.get_mem_size_gb() is returned.

    Returns:
        The memory size as a float.
    """
    memory_file = os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE)
    try:
        with open(memory_file, 'r', encoding='utf-8') as f:
            return float(f.read())
    except FileNotFoundError:
        # No memory file present: fall through to the generic lookup.
        pass
    except ValueError:
        # An empty or malformed file (e.g. read while it is still being
        # written) must not crash the caller; fall back to the generic
        # lookup instead, but leave a trace for debugging.
        logger.warning(f'Failed to parse controller memory size from '
                       f'{memory_file}; falling back to the system memory '
                       'size.')
    return common_utils.get_mem_size_gb()
663
776
 
664
777
 
665
778
  def _setup_proxy_command_on_controller(
@@ -690,7 +803,7 @@ def _setup_proxy_command_on_controller(
690
803
  # NOTE: suppose that we have a controller in old VPC, then user
691
804
  # changes 'vpc_name' in the config and does a 'job launch' /
692
805
  # 'serve up'. In general, the old controller may not successfully
693
- # launch the job in the new VPC. This happens if the two VPCs dont
806
+ # launch the job in the new VPC. This happens if the two VPCs don't
694
807
  # have peering set up. Like other places in the code, we assume
695
808
  # properly setting up networking is user's responsibilities.
696
809
  # TODO(zongheng): consider adding a basic check that checks
@@ -701,7 +814,11 @@ def _setup_proxy_command_on_controller(
701
814
  config = config_utils.Config.from_dict(user_config)
702
815
  proxy_command_key = (str(controller_launched_cloud).lower(),
703
816
  'ssh_proxy_command')
704
- ssh_proxy_command = config.get_nested(proxy_command_key, None)
817
+ ssh_proxy_command = skypilot_config.get_effective_region_config(
818
+ cloud=str(controller_launched_cloud).lower(),
819
+ region=None,
820
+ keys=('ssh_proxy_command',),
821
+ default_value=None)
705
822
  if isinstance(ssh_proxy_command, str):
706
823
  config.set_nested(proxy_command_key, None)
707
824
  elif isinstance(ssh_proxy_command, dict):
@@ -731,9 +848,9 @@ def replace_skypilot_config_path_in_file_mounts(
731
848
  continue
732
849
  if local_path.endswith(_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX):
733
850
  with tempfile.NamedTemporaryFile('w', delete=False) as f:
734
- user_config = common_utils.read_yaml(local_path)
851
+ user_config = yaml_utils.read_yaml(local_path)
735
852
  config = _setup_proxy_command_on_controller(cloud, user_config)
736
- common_utils.dump_yaml(f.name, dict(**config))
853
+ yaml_utils.dump_yaml(f.name, dict(**config))
737
854
  file_mounts[remote_path] = f.name
738
855
  replaced = True
739
856
  if replaced:
@@ -776,7 +893,7 @@ def translate_local_file_mounts_to_two_hop(
776
893
  file_mount_id = 0
777
894
 
778
895
  file_mounts_to_translate = task.file_mounts or {}
779
- if task.workdir is not None:
896
+ if task.workdir is not None and isinstance(task.workdir, str):
780
897
  file_mounts_to_translate[constants.SKY_REMOTE_WORKDIR] = task.workdir
781
898
  task.workdir = None
782
899
 
@@ -844,7 +961,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
844
961
  copy_mounts = {}
845
962
 
846
963
  has_local_source_paths_file_mounts = bool(copy_mounts)
847
- has_local_source_paths_workdir = task.workdir is not None
964
+ has_local_source_paths_workdir = (task.workdir is not None and
965
+ isinstance(task.workdir, str))
848
966
 
849
967
  msg = None
850
968
  if has_local_source_paths_workdir and has_local_source_paths_file_mounts:
@@ -892,7 +1010,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
892
1010
 
893
1011
  # Step 1: Translate the workdir to SkyPilot storage.
894
1012
  new_storage_mounts = {}
895
- if task.workdir is not None:
1013
+ if task.workdir is not None and isinstance(task.workdir, str):
896
1014
  workdir = task.workdir
897
1015
  task.workdir = None
898
1016
  if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or
@@ -1113,3 +1231,179 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
1113
1231
  task.update_storage_mounts(updated_mount_storages)
1114
1232
  if msg:
1115
1233
  logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
1234
+
1235
+
1236
+ # ======================= Resources Management Functions =======================
1237
+
1238
+ # Monitoring process for service is 512MB. This is based on an old
1239
+ # estimation but we keep it here for now.
1240
+ # TODO(tian): Remeasure this.
1241
+ SERVE_MONITORING_MEMORY_MB = 512
1242
+ # The resource consumption ratio of service launch to serve down.
1243
+ SERVE_LAUNCH_RATIO = 2.0
1244
+
1245
+ # The _RESOURCES_LOCK should be held whenever we are checking the parallelism
1246
+ # control or updating the schedule_state of any job or service. Any code that
1247
+ # takes this lock must conclude by calling maybe_schedule_next_jobs.
1248
+ _RESOURCES_LOCK = '~/.sky/locks/controller_resources.lock'
1249
+
1250
+ # keep 2GB reserved after the controllers
1251
+ MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
1252
+
1253
+ # NOTE: In the current implementation, we only consider memory.
1254
+ # The ratio of resources consumption for managed jobs and pool/serve.
1255
+ # This measures pool_resources / jobs_resources. If 2 GB memory is allocated to
1256
+ # jobs, then 2 * POOL_JOBS_RESOURCES_RATIO GB memory is allocated to pool/serve.
1257
+ POOL_JOBS_RESOURCES_RATIO = 1
1258
+ # Number of ongoing launches allowed per worker. Can probably be
1259
+ # increased a bit to around 16, but keeping it lower just to be safe.
1260
+ LAUNCHES_PER_WORKER = 8
1261
+ # Number of ongoing launches allowed per service. Can probably be increased
1262
+ # a bit as well.
1263
+ LAUNCHES_PER_SERVICE = 4
1264
+
1265
+ # Based on testing, each worker takes around 200-300MB memory. Keeping it
1266
+ # higher to be safe.
1267
+ JOB_WORKER_MEMORY_MB = 400
1268
+ # This can probably be increased to around 300-400, but keeping it lower just
1269
+ # to be safe.
1270
+ MAX_JOBS_PER_WORKER = 200
1271
+ # Maximum number of controllers that can be running. Hard to handle more than
1272
+ # 512 launches at once.
1273
+ MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER
1274
+ # Limit the number of jobs that can be running at once on the entire jobs
1275
+ # controller cluster. It's hard to handle cancellation of more than 2000 jobs at
1276
+ # once.
1277
+ # TODO(cooperc): Once we eliminate static bottlenecks (e.g. sqlite), remove this
1278
+ # hardcoded max limit.
1279
+ MAX_TOTAL_RUNNING_JOBS = 2000
1280
+
1281
+
1282
def compute_memory_reserved_for_controllers(
        reserve_for_controllers: bool, reserve_extra_for_pool: bool) -> float:
    """Compute the amount of memory, in MB, to reserve for controllers.

    Args:
        reserve_for_controllers: If truthy, the base reservation is
            MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB; otherwise it is 0.
        reserve_extra_for_pool: If truthy, the base reservation is scaled
            by (1 + POOL_JOBS_RESOURCES_RATIO).

    Returns:
        The reserved memory in MB as a float.
    """
    base_reserved_mb = (float(MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB)
                        if reserve_for_controllers else 0.0)
    if not reserve_extra_for_pool:
        return base_reserved_mb
    return base_reserved_mb * (1. + POOL_JOBS_RESOURCES_RATIO)
1290
+
1291
+
1292
def _get_total_usable_memory_mb(pool: bool, consolidation_mode: bool) -> float:
    """Return the memory, in MB, left for job controllers / services.

    The starting point is common_utils.get_mem_size_gb() converted to MB,
    minus the reservation from compute_memory_reserved_for_controllers().
    In consolidation mode, the memory of the API server's long and short
    workers (guaranteed plus burstable parallelism, times the per-worker
    memory) is subtracted as well. The result is not clamped.

    Args:
        pool: Forwarded as `reserve_extra_for_pool` when computing the
            controller reservation.
        consolidation_mode: Whether to also subtract the API server
            workers' memory.
    """
    reserved_mb = compute_memory_reserved_for_controllers(
        reserve_for_controllers=True, reserve_extra_for_pool=pool)
    available_mb = common_utils.get_mem_size_gb() * 1024 - reserved_mb
    if not consolidation_mode:
        return available_mb

    api_server_config = server_config.compute_server_config(
        deploy=True, quiet=True, reserved_memory_mb=reserved_mb)
    long_workers = api_server_config.long_worker_config
    long_workers_mb = ((long_workers.garanteed_parallelism +
                        long_workers.burstable_parallelism) *
                       server_config.LONG_WORKER_MEM_GB * 1024)
    short_workers = api_server_config.short_worker_config
    short_workers_mb = ((short_workers.garanteed_parallelism +
                         short_workers.burstable_parallelism) *
                        server_config.SHORT_WORKER_MEM_GB * 1024)
    return available_mb - (long_workers_mb + short_workers_mb)
1309
+
1310
+
1311
def _is_consolidation_mode(pool: bool) -> bool:
    """Read the `consolidation_mode` controller setting from the config.

    The value is looked up under `jobs.controller.consolidation_mode` when
    `pool` is truthy and under `serve.controller.consolidation_mode`
    otherwise, defaulting to False.
    """
    section = 'jobs' if pool else 'serve'
    config_keys = (section, 'controller', 'consolidation_mode')
    return skypilot_config.get_nested(config_keys, default_value=False)
1315
+
1316
+
1317
@annotations.lru_cache(scope='request')
def _get_parallelism(pool: bool, raw_resource_per_unit: float) -> int:
    """Returns the number of jobs controllers / services that should be running.

    The number is the usable memory divided by the memory needed per
    controller / service, rounded down, and never less than 1.

    In consolidation mode, the existing API server is used, so the
    per-unit requirement is only `raw_resource_per_unit`; how many requests
    are allowed is left to the API server.

    In non-consolidation mode, a local API server runs on the jobs/serve
    controller, so each unit additionally accounts for the memory of its
    launch workers: LAUNCHES_PER_WORKER (pool) or LAUNCHES_PER_SERVICE
    (otherwise) long workers of server_config.LONG_WORKER_MEM_GB each.

    Args:
        pool: Whether the computation is for a pool on the jobs controller.
            If so, the per-unit requirement is scaled by
            (1 + POOL_JOBS_RESOURCES_RATIO) to account for the resources
            consumed by the jobs.
        raw_resource_per_unit: Memory in MB needed by one controller /
            service, excluding API server workers.
    """
    consolidation_mode = _is_consolidation_mode(pool)
    usable_memory_mb = _get_total_usable_memory_mb(pool, consolidation_mode)

    if consolidation_mode:
        # Resource management for requests is done by the API server.
        worker_memory_per_unit_mb = 0.
    else:
        # We need to do the resource management ourselves.
        concurrent_launches = (LAUNCHES_PER_WORKER
                               if pool else LAUNCHES_PER_SERVICE)
        worker_memory_per_unit_mb = (concurrent_launches *
                                     server_config.LONG_WORKER_MEM_GB * 1024)

    scale = (1. + POOL_JOBS_RESOURCES_RATIO) if pool else 1.
    memory_per_unit_mb = scale * (raw_resource_per_unit +
                                  worker_memory_per_unit_mb)

    return max(int(usable_memory_mb / memory_per_unit_mb), 1)
1356
+
1357
+
1358
def get_number_of_jobs_controllers() -> int:
    """Return how many jobs controllers may run, capped at MAX_CONTROLLERS.

    The memory-based bound comes from _get_parallelism() with
    JOB_WORKER_MEMORY_MB as the per-controller memory.
    """
    memory_bound = _get_parallelism(pool=True,
                                    raw_resource_per_unit=JOB_WORKER_MEMORY_MB)
    return min(MAX_CONTROLLERS, memory_bound)
1362
+
1363
+
1364
@annotations.lru_cache(scope='global', maxsize=1)
def get_resources_lock_path() -> str:
    """Return the expanded path of the controller resources lock file.

    The parent directory of the lock file is created if it does not exist.
    """
    lock_path = os.path.expanduser(_RESOURCES_LOCK)
    lock_dir = os.path.dirname(lock_path)
    os.makedirs(lock_dir, exist_ok=True)
    return lock_path
1369
+
1370
+
1371
def _get_number_of_services(pool: bool) -> int:
    """Return how many services may run, based on available memory.

    Each service is assumed to need
    SERVE_MONITORING_MEMORY_MB * POOL_JOBS_RESOURCES_RATIO MB; the result
    is computed by _get_parallelism().
    """
    memory_per_service_mb = (SERVE_MONITORING_MEMORY_MB *
                             POOL_JOBS_RESOURCES_RATIO)
    return _get_parallelism(pool=pool,
                            raw_resource_per_unit=memory_per_service_mb)
1375
+
1376
+
1377
@annotations.lru_cache(scope='request')
def _get_request_parallelism(pool: bool) -> int:
    """Return the number of concurrent launch requests allowed.

    For non-pool callers, the environment variable named by
    constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES overrides the computed
    value when it is set. Otherwise the result is the per-unit launch limit
    (LAUNCHES_PER_WORKER for pool, LAUNCHES_PER_SERVICE otherwise) times
    POOL_JOBS_RESOURCES_RATIO times the number of services.
    """
    # NOTE(dev): One smoke test depends on this value.
    # tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update
    # assumes 4 concurrent launches.
    env_override = os.environ.get(constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES,
                                  None)
    if not pool and env_override is not None:
        return int(env_override)
    # Limitation per service x number of services
    per_unit_launches = LAUNCHES_PER_WORKER if pool else LAUNCHES_PER_SERVICE
    return (per_unit_launches * POOL_JOBS_RESOURCES_RATIO *
            _get_number_of_services(pool))
1391
+
1392
+
1393
def can_provision(pool: bool) -> bool:
    """Return whether a new provision request may be issued.

    Currently this applies exactly the same check as can_terminate().
    """
    # TODO(tian): probe API server to see if there is any pending provision
    # requests.
    allowed = can_terminate(pool)
    return allowed
1397
+
1398
+
1399
def can_start_new_process(pool: bool) -> bool:
    """Return whether the number of services is below the allowed maximum.

    Compares serve_state.get_num_services() against
    _get_number_of_services(pool).
    """
    current_services = serve_state.get_num_services()
    max_services = _get_number_of_services(pool)
    return current_services < max_services
1401
+
1402
+
1403
def can_terminate(pool: bool) -> bool:
    """Return whether a new terminate request may be issued.

    The in-flight load is the number of provisioning replicas plus the
    number of terminating replicas divided by SERVE_LAUNCH_RATIO. A new
    request is allowed while that load is below
    _get_request_parallelism(pool).
    """
    # TODO(tian): probe API server to see if there is any pending terminate
    # requests.
    provisioning = serve_state.total_number_provisioning_replicas()
    terminating = serve_state.total_number_terminating_replicas()
    in_flight_load = provisioning + terminating / SERVE_LAUNCH_RATIO
    return in_flight_load < _get_request_parallelism(pool)