skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic; consult the registry advisory for details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,5 @@
1
1
  """ReplicaManager: handles the creation and deletion of endpoint replicas."""
2
2
  import dataclasses
3
- import enum
4
3
  import functools
5
4
  import multiprocessing
6
5
  from multiprocessing import pool as mp_pool
@@ -12,53 +11,56 @@ import typing
12
11
  from typing import Any, Dict, List, Optional, Tuple
13
12
 
14
13
  import colorama
15
- import psutil
14
+ import filelock
16
15
  import requests
17
16
 
18
- import sky
19
17
  from sky import backends
20
18
  from sky import core
21
19
  from sky import exceptions
22
20
  from sky import execution
23
21
  from sky import global_user_state
24
22
  from sky import sky_logging
23
+ from sky import task as task_lib
25
24
  from sky.backends import backend_utils
26
25
  from sky.serve import constants as serve_constants
27
26
  from sky.serve import serve_state
28
27
  from sky.serve import serve_utils
29
28
  from sky.serve import service
30
29
  from sky.serve import spot_placer
30
+ from sky.server.requests import request_names
31
31
  from sky.skylet import constants
32
32
  from sky.skylet import job_lib
33
33
  from sky.usage import usage_lib
34
34
  from sky.utils import common_utils
35
35
  from sky.utils import controller_utils
36
36
  from sky.utils import env_options
37
+ from sky.utils import resources_utils
37
38
  from sky.utils import status_lib
38
39
  from sky.utils import ux_utils
40
+ from sky.utils import yaml_utils
39
41
 
40
42
  if typing.TYPE_CHECKING:
41
- from sky import resources
42
43
  from sky.serve import service_spec
43
44
 
44
45
  logger = sky_logging.init_logger(__name__)
45
46
 
46
47
  _JOB_STATUS_FETCH_INTERVAL = 30
47
48
  _PROCESS_POOL_REFRESH_INTERVAL = 20
48
- # TODO(tian): Maybe let user determine this threshold
49
- _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180
50
49
  _RETRY_INIT_GAP_SECONDS = 60
51
50
  _DEFAULT_DRAIN_SECONDS = 120
52
51
 
53
- # Since sky.launch is very resource demanding, we limit the number of
54
- # concurrent sky.launch process to avoid overloading the machine.
55
- _MAX_NUM_LAUNCH = psutil.cpu_count() * 2
52
+ # TODO(tian): Backward compatibility. Remove this after 3 minor release, i.e.
53
+ # 0.13.0. We move the ProcessStatus to common_utils.ProcessStatus in #6666, but
54
+ # old ReplicaInfo in database will still tries to unpickle using ProcessStatus
55
+ # in replica_managers. We set this alias to avoid breaking changes. See #6729
56
+ # for more details.
57
+ ProcessStatus = common_utils.ProcessStatus
56
58
 
57
59
 
58
60
  # TODO(tian): Combine this with
59
61
  # sky/spot/recovery_strategy.py::StrategyExecutor::launch
60
62
  def launch_cluster(replica_id: int,
61
- task_yaml_path: str,
63
+ service_task_yaml_path: str,
62
64
  cluster_name: str,
63
65
  resources_override: Optional[Dict[str, Any]] = None,
64
66
  retry_until_up: bool = True,
@@ -78,8 +80,9 @@ def launch_cluster(replica_id: int,
78
80
  f'{cluster_name} with resources override: '
79
81
  f'{resources_override}')
80
82
  try:
81
- config = common_utils.read_yaml(os.path.expanduser(task_yaml_path))
82
- task = sky.Task.from_yaml_config(config)
83
+ config = yaml_utils.read_yaml(
84
+ os.path.expanduser(service_task_yaml_path))
85
+ task = task_lib.Task.from_yaml_config(config)
83
86
  if resources_override is not None:
84
87
  resources = task.resources
85
88
  overrided_resources = [
@@ -105,6 +108,8 @@ def launch_cluster(replica_id: int,
105
108
  execution.launch(task,
106
109
  cluster_name,
107
110
  retry_until_up=retry_until_up,
111
+ _request_name=request_names.AdminPolicyRequestName.
112
+ SERVE_LAUNCH_REPLICA,
108
113
  _is_launched_by_sky_serve_controller=True)
109
114
  logger.info(f'Replica cluster {cluster_name} launched.')
110
115
  except (exceptions.InvalidClusterNameError,
@@ -173,17 +178,19 @@ def terminate_cluster(cluster_name: str,
173
178
  time.sleep(gap_seconds)
174
179
 
175
180
 
176
- def _get_resources_ports(task_yaml: str) -> str:
181
+ def _get_resources_ports(service_task_yaml_path: str) -> str:
177
182
  """Get the resources ports used by the task."""
178
- task = sky.Task.from_yaml(task_yaml)
183
+ task = task_lib.Task.from_yaml(service_task_yaml_path)
179
184
  # Already checked all ports are valid in sky.serve.core.up
180
185
  assert task.resources, task
181
186
  assert task.service is not None, task
187
+ if task.service.pool:
188
+ return '-'
182
189
  assert task.service.ports is not None, task
183
190
  return task.service.ports
184
191
 
185
192
 
186
- def _should_use_spot(task_yaml: str,
193
+ def _should_use_spot(service_task_yaml_path: str,
187
194
  resource_override: Optional[Dict[str, Any]]) -> bool:
188
195
  """Get whether the task should use spot."""
189
196
  if resource_override is not None:
@@ -191,7 +198,7 @@ def _should_use_spot(task_yaml: str,
191
198
  if use_spot_override is not None:
192
199
  assert isinstance(use_spot_override, bool)
193
200
  return use_spot_override
194
- task = sky.Task.from_yaml(task_yaml)
201
+ task = task_lib.Task.from_yaml(service_task_yaml_path)
195
202
  spot_use_resources = [
196
203
  resources for resources in task.resources if resources.use_spot
197
204
  ]
@@ -200,6 +207,12 @@ def _should_use_spot(task_yaml: str,
200
207
  return len(spot_use_resources) == len(task.resources)
201
208
 
202
209
 
210
+ # Every function that calls serve_state.add_or_update_replica should acquire
211
+ # this lock. It is to prevent race condition when the replica status is updated
212
+ # by multiple threads at the same time. The modification of replica info is
213
+ # 2 database calls: read the whole replica info object, unpickle it, and modify
214
+ # corresponding fields. Then it is write back to the database. We need to ensure
215
+ # the read-modify-write operation is atomic.
203
216
  def with_lock(func):
204
217
 
205
218
  @functools.wraps(func)
@@ -210,22 +223,6 @@ def with_lock(func):
210
223
  return wrapper
211
224
 
212
225
 
213
- class ProcessStatus(enum.Enum):
214
- """Process status."""
215
-
216
- # The process is running
217
- RUNNING = 'RUNNING'
218
-
219
- # The process is finished and succeeded
220
- SUCCEEDED = 'SUCCEEDED'
221
-
222
- # The process is interrupted
223
- INTERRUPTED = 'INTERRUPTED'
224
-
225
- # The process failed
226
- FAILED = 'FAILED'
227
-
228
-
229
226
  @dataclasses.dataclass
230
227
  class ReplicaStatusProperty:
231
228
  """Some properties that determine replica status.
@@ -237,15 +234,16 @@ class ReplicaStatusProperty:
237
234
  first_ready_time: The first time the service is ready.
238
235
  sky_down_status: Process status of sky.down.
239
236
  """
240
- # None means sky.launch is not called yet.
241
- sky_launch_status: Optional[ProcessStatus] = None
237
+ # sky.launch will always be scheduled on creation of ReplicaStatusProperty.
238
+ sky_launch_status: common_utils.ProcessStatus = (
239
+ common_utils.ProcessStatus.SCHEDULED)
242
240
  user_app_failed: bool = False
243
241
  service_ready_now: bool = False
244
242
  # None means readiness probe is not succeeded yet;
245
243
  # -1 means the initial delay seconds is exceeded.
246
244
  first_ready_time: Optional[float] = None
247
245
  # None means sky.down is not called yet.
248
- sky_down_status: Optional[ProcessStatus] = None
246
+ sky_down_status: Optional[common_utils.ProcessStatus] = None
249
247
  # Whether the termination is caused by autoscaler's decision
250
248
  is_scale_down: bool = False
251
249
  # The replica's spot instance was preempted.
@@ -300,7 +298,7 @@ class ReplicaStatusProperty:
300
298
  (1) Job status;
301
299
  (2) Readiness probe.
302
300
  """
303
- if self.sky_launch_status != ProcessStatus.SUCCEEDED:
301
+ if self.sky_launch_status != common_utils.ProcessStatus.SUCCEEDED:
304
302
  return False
305
303
  if self.sky_down_status is not None:
306
304
  return False
@@ -314,37 +312,43 @@ class ReplicaStatusProperty:
314
312
 
315
313
  def to_replica_status(self) -> serve_state.ReplicaStatus:
316
314
  """Convert status property to human-readable replica status."""
317
- if self.sky_launch_status is None:
315
+ # Backward compatibility. Before we introduce ProcessStatus.SCHEDULED,
316
+ # we use None to represent sky.launch is not called yet.
317
+ if (self.sky_launch_status is None or
318
+ self.sky_launch_status == common_utils.ProcessStatus.SCHEDULED):
318
319
  # Pending to launch
319
320
  return serve_state.ReplicaStatus.PENDING
320
- if self.sky_launch_status == ProcessStatus.RUNNING:
321
- if self.sky_down_status == ProcessStatus.FAILED:
321
+ if self.sky_launch_status == common_utils.ProcessStatus.RUNNING:
322
+ if self.sky_down_status == common_utils.ProcessStatus.FAILED:
322
323
  return serve_state.ReplicaStatus.FAILED_CLEANUP
323
- if self.sky_down_status == ProcessStatus.SUCCEEDED:
324
+ if self.sky_down_status == common_utils.ProcessStatus.SUCCEEDED:
324
325
  # This indicate it is a scale_down with correct teardown.
325
326
  # Should have been cleaned from the replica table.
326
327
  return serve_state.ReplicaStatus.UNKNOWN
327
328
  # Still launching
328
329
  return serve_state.ReplicaStatus.PROVISIONING
329
- if self.sky_launch_status == ProcessStatus.INTERRUPTED:
330
+ if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
330
331
  # sky.down is running and a scale down interrupted sky.launch
331
332
  return serve_state.ReplicaStatus.SHUTTING_DOWN
332
333
  if self.sky_down_status is not None:
333
334
  if self.preempted:
334
335
  # Replica (spot) is preempted
335
336
  return serve_state.ReplicaStatus.PREEMPTED
336
- if self.sky_down_status == ProcessStatus.RUNNING:
337
+ if self.sky_down_status == common_utils.ProcessStatus.SCHEDULED:
338
+ # sky.down is scheduled to run, but not started yet.
339
+ return serve_state.ReplicaStatus.SHUTTING_DOWN
340
+ if self.sky_down_status == common_utils.ProcessStatus.RUNNING:
337
341
  # sky.down is running
338
342
  return serve_state.ReplicaStatus.SHUTTING_DOWN
339
- if self.sky_launch_status == ProcessStatus.INTERRUPTED:
343
+ if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
340
344
  return serve_state.ReplicaStatus.SHUTTING_DOWN
341
- if self.sky_down_status == ProcessStatus.FAILED:
345
+ if self.sky_down_status == common_utils.ProcessStatus.FAILED:
342
346
  # sky.down failed
343
347
  return serve_state.ReplicaStatus.FAILED_CLEANUP
344
348
  if self.user_app_failed:
345
349
  # Failed on user setup/run
346
350
  return serve_state.ReplicaStatus.FAILED
347
- if self.sky_launch_status == ProcessStatus.FAILED:
351
+ if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
348
352
  # sky.launch failed
349
353
  return serve_state.ReplicaStatus.FAILED_PROVISION
350
354
  if self.first_ready_time is None:
@@ -360,7 +364,7 @@ class ReplicaStatusProperty:
360
364
  # This indicate it is a scale_down with correct teardown.
361
365
  # Should have been cleaned from the replica table.
362
366
  return serve_state.ReplicaStatus.UNKNOWN
363
- if self.sky_launch_status == ProcessStatus.FAILED:
367
+ if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
364
368
  # sky.launch failed
365
369
  # The down process has not been started if it reaches here,
366
370
  # due to the `if self.sky_down_status is not None`` check above.
@@ -421,11 +425,12 @@ class ReplicaInfo:
421
425
  based on the cluster name.
422
426
  """
423
427
  if cluster_record is None:
424
- cluster_record = global_user_state.get_cluster_from_name(
428
+ handle = global_user_state.get_handle_from_cluster_name(
425
429
  self.cluster_name)
426
- if cluster_record is None:
430
+ else:
431
+ handle = cluster_record['handle']
432
+ if handle is None:
427
433
  return None
428
- handle = cluster_record['handle']
429
434
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
430
435
  return handle
431
436
 
@@ -442,10 +447,16 @@ class ReplicaInfo:
442
447
  handle = self.handle()
443
448
  if handle is None:
444
449
  return None
450
+ if self.replica_port == '-':
451
+ # This is a pool replica so there is no endpoint and it's filled
452
+ # with this dummy value. We return None here so that we can
453
+ # get the active ready replicas and perform autoscaling. Otherwise,
454
+ # would error out when trying to get the endpoint.
455
+ return None
445
456
  replica_port_int = int(self.replica_port)
446
457
  try:
447
- endpoint_dict = core.endpoints(handle.cluster_name,
448
- replica_port_int)
458
+ endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
459
+ replica_port_int)
449
460
  except exceptions.ClusterNotUpError:
450
461
  return None
451
462
  endpoint = endpoint_dict.get(replica_port_int, None)
@@ -465,26 +476,36 @@ class ReplicaInfo:
465
476
  f'replica {self.replica_id}.')
466
477
  return replica_status
467
478
 
468
- def to_info_dict(self, with_handle: bool) -> Dict[str, Any]:
479
+ def to_info_dict(self,
480
+ with_handle: bool,
481
+ with_url: bool = True) -> Dict[str, Any]:
469
482
  cluster_record = global_user_state.get_cluster_from_name(
470
- self.cluster_name)
483
+ self.cluster_name, include_user_info=False, summary_response=True)
471
484
  info_dict = {
472
485
  'replica_id': self.replica_id,
473
486
  'name': self.cluster_name,
474
487
  'status': self.status,
475
488
  'version': self.version,
476
- 'endpoint': self.url,
489
+ 'endpoint': self.url if with_url else None,
477
490
  'is_spot': self.is_spot,
478
491
  'launched_at': (cluster_record['launched_at']
479
492
  if cluster_record is not None else None),
480
493
  }
481
494
  if with_handle:
482
- info_dict['handle'] = self.handle(cluster_record)
495
+ handle = self.handle(cluster_record)
496
+ info_dict['handle'] = handle
497
+ if handle is not None:
498
+ info_dict['cloud'] = repr(handle.launched_resources.cloud)
499
+ info_dict['region'] = handle.launched_resources.region
500
+ info_dict['resources_str'] = (
501
+ resources_utils.get_readable_resources_repr(
502
+ handle, simplified_only=True)[0])
483
503
  return info_dict
484
504
 
485
505
  def __repr__(self) -> str:
486
- info_dict = self.to_info_dict(
487
- with_handle=env_options.Options.SHOW_DEBUG_INFO.get())
506
+ show_details = env_options.Options.SHOW_DEBUG_INFO.get()
507
+ info_dict = self.to_info_dict(with_handle=show_details,
508
+ with_url=show_details)
488
509
  handle_str = ''
489
510
  if 'handle' in info_dict:
490
511
  handle_str = f', handle={info_dict["handle"]}'
@@ -498,6 +519,33 @@ class ReplicaInfo:
498
519
  f'launched_at={info_dict["launched_at"]}{handle_str})')
499
520
  return info
500
521
 
522
+ def probe_pool(self) -> Tuple['ReplicaInfo', bool, float]:
523
+ """Probe the replica for pool management.
524
+
525
+ This function will check the first job status of the cluster, which is a
526
+ dummy job that only echoes "setup done". The success of this job means
527
+ the setup command is done and the replica is ready to be used. Check
528
+ sky/serve/server/core.py::up for more details.
529
+
530
+ Returns:
531
+ Tuple of (self, is_ready, probe_time).
532
+ """
533
+ probe_time = time.time()
534
+ try:
535
+ handle = backend_utils.check_cluster_available(
536
+ self.cluster_name, operation='probing pool')
537
+ if handle is None:
538
+ return self, False, probe_time
539
+ backend = backend_utils.get_backend_from_handle(handle)
540
+ statuses = backend.get_job_status(handle, [1], stream_logs=False)
541
+ if statuses[1] == job_lib.JobStatus.SUCCEEDED:
542
+ return self, True, probe_time
543
+ return self, False, probe_time
544
+ except Exception as e: # pylint: disable=broad-except
545
+ logger.error(f'Error when probing pool of {self.cluster_name}: '
546
+ f'{common_utils.format_exception(e)}.')
547
+ return self, False, probe_time
548
+
501
549
  def probe(
502
550
  self,
503
551
  readiness_path: str,
@@ -587,6 +635,7 @@ class ReplicaManager:
587
635
  self._service_name: str = service_name
588
636
  self._uptime: Optional[float] = None
589
637
  self._update_mode = serve_utils.DEFAULT_UPDATE_MODE
638
+ self._is_pool: bool = spec.pool
590
639
  header_keys = None
591
640
  if spec.readiness_headers is not None:
592
641
  header_keys = list(spec.readiness_headers.keys())
@@ -600,6 +649,15 @@ class ReplicaManager:
600
649
  # Oldest version among the currently provisioned and launched replicas
601
650
  self.least_recent_version: int = serve_constants.INITIAL_VERSION
602
651
 
652
+ def _consecutive_failure_threshold_timeout(self) -> int:
653
+ """The timeout for the consecutive failure threshold in seconds.
654
+
655
+ We reduce the timeout for pool to 10 seconds to make the pool more
656
+ responsive to the failure.
657
+ """
658
+ # TODO(tian): Maybe let user determine this threshold
659
+ return 10 if self._is_pool else 180
660
+
603
661
  def scale_up(self,
604
662
  resources_override: Optional[Dict[str, Any]] = None) -> None:
605
663
  """Scale up the service by 1 replica with resources_override.
@@ -634,10 +692,10 @@ class SkyPilotReplicaManager(ReplicaManager):
634
692
  """
635
693
 
636
694
  def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
637
- task_yaml_path: str) -> None:
695
+ service_task_yaml_path: str) -> None:
638
696
  super().__init__(service_name, spec)
639
- self._task_yaml_path = task_yaml_path
640
- task = sky.Task.from_yaml(task_yaml_path)
697
+ self.service_task_yaml_path = service_task_yaml_path
698
+ task = task_lib.Task.from_yaml(service_task_yaml_path)
641
699
  self._spot_placer: Optional[spot_placer.SpotPlacer] = (
642
700
  spot_placer.SpotPlacer.from_task(spec, task))
643
701
  # TODO(tian): Store launch/down pid in the replica table, to make the
@@ -657,6 +715,7 @@ class SkyPilotReplicaManager(ReplicaManager):
657
715
 
658
716
  self._recover_replica_operations()
659
717
 
718
+ @with_lock
660
719
  def _recover_replica_operations(self):
661
720
  """Let's see are there something to do for ReplicaManager in a
662
721
  recovery run"""
@@ -697,9 +756,8 @@ class SkyPilotReplicaManager(ReplicaManager):
697
756
  # Replica management functions #
698
757
  ################################
699
758
 
700
- # Adding lock here to make sure spot placer's current locations are
701
- # consistent with the replicas' status.
702
- @with_lock
759
+ # We don't need to add lock here since every caller of this function
760
+ # will acquire the lock.
703
761
  def _launch_replica(
704
762
  self,
705
763
  replica_id: int,
@@ -714,7 +772,8 @@ class SkyPilotReplicaManager(ReplicaManager):
714
772
  self._service_name, replica_id)
715
773
  log_file_name = serve_utils.generate_replica_launch_log_file_name(
716
774
  self._service_name, replica_id)
717
- use_spot = _should_use_spot(self._task_yaml_path, resources_override)
775
+ use_spot = _should_use_spot(self.service_task_yaml_path,
776
+ resources_override)
718
777
  retry_until_up = True
719
778
  location = None
720
779
  if use_spot and self._spot_placer is not None:
@@ -742,10 +801,10 @@ class SkyPilotReplicaManager(ReplicaManager):
742
801
  launch_cluster,
743
802
  log_file_name,
744
803
  ).run,
745
- args=(replica_id, self._task_yaml_path, cluster_name,
804
+ args=(replica_id, self.service_task_yaml_path, cluster_name,
746
805
  resources_override, retry_until_up),
747
806
  )
748
- replica_port = _get_resources_ports(self._task_yaml_path)
807
+ replica_port = _get_resources_ports(self.service_task_yaml_path)
749
808
 
750
809
  info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
751
810
  location, self.latest_version, resources_override)
@@ -754,11 +813,61 @@ class SkyPilotReplicaManager(ReplicaManager):
754
813
  # to avoid too many sky.launch running at the same time.
755
814
  self._launch_process_pool[replica_id] = p
756
815
 
816
+ @with_lock
757
817
  def scale_up(self,
758
818
  resources_override: Optional[Dict[str, Any]] = None) -> None:
759
819
  self._launch_replica(self._next_replica_id, resources_override)
760
820
  self._next_replica_id += 1
761
821
 
822
+ def _handle_sky_down_finish(self, info: ReplicaInfo, exitcode: int) -> None:
823
+ if exitcode != 0:
824
+ logger.error(f'Down process for replica {info.replica_id} '
825
+ f'exited abnormally with code {exitcode}.')
826
+ info.status_property.sky_down_status = (
827
+ common_utils.ProcessStatus.FAILED)
828
+ else:
829
+ info.status_property.sky_down_status = (
830
+ common_utils.ProcessStatus.SUCCEEDED)
831
+ # Failed replica still count as a replica. In our current design, we
832
+ # want to fail early if user code have any error. This will prevent
833
+ # infinite loop of teardown and re-provision. However, there is a
834
+ # special case that if the replica is UP for longer than
835
+ # initial_delay_seconds, we assume it is just some random failure and
836
+ # we should restart the replica. Please refer to the implementation of
837
+ # `is_scale_down_succeeded` for more details.
838
+ # TODO(tian): Currently, restart replicas that failed within
839
+ # initial_delay_seconds is not supported. We should add it
840
+ # later when we support `sky serve update`.
841
+ removal_reason = None
842
+ if info.status_property.is_scale_down:
843
+ # This means the cluster is deleted due to an autoscaler
844
+ # decision or the cluster is recovering from preemption.
845
+ # Delete the replica info so it won't count as a replica.
846
+ if info.status_property.preempted:
847
+ removal_reason = 'for preemption recovery'
848
+ else:
849
+ removal_reason = 'normally'
850
+ # Don't keep failed record for version mismatch replicas,
851
+ # since user should fixed the error before update.
852
+ elif info.version != self.latest_version:
853
+ removal_reason = 'for version outdated'
854
+ elif info.status_property.purged:
855
+ removal_reason = 'for purge'
856
+ elif info.status_property.failed_spot_availability:
857
+ removal_reason = 'for spot availability failure'
858
+ else:
859
+ logger.info(f'Termination of replica {info.replica_id} '
860
+ 'finished. Replica info is kept since some '
861
+ 'failure detected.')
862
+ serve_state.add_or_update_replica(self._service_name,
863
+ info.replica_id, info)
864
+ if removal_reason is not None:
865
+ serve_state.remove_replica(self._service_name, info.replica_id)
866
+ logger.info(f'Replica {info.replica_id} removed from the '
867
+ f'replica table {removal_reason}.')
868
+
869
+ # We don't need to add lock here since every caller of this function
870
+ # will acquire the lock.
762
871
  def _terminate_replica(self,
763
872
  replica_id: int,
764
873
  sync_down_logs: bool,
@@ -776,7 +885,8 @@ class SkyPilotReplicaManager(ReplicaManager):
776
885
  info = serve_state.get_replica_info_from_id(self._service_name,
777
886
  replica_id)
778
887
  assert info is not None
779
- info.status_property.sky_launch_status = ProcessStatus.INTERRUPTED
888
+ info.status_property.sky_launch_status = (
889
+ common_utils.ProcessStatus.INTERRUPTED)
780
890
  serve_state.add_or_update_replica(self._service_name, replica_id,
781
891
  info)
782
892
  launch_process = self._launch_process_pool[replica_id]
@@ -820,9 +930,9 @@ class SkyPilotReplicaManager(ReplicaManager):
820
930
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
821
931
  replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
822
932
  'replica_jobs')
823
- job_log_file_name = (
824
- controller_utils.download_and_stream_latest_job_log(
825
- backend, handle, replica_job_logs_dir))
933
+ job_ids = ['1'] if self._is_pool else None
934
+ job_log_file_name = controller_utils.download_and_stream_job_log(
935
+ backend, handle, replica_job_logs_dir, job_ids)
826
936
  if job_log_file_name is not None:
827
937
  logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
828
938
  with open(log_file_name, 'a',
@@ -848,18 +958,30 @@ class SkyPilotReplicaManager(ReplicaManager):
848
958
 
849
959
  logger.info(f'preempted: {info.status_property.preempted}, '
850
960
  f'replica_id: {replica_id}')
961
+ info.status_property.is_scale_down = is_scale_down
962
+ info.status_property.purged = purge
963
+
964
+ # If the cluster does not exist, it means either the cluster never
965
+ # exists (e.g., the cluster is scaled down before it gets a chance to
966
+ # provision) or the cluster is preempted and cleaned up by the status
967
+ # refresh. In this case, we skip spawning a new down process to save
968
+ # controller resources.
969
+ if not global_user_state.cluster_with_name_exists(info.cluster_name):
970
+ self._handle_sky_down_finish(info, exitcode=0)
971
+ return
972
+
973
+ # Otherwise, start the process to terminate the cluster.
851
974
  p = multiprocessing.Process(
852
975
  target=ux_utils.RedirectOutputForProcess(terminate_cluster,
853
976
  log_file_name, 'a').run,
854
977
  args=(info.cluster_name, replica_drain_delay_seconds),
855
978
  )
856
- info.status_property.sky_down_status = ProcessStatus.RUNNING
857
- info.status_property.is_scale_down = is_scale_down
858
- info.status_property.purged = purge
979
+ info.status_property.sky_down_status = (
980
+ common_utils.ProcessStatus.SCHEDULED)
859
981
  serve_state.add_or_update_replica(self._service_name, replica_id, info)
860
- p.start()
861
982
  self._down_process_pool[replica_id] = p
862
983
 
984
+ @with_lock
863
985
  def scale_down(self, replica_id: int, purge: bool = False) -> None:
864
986
  self._terminate_replica(
865
987
  replica_id,
@@ -868,6 +990,8 @@ class SkyPilotReplicaManager(ReplicaManager):
868
990
  is_scale_down=True,
869
991
  purge=purge)
870
992
 
993
+ # We don't need to add lock here since every caller of this function
994
+ # will acquire the lock.
871
995
  def _handle_preemption(self, info: ReplicaInfo) -> bool:
872
996
  """Handle preemption of the replica if any error happened.
873
997
 
@@ -930,18 +1054,19 @@ class SkyPilotReplicaManager(ReplicaManager):
930
1054
  # To avoid `dictionary changed size during iteration` error.
931
1055
  launch_process_pool_snapshot = list(self._launch_process_pool.items())
932
1056
  for replica_id, p in launch_process_pool_snapshot:
933
- if not p.is_alive():
1057
+ if p.is_alive():
1058
+ continue
1059
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
934
1060
  info = serve_state.get_replica_info_from_id(
935
1061
  self._service_name, replica_id)
936
1062
  assert info is not None, replica_id
937
1063
  error_in_sky_launch = False
938
1064
  if info.status == serve_state.ReplicaStatus.PENDING:
939
1065
  # sky.launch not started yet
940
- if (serve_state.total_number_provisioning_replicas() <
941
- _MAX_NUM_LAUNCH):
1066
+ if controller_utils.can_provision():
942
1067
  p.start()
943
1068
  info.status_property.sky_launch_status = (
944
- ProcessStatus.RUNNING)
1069
+ common_utils.ProcessStatus.RUNNING)
945
1070
  else:
946
1071
  # sky.launch finished
947
1072
  # TODO(tian): Try-catch in process, and have an enum return
@@ -958,11 +1083,11 @@ class SkyPilotReplicaManager(ReplicaManager):
958
1083
  f'exited abnormally with code {p.exitcode}.'
959
1084
  ' Terminating...')
960
1085
  info.status_property.sky_launch_status = (
961
- ProcessStatus.FAILED)
1086
+ common_utils.ProcessStatus.FAILED)
962
1087
  error_in_sky_launch = True
963
1088
  else:
964
1089
  info.status_property.sky_launch_status = (
965
- ProcessStatus.SUCCEEDED)
1090
+ common_utils.ProcessStatus.SUCCEEDED)
966
1091
  if self._spot_placer is not None and info.is_spot:
967
1092
  # TODO(tian): Currently, we set the location to
968
1093
  # preemptive if the launch process failed. This is
@@ -990,59 +1115,25 @@ class SkyPilotReplicaManager(ReplicaManager):
990
1115
  replica_drain_delay_seconds=0)
991
1116
  down_process_pool_snapshot = list(self._down_process_pool.items())
992
1117
  for replica_id, p in down_process_pool_snapshot:
993
- if not p.is_alive():
994
- logger.info(
995
- f'Terminate process for replica {replica_id} finished.')
996
- del self._down_process_pool[replica_id]
997
- info = serve_state.get_replica_info_from_id(
998
- self._service_name, replica_id)
999
- assert info is not None, replica_id
1000
- if p.exitcode != 0:
1001
- logger.error(f'Down process for replica {replica_id} '
1002
- f'exited abnormally with code {p.exitcode}.')
1003
- info.status_property.sky_down_status = (
1004
- ProcessStatus.FAILED)
1005
- else:
1118
+ if p.is_alive():
1119
+ continue
1120
+ info = serve_state.get_replica_info_from_id(self._service_name,
1121
+ replica_id)
1122
+ assert info is not None, replica_id
1123
+ if (info.status_property.sky_down_status ==
1124
+ common_utils.ProcessStatus.SCHEDULED):
1125
+ # sky.down not started yet
1126
+ if controller_utils.can_terminate():
1127
+ p.start()
1006
1128
  info.status_property.sky_down_status = (
1007
- ProcessStatus.SUCCEEDED)
1008
- # Failed replica still count as a replica. In our current
1009
- # design, we want to fail early if user code have any error.
1010
- # This will prevent infinite loop of teardown and
1011
- # re-provision. However, there is a special case that if the
1012
- # replica is UP for longer than initial_delay_seconds, we
1013
- # assume it is just some random failure and we should restart
1014
- # the replica. Please refer to the implementation of
1015
- # `is_scale_down_succeeded` for more details.
1016
- # TODO(tian): Currently, restart replicas that failed within
1017
- # initial_delay_seconds is not supported. We should add it
1018
- # later when we support `sky serve update`.
1019
- removal_reason = None
1020
- if info.status_property.is_scale_down:
1021
- # This means the cluster is deleted due to an autoscaler
1022
- # decision or the cluster is recovering from preemption.
1023
- # Delete the replica info so it won't count as a replica.
1024
- if info.status_property.preempted:
1025
- removal_reason = 'for preemption recovery'
1026
- else:
1027
- removal_reason = 'normally'
1028
- # Don't keep failed record for version mismatch replicas,
1029
- # since user should fixed the error before update.
1030
- elif info.version != self.latest_version:
1031
- removal_reason = 'for version outdated'
1032
- elif info.status_property.purged:
1033
- removal_reason = 'for purge'
1034
- elif info.status_property.failed_spot_availability:
1035
- removal_reason = 'for spot availability failure'
1036
- else:
1037
- logger.info(f'Termination of replica {replica_id} '
1038
- 'finished. Replica info is kept since some '
1039
- 'failure detected.')
1129
+ common_utils.ProcessStatus.RUNNING)
1040
1130
  serve_state.add_or_update_replica(self._service_name,
1041
1131
  replica_id, info)
1042
- if removal_reason is not None:
1043
- serve_state.remove_replica(self._service_name, replica_id)
1044
- logger.info(f'Replica {replica_id} removed from the '
1045
- f'replica table {removal_reason}.')
1132
+ else:
1133
+ logger.info(
1134
+ f'Terminate process for replica {replica_id} finished.')
1135
+ del self._down_process_pool[replica_id]
1136
+ self._handle_sky_down_finish(info, exitcode=p.exitcode)
1046
1137
 
1047
1138
  # Clean old version
1048
1139
  replica_infos = serve_state.get_replica_infos(self._service_name)
@@ -1098,9 +1189,10 @@ class SkyPilotReplicaManager(ReplicaManager):
1098
1189
  handle = info.handle()
1099
1190
  assert handle is not None, info
1100
1191
  # Use None to fetch latest job, which stands for user task job
1192
+ job_ids = [1] if self._is_pool else None
1101
1193
  try:
1102
1194
  job_statuses = backend.get_job_status(handle,
1103
- None,
1195
+ job_ids,
1104
1196
  stream_logs=False)
1105
1197
  except exceptions.CommandError:
1106
1198
  # If the job status fetch failed, it is likely that the
@@ -1110,7 +1202,8 @@ class SkyPilotReplicaManager(ReplicaManager):
1110
1202
  continue
1111
1203
  # Re-raise the exception if it is not preempted.
1112
1204
  raise
1113
- job_status = list(job_statuses.values())[0]
1205
+ job_status = job_statuses[1] if self._is_pool else list(
1206
+ job_statuses.values())[0]
1114
1207
  if job_status in job_lib.JobStatus.user_code_failure_states():
1115
1208
  info.status_property.user_app_failed = True
1116
1209
  serve_state.add_or_update_replica(self._service_name,
@@ -1154,18 +1247,24 @@ class SkyPilotReplicaManager(ReplicaManager):
1154
1247
  for info in infos:
1155
1248
  if not info.status_property.should_track_service_status():
1156
1249
  continue
1157
- replica_to_probe.append(
1158
- f'replica_{info.replica_id}(url={info.url})')
1159
- probe_futures.append(
1160
- pool.apply_async(
1161
- info.probe,
1162
- (
1163
- self._get_readiness_path(info.version),
1164
- self._get_post_data(info.version),
1165
- self._get_readiness_timeout_seconds(info.version),
1166
- self._get_readiness_headers(info.version),
1167
- ),
1168
- ),)
1250
+ if self._is_pool:
1251
+ replica_to_probe.append(f'replica_{info.replica_id}(cluster'
1252
+ f'_name={info.cluster_name})')
1253
+ probe_futures.append(pool.apply_async(info.probe_pool))
1254
+ else:
1255
+ replica_to_probe.append(
1256
+ f'replica_{info.replica_id}(url={info.url})')
1257
+ probe_futures.append(
1258
+ pool.apply_async(
1259
+ info.probe,
1260
+ (
1261
+ self._get_readiness_path(info.version),
1262
+ self._get_post_data(info.version),
1263
+ self._get_readiness_timeout_seconds(
1264
+ info.version),
1265
+ self._get_readiness_headers(info.version),
1266
+ ),
1267
+ ),)
1169
1268
  logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}')
1170
1269
 
1171
1270
  # Since futures.as_completed will return futures in the order of
@@ -1202,8 +1301,9 @@ class SkyPilotReplicaManager(ReplicaManager):
1202
1301
  consecutive_failure_time = (
1203
1302
  info.consecutive_failure_times[-1] -
1204
1303
  info.consecutive_failure_times[0])
1205
- if (consecutive_failure_time >=
1206
- _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT):
1304
+ failure_threshold = (
1305
+ self._consecutive_failure_threshold_timeout())
1306
+ if consecutive_failure_time >= failure_threshold:
1207
1307
  logger.info(
1208
1308
  f'Replica {info.replica_id} is not ready for '
1209
1309
  'too long and exceeding consecutive failure '
@@ -1214,8 +1314,7 @@ class SkyPilotReplicaManager(ReplicaManager):
1214
1314
  f'Replica {info.replica_id} is not ready '
1215
1315
  'but within consecutive failure threshold '
1216
1316
  f'({consecutive_failure_time}s / '
1217
- f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). '
1218
- 'Skipping.')
1317
+ f'{failure_threshold}s). Skipping.')
1219
1318
  else:
1220
1319
  initial_delay_seconds = self._get_initial_delay_seconds(
1221
1320
  info.version)
@@ -1290,11 +1389,11 @@ class SkyPilotReplicaManager(ReplicaManager):
1290
1389
  logger.error(f'Invalid version: {version}, '
1291
1390
  f'latest version: {self.latest_version}')
1292
1391
  return
1293
- task_yaml_path = serve_utils.generate_task_yaml_file_name(
1392
+ service_task_yaml_path = serve_utils.generate_task_yaml_file_name(
1294
1393
  self._service_name, version)
1295
1394
  serve_state.add_or_update_version(self._service_name, version, spec)
1296
1395
  self.latest_version = version
1297
- self._task_yaml_path = task_yaml_path
1396
+ self.service_task_yaml_path = service_task_yaml_path
1298
1397
  self._update_mode = update_mode
1299
1398
 
1300
1399
  # Reuse all replicas that have the same config as the new version
@@ -1302,32 +1401,40 @@ class SkyPilotReplicaManager(ReplicaManager):
1302
1401
  # the latest version. This can significantly improve the speed
1303
1402
  # for updating an existing service with only config changes to the
1304
1403
  # service specs, e.g. scale down the service.
1305
- new_config = common_utils.read_yaml(os.path.expanduser(task_yaml_path))
1404
+ new_config = yaml_utils.read_yaml(
1405
+ os.path.expanduser(service_task_yaml_path))
1306
1406
  # Always create new replicas and scale down old ones when file_mounts
1307
1407
  # are not empty.
1308
1408
  if new_config.get('file_mounts', None) != {}:
1309
1409
  return
1310
- for key in ['service']:
1311
- new_config.pop(key)
1410
+ for key in ['service', 'pool', '_user_specified_yaml']:
1411
+ new_config.pop(key, None)
1412
+ new_config_any_of = new_config.get('resources', {}).pop('any_of', [])
1413
+
1312
1414
  replica_infos = serve_state.get_replica_infos(self._service_name)
1313
1415
  for info in replica_infos:
1314
1416
  if info.version < version and not info.is_terminal:
1315
1417
  # Assume user does not change the yaml file on the controller.
1316
- old_task_yaml_path = serve_utils.generate_task_yaml_file_name(
1317
- self._service_name, info.version)
1318
- old_config = common_utils.read_yaml(
1319
- os.path.expanduser(old_task_yaml_path))
1320
- for key in ['service']:
1321
- old_config.pop(key)
1418
+ old_service_task_yaml_path = (
1419
+ serve_utils.generate_task_yaml_file_name(
1420
+ self._service_name, info.version))
1421
+ old_config = yaml_utils.read_yaml(
1422
+ os.path.expanduser(old_service_task_yaml_path))
1423
+ for key in ['service', 'pool', '_user_specified_yaml']:
1424
+ old_config.pop(key, None)
1322
1425
  # Bump replica version if all fields except for service are
1323
1426
  # the same.
1324
1427
  # Here, we manually convert the any_of field to a set to avoid
1325
1428
  # only the difference in the random order of the any_of fields.
1326
1429
  old_config_any_of = old_config.get('resources',
1327
1430
  {}).pop('any_of', [])
1328
- new_config_any_of = new_config.get('resources',
1329
- {}).pop('any_of', [])
1330
- if set(old_config_any_of) != set(new_config_any_of):
1431
+
1432
+ if (resources_utils.normalize_any_of_resources_config(
1433
+ old_config_any_of) != resources_utils.
1434
+ normalize_any_of_resources_config(new_config_any_of)):
1435
+ logger.info('Replica config changed (any_of), skipping. '
1436
+ f'old: {old_config_any_of}, '
1437
+ f'new: {new_config_any_of}')
1331
1438
  continue
1332
1439
  # File mounts should both be empty, as update always
1333
1440
  # create new buckets if they are not empty.
@@ -1341,6 +1448,10 @@ class SkyPilotReplicaManager(ReplicaManager):
1341
1448
  info.version = version
1342
1449
  serve_state.add_or_update_replica(self._service_name,
1343
1450
  info.replica_id, info)
1451
+ else:
1452
+ logger.info('Replica config changed (rest), skipping. '
1453
+ f'old: {old_config}, '
1454
+ f'new: {new_config}')
1344
1455
 
1345
1456
  def _get_version_spec(self, version: int) -> 'service_spec.SkyServiceSpec':
1346
1457
  spec = serve_state.get_spec(self._service_name, version)