skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,10 +1,9 @@
  """ReplicaManager: handles the creation and deletion of endpoint replicas."""
  import dataclasses
- import enum
  import functools
- import multiprocessing
  from multiprocessing import pool as mp_pool
  import os
+ import pathlib
  import threading
  import time
  import traceback
@@ -12,17 +11,16 @@ import typing
  from typing import Any, Dict, List, Optional, Tuple

  import colorama
- import psutil
+ import filelock
  import requests

- import sky
  from sky import backends
- from sky import core
  from sky import exceptions
- from sky import execution
  from sky import global_user_state
  from sky import sky_logging
+ from sky import task as task_lib
  from sky.backends import backend_utils
+ from sky.client import sdk
  from sky.serve import constants as serve_constants
  from sky.serve import serve_state
  from sky.serve import serve_utils
@@ -32,34 +30,47 @@ from sky.skylet import constants
  from sky.skylet import job_lib
  from sky.usage import usage_lib
  from sky.utils import common_utils
+ from sky.utils import context
  from sky.utils import controller_utils
  from sky.utils import env_options
+ from sky.utils import resources_utils
  from sky.utils import status_lib
+ from sky.utils import thread_utils
  from sky.utils import ux_utils
+ from sky.utils import yaml_utils

  if typing.TYPE_CHECKING:
-     from sky import resources
+     import logging
+
      from sky.serve import service_spec

  logger = sky_logging.init_logger(__name__)

  _JOB_STATUS_FETCH_INTERVAL = 30
  _PROCESS_POOL_REFRESH_INTERVAL = 20
- # TODO(tian): Maybe let user determine this threshold
- _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180
  _RETRY_INIT_GAP_SECONDS = 60
  _DEFAULT_DRAIN_SECONDS = 120
+ _WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS = 15

- # Since sky.launch is very resource demanding, we limit the number of
- # concurrent sky.launch process to avoid overloading the machine.
- _MAX_NUM_LAUNCH = psutil.cpu_count() * 2
+ # TODO(tian): Backward compatibility. Remove this after 3 minor release, i.e.
+ # 0.13.0. We move the ProcessStatus to common_utils.ProcessStatus in #6666, but
+ # old ReplicaInfo in database will still tries to unpickle using ProcessStatus
+ # in replica_managers. We set this alias to avoid breaking changes. See #6729
+ # for more details.
+ ProcessStatus = common_utils.ProcessStatus


  # TODO(tian): Combine this with
  # sky/spot/recovery_strategy.py::StrategyExecutor::launch
+ # Use context.contextual to enable per-launch output redirection.
+ @context.contextual
  def launch_cluster(replica_id: int,
-                    task_yaml_path: str,
+                    yaml_content: str,
                     cluster_name: str,
+                    log_file: str,
+                    replica_to_request_id: thread_utils.ThreadSafeDict[int, str],
+                    replica_to_launch_cancelled: thread_utils.ThreadSafeDict[
+                        int, bool],
                     resources_override: Optional[Dict[str, Any]] = None,
                     retry_until_up: bool = True,
                     max_retry: int = 3) -> None:
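
Note on the `ProcessStatus = common_utils.ProcessStatus` alias added above: pickle records a class by its defining module path, so `ReplicaInfo` rows written before the refactor still reference the old `replica_managers` location. A module-level alias keeps those old records loadable. A self-contained sketch of the mechanism (the module names below are hypothetical stand-ins, not SkyPilot's real layout):

```python
# Why a module-level alias preserves pickle compatibility after a class moves.
# Module names here are hypothetical stand-ins for illustration only.
import enum
import pickle
import sys
import types

old_mod = types.ModuleType('old_home')  # where the enum used to live
new_mod = types.ModuleType('new_home')  # where the enum lives now
sys.modules['old_home'] = old_mod
sys.modules['new_home'] = new_mod

class ProcessStatus(enum.Enum):
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'

# Pretend the class was originally defined in old_home: pickles produced then
# record the path 'old_home.ProcessStatus'.
ProcessStatus.__module__ = 'old_home'
old_mod.ProcessStatus = ProcessStatus
blob = pickle.dumps(ProcessStatus.RUNNING)  # an "old" database record

# After the refactor the class belongs to new_home; keeping an alias in
# old_home is what lets the old blob still unpickle.
new_mod.ProcessStatus = ProcessStatus
old_mod.ProcessStatus = new_mod.ProcessStatus  # the compatibility alias
assert pickle.loads(blob) is ProcessStatus.RUNNING
```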
@@ -73,13 +84,16 @@ def launch_cluster(replica_id: int,
          or some error happened before provisioning and will happen again
          if retry.
      """
+     ctx = context.get()
+     assert ctx is not None, 'Context is not initialized'
+     ctx.redirect_log(pathlib.Path(log_file))
+
      if resources_override is not None:
          logger.info(f'Scaling up replica (id: {replica_id}) cluster '
                      f'{cluster_name} with resources override: '
                      f'{resources_override}')
      try:
-         config = common_utils.read_yaml(os.path.expanduser(task_yaml_path))
-         task = sky.Task.from_yaml_config(config)
+         task = task_lib.Task.from_yaml_str(yaml_content)
          if resources_override is not None:
              resources = task.resources
              overrided_resources = [
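
The `@context.contextual` decorator plus `ctx.redirect_log(...)` above replaces the old per-process `ux_utils.RedirectOutputForProcess` wrapper: launches now run as threads inside one process (see the `SkyPilotReplicaManager` hunks below), so each launch must redirect its own logs rather than rely on per-process stdout. A minimal sketch of the idea using stdlib `contextvars`; the names and the real `sky.utils.context` internals here are assumptions:

```python
# Minimal sketch of per-context log redirection within a single process.
# This only illustrates the idea; sky.utils.context's actual API differs.
import contextvars
import logging
import threading

_log_file: contextvars.ContextVar[str] = contextvars.ContextVar(
    'log_file', default='controller.log')

class ContextFileHandler(logging.Handler):
    """Writes each record to whichever file the current context chose."""

    def emit(self, record: logging.LogRecord) -> None:
        with open(_log_file.get(), 'a', encoding='utf-8') as f:
            f.write(self.format(record) + '\n')

logger = logging.getLogger('replica_demo')
logger.addHandler(ContextFileHandler())
logger.setLevel(logging.INFO)

def launch(replica_id: int) -> None:
    # Equivalent in spirit to ctx.redirect_log(pathlib.Path(log_file)): the
    # redirection is scoped to this thread's context, not the whole process.
    _log_file.set(f'replica_{replica_id}_launch.log')
    logger.info('launching replica %d', replica_id)

threads = [threading.Thread(target=launch, args=(i,)) for i in (1, 2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
# replica_1_launch.log and replica_2_launch.log each hold their own output.
```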
@@ -96,16 +110,31 @@
          raise RuntimeError(
              f'Failed to launch the sky serve replica cluster {cluster_name} '
              'due to failing to initialize sky.Task from yaml file.') from e
+
+     def _check_is_cancelled() -> bool:
+         is_cancelled = replica_to_launch_cancelled.get(replica_id, False)
+         if is_cancelled:
+             logger.info(f'Replica {replica_id} launch cancelled.')
+             # Pop the value to indicate that the signal was received.
+             replica_to_launch_cancelled.pop(replica_id)
+         return is_cancelled
+
      retry_cnt = 0
      backoff = common_utils.Backoff(_RETRY_INIT_GAP_SECONDS)
      while True:
          retry_cnt += 1
          try:
+             if _check_is_cancelled():
+                 return
              usage_lib.messages.usage.set_internal()
-             execution.launch(task,
-                              cluster_name,
-                              retry_until_up=retry_until_up,
-                              _is_launched_by_sky_serve_controller=True)
+             request_id = sdk.launch(task,
+                                     cluster_name,
+                                     retry_until_up=retry_until_up,
+                                     _is_launched_by_sky_serve_controller=True)
+             logger.info(f'Replica cluster {cluster_name} launch requested '
+                         f'with request_id: {request_id}.')
+             replica_to_request_id[replica_id] = request_id
+             sdk.stream_and_get(request_id)
              logger.info(f'Replica cluster {cluster_name} launched.')
          except (exceptions.InvalidClusterNameError,
                  exceptions.NoCloudAccessError,
@@ -130,22 +159,44 @@
          else:  # No exception, the launch succeeds.
              return

-     terminate_cluster(cluster_name)
+         # Cleanup the request id and the failed cluster.
+         replica_to_request_id.pop(replica_id)
+         # If it is cancelled, no need to terminate the cluster. It will be
+         # handled by the termination thread.
+         if _check_is_cancelled():
+             return
+         terminate_cluster(cluster_name, log_file=log_file)
+
          if retry_cnt >= max_retry:
              raise RuntimeError('Failed to launch the sky serve replica cluster '
                                 f'{cluster_name} after {max_retry} retries.')
+
          gap_seconds = backoff.current_backoff()
          logger.info('Retrying to launch the sky serve replica cluster '
                      f'in {gap_seconds:.1f} seconds.')
-         time.sleep(gap_seconds)
+         start_backoff = time.time()
+         # Check if it is cancelled every 0.1 seconds.
+         while time.time() - start_backoff < gap_seconds:
+             if _check_is_cancelled():
+                 return
+             time.sleep(0.1)


  # TODO(tian): Combine this with
  # sky/spot/recovery_strategy.py::terminate_cluster
+ @context.contextual
  def terminate_cluster(cluster_name: str,
+                       log_file: str,
                        replica_drain_delay_seconds: int = 0,
                        max_retry: int = 3) -> None:
      """Terminate the sky serve replica cluster."""
+     # Setup logging redirection.
+     ctx = context.get()
+     assert ctx is not None, 'Context is not initialized'
+     ctx.redirect_log(pathlib.Path(log_file))
+
+     logger.info(f'Terminating replica cluster {cluster_name} with '
+                 f'replica_drain_delay_seconds: {replica_drain_delay_seconds}')
      time.sleep(replica_drain_delay_seconds)
      retry_cnt = 0
      backoff = common_utils.Backoff()
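
The retry loop above no longer sleeps through the whole backoff gap: it polls the shared `replica_to_launch_cancelled` map every 0.1 s, so a scale-down can interrupt a pending launch almost immediately, and a cancelled launch skips its own cleanup because the termination thread owns it. A condensed sketch of that pattern (the plain dict and all names are illustrative; the real code uses `thread_utils.ThreadSafeDict`):

```python
# Sketch of the cancellable retry/backoff loop used by launch_cluster above.
# `cancelled` stands in for replica_to_launch_cancelled; in the real code it
# is a thread-safe dict, not a plain dict.
import threading
import time

cancelled: dict = {}

def check_is_cancelled(replica_id: int) -> bool:
    # pop() both reads and acknowledges the signal, mirroring the diff.
    return cancelled.pop(replica_id, False)

def launch_with_retries(replica_id: int, max_retry: int = 3) -> None:
    gap_seconds = 1.0
    for retry_cnt in range(1, max_retry + 1):
        if check_is_cancelled(replica_id):
            return
        print(f'launch attempt {retry_cnt} for replica {replica_id}')
        # ... submit the launch request and wait for it here ...
        start_backoff = time.time()
        while time.time() - start_backoff < gap_seconds:  # cancellable sleep
            if check_is_cancelled(replica_id):
                print(f'replica {replica_id} cancelled during backoff')
                return
            time.sleep(0.1)
        gap_seconds *= 2  # exponential backoff between attempts

t = threading.Thread(target=launch_with_retries, args=(7,))
t.start()
time.sleep(0.5)
cancelled[7] = True  # another thread (e.g. a scale-down) requests cancellation
t.join()
```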
@@ -153,7 +204,10 @@ def terminate_cluster(cluster_name: str,
          retry_cnt += 1
          try:
              usage_lib.messages.usage.set_internal()
-             core.down(cluster_name)
+             logger.info(f'Sending down request to cluster {cluster_name}')
+             request_id = sdk.down(cluster_name)
+             sdk.stream_and_get(request_id)
+             logger.info(f'Replica cluster {cluster_name} terminated.')
              return
          except ValueError:
              # The cluster is already terminated.
@@ -173,17 +227,19 @@
              time.sleep(gap_seconds)


- def _get_resources_ports(task_yaml: str) -> str:
+ def _get_resources_ports(yaml_content: str) -> str:
      """Get the resources ports used by the task."""
-     task = sky.Task.from_yaml(task_yaml)
+     task = task_lib.Task.from_yaml_str(yaml_content)
      # Already checked all ports are valid in sky.serve.core.up
      assert task.resources, task
      assert task.service is not None, task
+     if task.service.pool:
+         return '-'
      assert task.service.ports is not None, task
      return task.service.ports


- def _should_use_spot(task_yaml: str,
+ def _should_use_spot(yaml_content: str,
                       resource_override: Optional[Dict[str, Any]]) -> bool:
      """Get whether the task should use spot."""
      if resource_override is not None:
@@ -191,7 +247,7 @@ def _should_use_spot(task_yaml: str,
          if use_spot_override is not None:
              assert isinstance(use_spot_override, bool)
              return use_spot_override
-     task = sky.Task.from_yaml(task_yaml)
+     task = task_lib.Task.from_yaml_str(yaml_content)
      spot_use_resources = [
          resources for resources in task.resources if resources.use_spot
      ]
@@ -200,6 +256,12 @@
      return len(spot_use_resources) == len(task.resources)


+ # Every function that calls serve_state.add_or_update_replica should acquire
+ # this lock. It is to prevent race condition when the replica status is updated
+ # by multiple threads at the same time. The modification of replica info is
+ # 2 database calls: read the whole replica info object, unpickle it, and modify
+ # corresponding fields. Then it is write back to the database. We need to ensure
+ # the read-modify-write operation is atomic.
  def with_lock(func):

      @functools.wraps(func)
@@ -210,22 +272,6 @@ def with_lock(func):
      return wrapper


- class ProcessStatus(enum.Enum):
-     """Process status."""
-
-     # The process is running
-     RUNNING = 'RUNNING'
-
-     # The process is finished and succeeded
-     SUCCEEDED = 'SUCCEEDED'
-
-     # The process is interrupted
-     INTERRUPTED = 'INTERRUPTED'
-
-     # The process failed
-     FAILED = 'FAILED'
-
-
  @dataclasses.dataclass
  class ReplicaStatusProperty:
      """Some properties that determine replica status.
@@ -237,15 +283,16 @@ class ReplicaStatusProperty:
          first_ready_time: The first time the service is ready.
          sky_down_status: Process status of sky.down.
      """
-     # None means sky.launch is not called yet.
-     sky_launch_status: Optional[ProcessStatus] = None
+     # sky.launch will always be scheduled on creation of ReplicaStatusProperty.
+     sky_launch_status: common_utils.ProcessStatus = (
+         common_utils.ProcessStatus.SCHEDULED)
      user_app_failed: bool = False
      service_ready_now: bool = False
      # None means readiness probe is not succeeded yet;
      # -1 means the initial delay seconds is exceeded.
      first_ready_time: Optional[float] = None
      # None means sky.down is not called yet.
-     sky_down_status: Optional[ProcessStatus] = None
+     sky_down_status: Optional[common_utils.ProcessStatus] = None
      # Whether the termination is caused by autoscaler's decision
      is_scale_down: bool = False
      # The replica's spot instance was preempted.
@@ -300,7 +347,7 @@ class ReplicaStatusProperty:
          (1) Job status;
          (2) Readiness probe.
          """
-         if self.sky_launch_status != ProcessStatus.SUCCEEDED:
+         if self.sky_launch_status != common_utils.ProcessStatus.SUCCEEDED:
              return False
          if self.sky_down_status is not None:
              return False
@@ -314,37 +361,43 @@

      def to_replica_status(self) -> serve_state.ReplicaStatus:
          """Convert status property to human-readable replica status."""
-         if self.sky_launch_status is None:
+         # Backward compatibility. Before we introduce ProcessStatus.SCHEDULED,
+         # we use None to represent sky.launch is not called yet.
+         if (self.sky_launch_status is None or
+                 self.sky_launch_status == common_utils.ProcessStatus.SCHEDULED):
              # Pending to launch
              return serve_state.ReplicaStatus.PENDING
-         if self.sky_launch_status == ProcessStatus.RUNNING:
-             if self.sky_down_status == ProcessStatus.FAILED:
+         if self.sky_launch_status == common_utils.ProcessStatus.RUNNING:
+             if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                  return serve_state.ReplicaStatus.FAILED_CLEANUP
-             if self.sky_down_status == ProcessStatus.SUCCEEDED:
+             if self.sky_down_status == common_utils.ProcessStatus.SUCCEEDED:
                  # This indicate it is a scale_down with correct teardown.
                  # Should have been cleaned from the replica table.
                  return serve_state.ReplicaStatus.UNKNOWN
              # Still launching
              return serve_state.ReplicaStatus.PROVISIONING
-         if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+         if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
              # sky.down is running and a scale down interrupted sky.launch
              return serve_state.ReplicaStatus.SHUTTING_DOWN
          if self.sky_down_status is not None:
              if self.preempted:
                  # Replica (spot) is preempted
                  return serve_state.ReplicaStatus.PREEMPTED
-             if self.sky_down_status == ProcessStatus.RUNNING:
+             if self.sky_down_status == common_utils.ProcessStatus.SCHEDULED:
+                 # sky.down is scheduled to run, but not started yet.
+                 return serve_state.ReplicaStatus.SHUTTING_DOWN
+             if self.sky_down_status == common_utils.ProcessStatus.RUNNING:
                  # sky.down is running
                  return serve_state.ReplicaStatus.SHUTTING_DOWN
-             if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+             if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
                  return serve_state.ReplicaStatus.SHUTTING_DOWN
-             if self.sky_down_status == ProcessStatus.FAILED:
+             if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                  # sky.down failed
                  return serve_state.ReplicaStatus.FAILED_CLEANUP
              if self.user_app_failed:
                  # Failed on user setup/run
                  return serve_state.ReplicaStatus.FAILED
-             if self.sky_launch_status == ProcessStatus.FAILED:
+             if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
                  # sky.launch failed
                  return serve_state.ReplicaStatus.FAILED_PROVISION
              if self.first_ready_time is None:
@@ -360,18 +413,18 @@ class ReplicaStatusProperty:
              # This indicate it is a scale_down with correct teardown.
              # Should have been cleaned from the replica table.
              return serve_state.ReplicaStatus.UNKNOWN
-         if self.sky_launch_status == ProcessStatus.FAILED:
+         if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
              # sky.launch failed
-             # The down process has not been started if it reaches here,
+             # The down thread has not been started if it reaches here,
              # due to the `if self.sky_down_status is not None`` check above.
-             # However, it should have been started by _refresh_process_pool.
+             # However, it should have been started by _refresh_thread_pool.
              # If not started, this means some bug prevent sky.down from
              # executing. It is also a potential resource leak, so we mark
              # it as FAILED_CLEANUP.
              return serve_state.ReplicaStatus.FAILED_CLEANUP
          if self.user_app_failed:
              # Failed on user setup/run
-             # Same as above, the down process should have been started.
+             # Same as above, the down thread should have been started.
              return serve_state.ReplicaStatus.FAILED_CLEANUP
          if self.service_ready_now:
              # Service is ready
@@ -421,11 +474,12 @@ class ReplicaInfo:
          based on the cluster name.
          """
          if cluster_record is None:
-             cluster_record = global_user_state.get_cluster_from_name(
+             handle = global_user_state.get_handle_from_cluster_name(
                  self.cluster_name)
-             if cluster_record is None:
+         else:
+             handle = cluster_record['handle']
+         if handle is None:
              return None
-         handle = cluster_record['handle']
          assert isinstance(handle, backends.CloudVmRayResourceHandle)
          return handle

@@ -442,10 +496,16 @@ class ReplicaInfo:
          handle = self.handle()
          if handle is None:
              return None
+         if self.replica_port == '-':
+             # This is a pool replica so there is no endpoint and it's filled
+             # with this dummy value. We return None here so that we can
+             # get the active ready replicas and perform autoscaling. Otherwise,
+             # would error out when trying to get the endpoint.
+             return None
          replica_port_int = int(self.replica_port)
          try:
-             endpoint_dict = core.endpoints(handle.cluster_name,
-                                            replica_port_int)
+             endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
+                                                         replica_port_int)
          except exceptions.ClusterNotUpError:
              return None
          endpoint = endpoint_dict.get(replica_port_int, None)
@@ -465,26 +525,36 @@ class ReplicaInfo:
                             f'replica {self.replica_id}.')
          return replica_status

-     def to_info_dict(self, with_handle: bool) -> Dict[str, Any]:
+     def to_info_dict(self,
+                      with_handle: bool,
+                      with_url: bool = True) -> Dict[str, Any]:
          cluster_record = global_user_state.get_cluster_from_name(
-             self.cluster_name)
+             self.cluster_name, include_user_info=False, summary_response=True)
          info_dict = {
              'replica_id': self.replica_id,
              'name': self.cluster_name,
              'status': self.status,
              'version': self.version,
-             'endpoint': self.url,
+             'endpoint': self.url if with_url else None,
              'is_spot': self.is_spot,
              'launched_at': (cluster_record['launched_at']
                              if cluster_record is not None else None),
          }
          if with_handle:
-             info_dict['handle'] = self.handle(cluster_record)
+             handle = self.handle(cluster_record)
+             info_dict['handle'] = handle
+             if handle is not None:
+                 info_dict['cloud'] = repr(handle.launched_resources.cloud)
+                 info_dict['region'] = handle.launched_resources.region
+                 info_dict['resources_str'] = (
+                     resources_utils.get_readable_resources_repr(
+                         handle, simplified_only=True)[0])
          return info_dict

      def __repr__(self) -> str:
-         info_dict = self.to_info_dict(
-             with_handle=env_options.Options.SHOW_DEBUG_INFO.get())
+         show_details = env_options.Options.SHOW_DEBUG_INFO.get()
+         info_dict = self.to_info_dict(with_handle=show_details,
+                                       with_url=show_details)
          handle_str = ''
          if 'handle' in info_dict:
              handle_str = f', handle={info_dict["handle"]}'
@@ -498,6 +568,33 @@
                  f'launched_at={info_dict["launched_at"]}{handle_str})')
          return info

+     def probe_pool(self) -> Tuple['ReplicaInfo', bool, float]:
+         """Probe the replica for pool management.
+
+         This function will check the first job status of the cluster, which is a
+         dummy job that only echoes "setup done". The success of this job means
+         the setup command is done and the replica is ready to be used. Check
+         sky/serve/server/core.py::up for more details.
+
+         Returns:
+             Tuple of (self, is_ready, probe_time).
+         """
+         probe_time = time.time()
+         try:
+             handle = backend_utils.check_cluster_available(
+                 self.cluster_name, operation='probing pool')
+             if handle is None:
+                 return self, False, probe_time
+             backend = backend_utils.get_backend_from_handle(handle)
+             statuses = backend.get_job_status(handle, [1], stream_logs=False)
+             if statuses[1] == job_lib.JobStatus.SUCCEEDED:
+                 return self, True, probe_time
+             return self, False, probe_time
+         except Exception as e:  # pylint: disable=broad-except
+             logger.error(f'Error when probing pool of {self.cluster_name}: '
+                          f'{common_utils.format_exception(e)}.')
+             return self, False, probe_time
+
      def probe(
          self,
          readiness_path: str,
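
The new `probe_pool` follows the same conventions as the HTTP `probe` below it: the timestamp is captured before the (possibly slow) status check so staleness is measured from when probing began, and any exception is swallowed and reported as not-ready rather than crashing the prober thread. A simplified sketch of that contract, with the job-status lookup faked (all names are illustrative):

```python
# Sketch of the probe_pool contract: return (replica, is_ready, probe_time),
# capture probe_time before the check, and treat any failure as "not ready".
# The job-status lookup is faked here for illustration.
import time
from typing import Tuple

class PoolReplica:

    def __init__(self, replica_id: int, setup_done: bool) -> None:
        self.replica_id = replica_id
        self._setup_done = setup_done

    def _first_job_succeeded(self) -> bool:
        # Stand-in for a backend job-status query: job 1 is the dummy
        # "setup done" job, so its success marks the replica as usable.
        return self._setup_done

    def probe_pool(self) -> Tuple['PoolReplica', bool, float]:
        probe_time = time.time()  # measured at the start of the probe
        try:
            return self, self._first_job_succeeded(), probe_time
        except Exception:  # a broken probe means "not ready", never a crash
            return self, False, probe_time

replicas = [PoolReplica(1, True), PoolReplica(2, False)]
results = [r.probe_pool() for r in replicas]
ready_ids = [info.replica_id for info, is_ready, _ in results if is_ready]
assert ready_ids == [1]
```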
@@ -580,13 +677,14 @@ class ReplicaInfo:
  class ReplicaManager:
      """Each replica manager monitors one service."""

-     def __init__(self, service_name: str,
-                  spec: 'service_spec.SkyServiceSpec') -> None:
+     def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
+                  version: int) -> None:
          self.lock = threading.Lock()
          self._next_replica_id: int = 1
          self._service_name: str = service_name
          self._uptime: Optional[float] = None
          self._update_mode = serve_utils.DEFAULT_UPDATE_MODE
+         self._is_pool: bool = spec.pool
          header_keys = None
          if spec.readiness_headers is not None:
              header_keys = list(spec.readiness_headers.keys())
@@ -596,9 +694,18 @@ class ReplicaManager:
                      f'Readiness header keys: {header_keys}')

          # Newest version among the currently provisioned and launched replicas
-         self.latest_version: int = serve_constants.INITIAL_VERSION
+         self.latest_version: int = version
          # Oldest version among the currently provisioned and launched replicas
-         self.least_recent_version: int = serve_constants.INITIAL_VERSION
+         self.least_recent_version: int = version
+
+     def _consecutive_failure_threshold_timeout(self) -> int:
+         """The timeout for the consecutive failure threshold in seconds.
+
+         We reduce the timeout for pool to 10 seconds to make the pool more
+         responsive to the failure.
+         """
+         # TODO(tian): Maybe let user determine this threshold
+         return 10 if self._is_pool else 180

      def scale_up(self,
                   resources_override: Optional[Dict[str, Any]] = None) -> None:
@@ -625,8 +732,8 @@ class SkyPilotReplicaManager(ReplicaManager):
      """Replica Manager for SkyPilot clusters.

      It will run three daemon to monitor the status of the replicas:
-     (1) _process_pool_refresher: Refresh the launch/down process pool
-         to monitor the progress of the launch/down process.
+     (1) _thread_pool_refresher: Refresh the launch/down thread pool
+         to monitor the progress of the launch/down thread.
      (2) _job_status_fetcher: Fetch the job status of the service to
          monitor the status of the service jobs.
      (3) _replica_prober: Do readiness probe to the replicas to monitor
@@ -634,40 +741,41 @@ class SkyPilotReplicaManager(ReplicaManager):
      """

      def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
-                  task_yaml_path: str) -> None:
-         super().__init__(service_name, spec)
-         self._task_yaml_path = task_yaml_path
-         task = sky.Task.from_yaml(task_yaml_path)
+                  version: int) -> None:
+         super().__init__(service_name, spec, version)
+         self.yaml_content = serve_state.get_yaml_content(service_name, version)
+         task = task_lib.Task.from_yaml_str(self.yaml_content)
          self._spot_placer: Optional[spot_placer.SpotPlacer] = (
              spot_placer.SpotPlacer.from_task(spec, task))
-         # TODO(tian): Store launch/down pid in the replica table, to make the
-         # manager more persistent. Current blocker is that we need to manually
-         # poll the Process (by join or is_launch), otherwise, it will never
-         # finish and become a zombie process. Probably we could use
-         # psutil.Process(p.pid).status() == psutil.STATUS_ZOMBIE to check
-         # such cases.
-         self._launch_process_pool: serve_utils.ThreadSafeDict[
-             int, multiprocessing.Process] = serve_utils.ThreadSafeDict()
-         self._down_process_pool: serve_utils.ThreadSafeDict[
-             int, multiprocessing.Process] = serve_utils.ThreadSafeDict()
-
-         threading.Thread(target=self._process_pool_refresher).start()
+         # TODO(tian): Store launch/down request id in the replica table, to make
+         # the manager more persistent.
+         self._launch_thread_pool: thread_utils.ThreadSafeDict[
+             int, thread_utils.SafeThread] = thread_utils.ThreadSafeDict()
+         self._replica_to_request_id: thread_utils.ThreadSafeDict[
+             int, str] = thread_utils.ThreadSafeDict()
+         self._replica_to_launch_cancelled: thread_utils.ThreadSafeDict[
+             int, bool] = thread_utils.ThreadSafeDict()
+         self._down_thread_pool: thread_utils.ThreadSafeDict[
+             int, thread_utils.SafeThread] = thread_utils.ThreadSafeDict()
+
+         threading.Thread(target=self._thread_pool_refresher).start()
          threading.Thread(target=self._job_status_fetcher).start()
          threading.Thread(target=self._replica_prober).start()

          self._recover_replica_operations()

+     @with_lock
      def _recover_replica_operations(self):
          """Let's see are there something to do for ReplicaManager in a
          recovery run"""
-         assert (not self._launch_process_pool and not self._down_process_pool
-                ), 'We should not have any running processes in a recovery run'
+         assert (not self._launch_thread_pool and not self._down_thread_pool
+                ), 'We should not have any running threads in a recovery run'

          # There is a FIFO queue with capacity _MAX_NUM_LAUNCH for
          # _launch_replica.
          # We prioritize PROVISIONING replicas since they were previously
          # launched but may have been interrupted and need to be restarted.
-         # This is why we process PENDING replicas only after PROVISIONING
+         # This is why we handle PENDING replicas only after PROVISIONING
          # replicas.
          to_up_replicas = serve_state.get_replicas_at_status(
              self._service_name, serve_state.ReplicaStatus.PROVISIONING)
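
The constructor above swaps per-replica `multiprocessing.Process` handles for `thread_utils.SafeThread` objects tracked in `ThreadSafeDict` containers, which also retires the zombie-process concern in the deleted TODO. A plausible minimal `ThreadSafeDict` is sketched below; this is an assumption about what `sky.utils.thread_utils` provides, not its actual source:

```python
# A plausible minimal ThreadSafeDict: each operation holds one lock. This is
# an assumption about sky.utils.thread_utils, not the actual SkyPilot source.
import threading
from typing import Dict, Generic, Optional, TypeVar

K = TypeVar('K')
V = TypeVar('V')

class ThreadSafeDict(Generic[K, V]):
    """Dict wrapper whose individual operations are serialized by a lock."""

    def __init__(self) -> None:
        self._dict: Dict[K, V] = {}
        self._lock = threading.Lock()

    def __setitem__(self, key: K, value: V) -> None:
        with self._lock:
            self._dict[key] = value

    def __contains__(self, key: K) -> bool:
        with self._lock:
            return key in self._dict

    def get(self, key: K, default: Optional[V] = None) -> Optional[V]:
        with self._lock:
            return self._dict.get(key, default)

    def pop(self, key: K, *default):
        with self._lock:
            return self._dict.pop(key, *default)

    def __bool__(self) -> bool:  # supports `not self._launch_thread_pool`
        with self._lock:
            return bool(self._dict)

pool: ThreadSafeDict[int, threading.Thread] = ThreadSafeDict()
pool[1] = threading.Thread(target=lambda: None)
assert 1 in pool and pool.pop(1) is not None
```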
@@ -697,16 +805,15 @@ class SkyPilotReplicaManager(ReplicaManager):
      # Replica management functions #
      ################################

-     # Adding lock here to make sure spot placer's current locations are
-     # consistent with the replicas' status.
-     @with_lock
+     # We don't need to add lock here since every caller of this function
+     # will acquire the lock.
      def _launch_replica(
          self,
          replica_id: int,
          resources_override: Optional[Dict[str, Any]] = None,
      ) -> None:
-         if replica_id in self._launch_process_pool:
-             logger.warning(f'Launch process for replica {replica_id} '
+         if replica_id in self._launch_thread_pool:
+             logger.warning(f'Launch thread for replica {replica_id} '
                             'already exists. Skipping.')
              return
          logger.info(f'Launching replica {replica_id}...')
@@ -714,7 +821,7 @@ class SkyPilotReplicaManager(ReplicaManager):
              self._service_name, replica_id)
          log_file_name = serve_utils.generate_replica_launch_log_file_name(
              self._service_name, replica_id)
-         use_spot = _should_use_spot(self._task_yaml_path, resources_override)
+         use_spot = _should_use_spot(self.yaml_content, resources_override)
          retry_until_up = True
          location = None
          if use_spot and self._spot_placer is not None:
@@ -737,28 +844,78 @@ class SkyPilotReplicaManager(ReplicaManager):
             location = self._spot_placer.select_next_location(
                 current_spot_locations)
             resources_override.update(location.to_dict())
-        p = multiprocessing.Process(
-            target=ux_utils.RedirectOutputForProcess(
-                launch_cluster,
-                log_file_name,
-            ).run,
-            args=(replica_id, self._task_yaml_path, cluster_name,
-                  resources_override, retry_until_up),
+        t = thread_utils.SafeThread(
+            target=launch_cluster,
+            args=(replica_id, self.yaml_content, cluster_name, log_file_name,
+                  self._replica_to_request_id,
+                  self._replica_to_launch_cancelled, resources_override,
+                  retry_until_up),
         )
-        replica_port = _get_resources_ports(self._task_yaml_path)
+        replica_port = _get_resources_ports(self.yaml_content)
 
         info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
                            location, self.latest_version, resources_override)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
-        # Don't start right now; we will start it later in _refresh_process_pool
+        # Don't start right now; we will start it later in _refresh_thread_pool
         # to avoid too many sky.launch running at the same time.
-        self._launch_process_pool[replica_id] = p
+        self._launch_thread_pool[replica_id] = t
 
+    @with_lock
     def scale_up(self,
                  resources_override: Optional[Dict[str, Any]] = None) -> None:
         self._launch_replica(self._next_replica_id, resources_override)
         self._next_replica_id += 1
 
+    def _handle_sky_down_finish(self, info: ReplicaInfo,
+                                format_exc: Optional[str]) -> None:
+        if format_exc is not None:
+            logger.error(f'Down thread for replica {info.replica_id} '
+                         f'exited abnormally with exception {format_exc}.')
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.FAILED)
+        else:
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.SUCCEEDED)
+        # A failed replica still counts as a replica. In our current design,
+        # we want to fail early if the user code has any error. This prevents
+        # an infinite loop of teardown and re-provision. However, there is a
+        # special case: if the replica has been UP for longer than
+        # initial_delay_seconds, we assume it is just some random failure and
+        # we should restart the replica. Please refer to the implementation of
+        # `is_scale_down_succeeded` for more details.
+        # TODO(tian): Currently, restarting replicas that failed within
+        # initial_delay_seconds is not supported. We should add it
+        # later when we support `sky serve update`.
+        removal_reason = None
+        if info.status_property.is_scale_down:
+            # This means the cluster is deleted due to an autoscaler
+            # decision or the cluster is recovering from preemption.
+            # Delete the replica info so it won't count as a replica.
+            if info.status_property.preempted:
+                removal_reason = 'for preemption recovery'
+            else:
+                removal_reason = 'normally'
+        # Don't keep a failed record for version-mismatched replicas,
+        # since the user should fix the error before updating.
+        elif info.version != self.latest_version:
+            removal_reason = 'for version outdated'
+        elif info.status_property.purged:
+            removal_reason = 'for purge'
+        elif info.status_property.failed_spot_availability:
+            removal_reason = 'for spot availability failure'
+        else:
+            logger.info(f'Termination of replica {info.replica_id} '
+                        'finished. Replica info is kept since some '
+                        'failure detected.')
+            serve_state.add_or_update_replica(self._service_name,
+                                              info.replica_id, info)
+        if removal_reason is not None:
+            serve_state.remove_replica(self._service_name, info.replica_id)
+            logger.info(f'Replica {info.replica_id} removed from the '
+                        f'replica table {removal_reason}.')
+
+    # We don't need to take the lock here since every caller of this
+    # function already acquires it.
     def _terminate_replica(self,
                            replica_id: int,
                            sync_down_logs: bool,
@@ -772,24 +929,55 @@ class SkyPilotReplicaManager(ReplicaManager):
                 'the logs should always be synced down. '
                 'So that the user can see the logs to debug.')
 
-        if replica_id in self._launch_process_pool:
+        if replica_id in self._launch_thread_pool:
             info = serve_state.get_replica_info_from_id(self._service_name,
                                                         replica_id)
             assert info is not None
-            info.status_property.sky_launch_status = ProcessStatus.INTERRUPTED
+            info.status_property.sky_launch_status = (
+                common_utils.ProcessStatus.INTERRUPTED)
             serve_state.add_or_update_replica(self._service_name, replica_id,
                                               info)
-            launch_process = self._launch_process_pool[replica_id]
-            if launch_process.is_alive():
-                assert launch_process.pid is not None
-                launch_process.terminate()
-                launch_process.join()
-            logger.info(f'Interrupted launch process for replica {replica_id} '
-                        'and deleted the cluster.')
-            del self._launch_process_pool[replica_id]
-
-        if replica_id in self._down_process_pool:
-            logger.warning(f'Terminate process for replica {replica_id} '
+            launch_thread = self._launch_thread_pool[replica_id]
+            if launch_thread.is_alive():
+                self._replica_to_launch_cancelled[replica_id] = True
+                start_wait_time = time.time()
+                timeout_reached = False
+                while True:
+                    # Launch request id found. Cancel it.
+                    if replica_id in self._replica_to_request_id:
+                        request_id = self._replica_to_request_id[replica_id]
+                        sdk.api_cancel(request_id)
+                        break
+                    if replica_id not in self._replica_to_launch_cancelled:
+                        # Indicates that the cancellation was received.
+                        break
+                    if not launch_thread.is_alive():
+                        # It's possible that the launch thread finished right
+                        # after we checked. Exit the loop now.
+                        break
+                    if (time.time() - start_wait_time >
+                            _WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS):
+                        timeout_reached = True
+                        break
+                    time.sleep(0.1)
+                if timeout_reached:
+                    logger.warning(
+                        'Failed to cancel launch request for replica '
+                        f'{replica_id} after '
+                        f'{_WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS} seconds. '
+                        'Force waiting the launch thread to finish.')
+                else:
+                    logger.info('Interrupted launch thread for replica '
+                                f'{replica_id} and deleted the cluster.')
+                launch_thread.join()
+            else:
+                logger.info(f'Launch thread for replica {replica_id} '
+                            'already finished. Delete the cluster now.')
+            self._launch_thread_pool.pop(replica_id)
+            self._replica_to_request_id.pop(replica_id)
+
+        if replica_id in self._down_thread_pool:
+            logger.warning(f'Terminate thread for replica {replica_id} '
                            'already exists. Skipping.')
             return
 
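The loop above replaces the old `Process.terminate()` with a cooperative handshake: the controller sets a cancelled flag, then polls until the launch thread either publishes a request id (which is cancelled server-side via `sdk.api_cancel`), consumes the flag before launching, or exits on its own, falling back to a plain `join()` on timeout. The same control flow as a standalone sketch, with hypothetical names (`request_ids`, `cancelled_flags`, `cancel_fn`) standing in for the real members:

```python
import time

_WAIT_TIMEOUT_SECONDS = 60  # assumed value, not taken from this diff


def cancel_launch(replica_id, thread, request_ids, cancelled_flags, cancel_fn):
    """Cooperatively cancel an in-flight launch; returns False on timeout."""
    cancelled_flags[replica_id] = True  # signal intent to the launch thread
    deadline = time.time() + _WAIT_TIMEOUT_SECONDS
    while time.time() < deadline:
        if replica_id in request_ids:
            # The launch thread published its request id: cancel server-side.
            cancel_fn(request_ids[replica_id])
            return True
        if replica_id not in cancelled_flags:
            # The launch thread consumed the flag before submitting a request.
            return True
        if not thread.is_alive():
            # The thread finished right after our last check.
            return True
        time.sleep(0.1)
    # Timed out; the caller falls back to joining the thread.
    return False
```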
@@ -820,9 +1008,9 @@ class SkyPilotReplicaManager(ReplicaManager):
             assert isinstance(handle, backends.CloudVmRayResourceHandle)
             replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                                 'replica_jobs')
-            job_log_file_name = (
-                controller_utils.download_and_stream_latest_job_log(
-                    backend, handle, replica_job_logs_dir))
+            job_ids = ['1'] if self._is_pool else None
+            job_log_file_name = controller_utils.download_and_stream_job_log(
+                backend, handle, replica_job_logs_dir, job_ids)
             if job_log_file_name is not None:
                 logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
                 with open(log_file_name, 'a',
@@ -848,18 +1036,30 @@ class SkyPilotReplicaManager(ReplicaManager):
 
         logger.info(f'preempted: {info.status_property.preempted}, '
                     f'replica_id: {replica_id}')
-        p = multiprocessing.Process(
-            target=ux_utils.RedirectOutputForProcess(terminate_cluster,
-                                                     log_file_name, 'a').run,
-            args=(info.cluster_name, replica_drain_delay_seconds),
-        )
-        info.status_property.sky_down_status = ProcessStatus.RUNNING
         info.status_property.is_scale_down = is_scale_down
         info.status_property.purged = purge
+
+        # If the cluster does not exist, it means either the cluster never
+        # existed (e.g., the cluster was scaled down before it got a chance to
+        # provision) or the cluster was preempted and cleaned up by the status
+        # refresh. In this case, we skip spawning a new down thread to save
+        # controller resources.
+        if not global_user_state.cluster_with_name_exists(info.cluster_name):
+            self._handle_sky_down_finish(info, format_exc=None)
+            return
+
+        # Otherwise, start the thread to terminate the cluster.
+        t = thread_utils.SafeThread(
+            target=terminate_cluster,
+            args=(info.cluster_name, log_file_name,
+                  replica_drain_delay_seconds),
+        )
+        info.status_property.sky_down_status = (
+            common_utils.ProcessStatus.SCHEDULED)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
-        p.start()
-        self._down_process_pool[replica_id] = p
+        self._down_thread_pool[replica_id] = t
 
+    @with_lock
     def scale_down(self, replica_id: int, purge: bool = False) -> None:
         self._terminate_replica(
             replica_id,
@@ -868,6 +1068,8 @@ class SkyPilotReplicaManager(ReplicaManager):
             is_scale_down=True,
             purge=purge)
 
+    # We don't need to take the lock here since every caller of this
+    # function already acquires it.
     def _handle_preemption(self, info: ReplicaInfo) -> bool:
         """Handle preemption of the replica if any error happened.
 
@@ -920,52 +1122,54 @@ class SkyPilotReplicaManager(ReplicaManager):
     #################################
 
     @with_lock
-    def _refresh_process_pool(self) -> None:
-        """Refresh the launch/down process pool.
+    def _refresh_thread_pool(self) -> None:
+        """Refresh the launch/down thread pool.
 
-        This function checks all sky.launch and sky.down processes on
+        This function checks all sky.launch and sky.down threads on
         the fly. If any of them has finished, it updates the status of the
         corresponding replica.
         """
         # To avoid a `dictionary changed size during iteration` error.
-        launch_process_pool_snapshot = list(self._launch_process_pool.items())
-        for replica_id, p in launch_process_pool_snapshot:
-            if not p.is_alive():
+        launch_thread_pool_snapshot = list(self._launch_thread_pool.items())
+        for replica_id, t in launch_thread_pool_snapshot:
+            if t.is_alive():
+                continue
+            with filelock.FileLock(controller_utils.get_resources_lock_path()):
                 info = serve_state.get_replica_info_from_id(
                     self._service_name, replica_id)
                 assert info is not None, replica_id
                 error_in_sky_launch = False
                 if info.status == serve_state.ReplicaStatus.PENDING:
                     # sky.launch not started yet
-                    if (serve_state.total_number_provisioning_replicas() <
-                            _MAX_NUM_LAUNCH):
-                        p.start()
+                    if controller_utils.can_provision(self._is_pool):
+                        t.start()
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.RUNNING)
+                            common_utils.ProcessStatus.RUNNING)
                 else:
                     # sky.launch finished
-                    # TODO(tian): Try-catch in process, and have an enum return
+                    # TODO(tian): Try-catch in thread, and have an enum return
                     # value to indicate which type of failure happened.
                     # Currently we only have user code failure since the
                     # retry_until_up flag is set to True, but it will be helpful
                     # when we let users choose whether to retry or not.
                     logger.info(
-                        f'Launch process for replica {replica_id} finished.')
-                    del self._launch_process_pool[replica_id]
-                    if p.exitcode != 0:
+                        f'Launch thread for replica {replica_id} finished.')
+                    self._launch_thread_pool.pop(replica_id)
+                    self._replica_to_request_id.pop(replica_id)
+                    if t.format_exc is not None:
                         logger.warning(
-                            f'Launch process for replica {replica_id} '
-                            f'exited abnormally with code {p.exitcode}.'
-                            ' Terminating...')
+                            f'Launch thread for replica {replica_id} '
+                            f'exited abnormally with exception '
+                            f'{t.format_exc}. Terminating...')
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.FAILED)
+                            common_utils.ProcessStatus.FAILED)
                         error_in_sky_launch = True
                     else:
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.SUCCEEDED)
+                            common_utils.ProcessStatus.SUCCEEDED)
                     if self._spot_placer is not None and info.is_spot:
                         # TODO(tian): Currently, we set the location to
-                        # preemptive if the launch process failed. This is
+                        # preemptive if the launch thread failed. This is
                         # because if the error is not related to the
                         # availability of the location, then all locations
                         # should fail for the same reason. So it does not matter
@@ -975,7 +1179,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                         # availability of the location later.
                         location = info.get_spot_location()
                         assert location is not None
-                        if p.exitcode != 0:
+                        if t.format_exc is not None:
                             self._spot_placer.set_preemptive(location)
                             info.status_property.failed_spot_availability = True
                         else:
@@ -988,61 +1192,27 @@ class SkyPilotReplicaManager(ReplicaManager):
                 self._terminate_replica(replica_id,
                                         sync_down_logs=True,
                                         replica_drain_delay_seconds=0)
-        down_process_pool_snapshot = list(self._down_process_pool.items())
-        for replica_id, p in down_process_pool_snapshot:
-            if not p.is_alive():
-                logger.info(
-                    f'Terminate process for replica {replica_id} finished.')
-                del self._down_process_pool[replica_id]
-                info = serve_state.get_replica_info_from_id(
-                    self._service_name, replica_id)
-                assert info is not None, replica_id
-                if p.exitcode != 0:
-                    logger.error(f'Down process for replica {replica_id} '
-                                 f'exited abnormally with code {p.exitcode}.')
-                    info.status_property.sky_down_status = (
-                        ProcessStatus.FAILED)
-                else:
+        down_thread_pool_snapshot = list(self._down_thread_pool.items())
+        for replica_id, t in down_thread_pool_snapshot:
+            if t.is_alive():
+                continue
+            info = serve_state.get_replica_info_from_id(self._service_name,
+                                                        replica_id)
+            assert info is not None, replica_id
+            if (info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                # sky.down not started yet
+                if controller_utils.can_terminate(self._is_pool):
+                    t.start()
                     info.status_property.sky_down_status = (
-                        ProcessStatus.SUCCEEDED)
-                # Failed replica still count as a replica. In our current
-                # design, we want to fail early if user code have any error.
-                # This will prevent infinite loop of teardown and
-                # re-provision. However, there is a special case that if the
-                # replica is UP for longer than initial_delay_seconds, we
-                # assume it is just some random failure and we should restart
-                # the replica. Please refer to the implementation of
-                # `is_scale_down_succeeded` for more details.
-                # TODO(tian): Currently, restart replicas that failed within
-                # initial_delay_seconds is not supported. We should add it
-                # later when we support `sky serve update`.
-                removal_reason = None
-                if info.status_property.is_scale_down:
-                    # This means the cluster is deleted due to an autoscaler
-                    # decision or the cluster is recovering from preemption.
-                    # Delete the replica info so it won't count as a replica.
-                    if info.status_property.preempted:
-                        removal_reason = 'for preemption recovery'
-                    else:
-                        removal_reason = 'normally'
-                # Don't keep failed record for version mismatch replicas,
-                # since user should fixed the error before update.
-                elif info.version != self.latest_version:
-                    removal_reason = 'for version outdated'
-                elif info.status_property.purged:
-                    removal_reason = 'for purge'
-                elif info.status_property.failed_spot_availability:
-                    removal_reason = 'for spot availability failure'
-                else:
-                    logger.info(f'Termination of replica {replica_id} '
-                                'finished. Replica info is kept since some '
-                                'failure detected.')
+                        common_utils.ProcessStatus.RUNNING)
                 serve_state.add_or_update_replica(self._service_name,
                                                   replica_id, info)
-                if removal_reason is not None:
-                    serve_state.remove_replica(self._service_name, replica_id)
-                    logger.info(f'Replica {replica_id} removed from the '
-                                f'replica table {removal_reason}.')
+            else:
+                logger.info(
+                    f'Terminate thread for replica {replica_id} finished.')
+                self._down_thread_pool.pop(replica_id)
+                self._handle_sky_down_finish(info, format_exc=t.format_exc)
 
         # Clean old version
         replica_infos = serve_state.get_replica_infos(self._service_name)
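Both pools follow a deferred-start pattern: `SafeThread`s are created eagerly but only started once an admission check (`controller_utils.can_provision` / `can_terminate` in this diff) allows it, and finished threads are harvested on the next pass. A compact sketch of one refresh pass; it uses `Thread.ident` to distinguish never-started from finished, whereas the code above keys off replica status:

```python
def refresh_pool(pool, can_start, on_finished):
    """One pass over a deferred-start thread pool (illustrative only).

    pool: dict mapping replica_id -> SafeThread; can_start: admission
    check; on_finished: callback taking (replica_id, format_exc).
    """
    for replica_id, thread in list(pool.items()):
        if thread.is_alive():
            continue  # still working; check again on the next pass
        if thread.ident is None:
            # Never started: start only if within the concurrency budget.
            if can_start():
                thread.start()
        else:
            # Started and finished: harvest the result and drop the entry.
            pool.pop(replica_id)
            on_finished(replica_id, thread.format_exc)
```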
@@ -1052,25 +1222,25 @@ class SkyPilotReplicaManager(ReplicaManager):
         if self.least_recent_version < current_least_recent_version:
             for version in range(self.least_recent_version,
                                  current_least_recent_version):
-                task_yaml = serve_utils.generate_task_yaml_file_name(
+                yaml_content = serve_utils.get_yaml_content(
                     self._service_name, version)
                 # Delete old version metadata.
                 serve_state.delete_version(self._service_name, version)
                 # Delete storage buckets of older versions.
-                service.cleanup_storage(task_yaml)
+                service.cleanup_storage(yaml_content)
             # newest version will be cleaned in serve down
             self.least_recent_version = current_least_recent_version
 
-    def _process_pool_refresher(self) -> None:
-        """Periodically refresh the launch/down process pool."""
+    def _thread_pool_refresher(self) -> None:
+        """Periodically refresh the launch/down thread pool."""
         while True:
-            logger.debug('Refreshing process pool.')
+            logger.debug('Refreshing thread pool.')
             try:
-                self._refresh_process_pool()
+                self._refresh_thread_pool()
             except Exception as e:  # pylint: disable=broad-except
                 # No matter what error happens, we should keep the
-                # process pool refresher running.
-                logger.error('Error in process pool refresher: '
+                # thread pool refresher running.
+                logger.error('Error in thread pool refresher: '
                              f'{common_utils.format_exception(e)}')
                 with ux_utils.enable_traceback():
                     logger.error(f'  Traceback: {traceback.format_exc()}')
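The refresher wraps each pass in a broad try/except so a single failed refresh never kills the background loop, a common pattern for controller daemons. Roughly, with an assumed interval:

```python
import time
import traceback


def refresher_loop(refresh_fn, interval_seconds=20):
    """Call refresh_fn forever, surviving any single failure (sketch)."""
    while True:
        try:
            refresh_fn()
        except Exception:  # pylint: disable=broad-except
            # Log and keep the daemon alive no matter what went wrong.
            traceback.print_exc()
        time.sleep(interval_seconds)
```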
@@ -1098,9 +1268,10 @@ class SkyPilotReplicaManager(ReplicaManager):
             handle = info.handle()
             assert handle is not None, info
             # Use None to fetch the latest job, which is the user's task job.
+            job_ids = [1] if self._is_pool else None
             try:
                 job_statuses = backend.get_job_status(handle,
-                                                      None,
+                                                      job_ids,
                                                       stream_logs=False)
             except exceptions.CommandError:
                 # If the job status fetch failed, it is likely that the
@@ -1110,7 +1281,8 @@ class SkyPilotReplicaManager(ReplicaManager):
                     continue
                 # Re-raise the exception if it is not preempted.
                 raise
-            job_status = list(job_statuses.values())[0]
+            job_status = job_statuses[1] if self._is_pool else list(
+                job_statuses.values())[0]
             if job_status in job_lib.JobStatus.user_code_failure_states():
                 info.status_property.user_app_failed = True
                 serve_state.add_or_update_replica(self._service_name,
@@ -1154,18 +1326,24 @@ class SkyPilotReplicaManager(ReplicaManager):
             for info in infos:
                 if not info.status_property.should_track_service_status():
                     continue
-                replica_to_probe.append(
-                    f'replica_{info.replica_id}(url={info.url})')
-                probe_futures.append(
-                    pool.apply_async(
-                        info.probe,
-                        (
-                            self._get_readiness_path(info.version),
-                            self._get_post_data(info.version),
-                            self._get_readiness_timeout_seconds(info.version),
-                            self._get_readiness_headers(info.version),
-                        ),
-                    ),)
+                if self._is_pool:
+                    replica_to_probe.append(f'replica_{info.replica_id}(cluster'
+                                            f'_name={info.cluster_name})')
+                    probe_futures.append(pool.apply_async(info.probe_pool))
+                else:
+                    replica_to_probe.append(
+                        f'replica_{info.replica_id}(url={info.url})')
+                    probe_futures.append(
+                        pool.apply_async(
+                            info.probe,
+                            (
+                                self._get_readiness_path(info.version),
+                                self._get_post_data(info.version),
+                                self._get_readiness_timeout_seconds(
+                                    info.version),
+                                self._get_readiness_headers(info.version),
+                            ),
+                        ),)
             logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}')
 
             # Since futures.as_completed will return futures in the order of
@@ -1202,8 +1380,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                         consecutive_failure_time = (
                             info.consecutive_failure_times[-1] -
                             info.consecutive_failure_times[0])
-                        if (consecutive_failure_time >=
-                                _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT):
+                        failure_threshold = (
+                            self._consecutive_failure_threshold_timeout())
+                        if consecutive_failure_time >= failure_threshold:
                             logger.info(
                                 f'Replica {info.replica_id} is not ready for '
                                 'too long and exceeding consecutive failure '
@@ -1214,8 +1393,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                                 f'Replica {info.replica_id} is not ready '
                                 'but within consecutive failure threshold '
                                 f'({consecutive_failure_time}s / '
-                                f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). '
-                                'Skipping.')
+                                f'{failure_threshold}s). Skipping.')
                     else:
                         initial_delay_seconds = self._get_initial_delay_seconds(
                             info.version)
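A replica is only torn down for failing probes when the span between its first and last back-to-back failures exceeds the threshold, now fetched via `self._consecutive_failure_threshold_timeout()` instead of a module constant. The decision reduces to a window check, sketched here under the assumption that the timestamp list is cleared whenever a probe succeeds:

```python
from typing import List


def should_terminate(consecutive_failure_times: List[float],
                     threshold_seconds: float) -> bool:
    """True if consecutive probe failures span at least the threshold."""
    if not consecutive_failure_times:
        return False  # no outstanding failures
    window = (consecutive_failure_times[-1] -
              consecutive_failure_times[0])
    return window >= threshold_seconds
```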
@@ -1290,11 +1468,9 @@ class SkyPilotReplicaManager(ReplicaManager):
             logger.error(f'Invalid version: {version}, '
                          f'latest version: {self.latest_version}')
             return
-        task_yaml_path = serve_utils.generate_task_yaml_file_name(
-            self._service_name, version)
-        serve_state.add_or_update_version(self._service_name, version, spec)
+        yaml_content = serve_state.get_yaml_content(self._service_name, version)
         self.latest_version = version
-        self._task_yaml_path = task_yaml_path
+        self.yaml_content = yaml_content
         self._update_mode = update_mode
 
         # Reuse all replicas that have the same config as the new version
@@ -1302,32 +1478,37 @@ class SkyPilotReplicaManager(ReplicaManager):
         # the latest version. This can significantly improve the speed
         # for updating an existing service with only config changes to the
         # service specs, e.g. scale down the service.
-        new_config = common_utils.read_yaml(os.path.expanduser(task_yaml_path))
+        new_config = yaml_utils.safe_load(yaml_content)
         # Always create new replicas and scale down old ones when file_mounts
         # are not empty.
         if new_config.get('file_mounts', None) != {}:
             return
-        for key in ['service']:
-            new_config.pop(key)
+        for key in ['service', 'pool', '_user_specified_yaml']:
+            new_config.pop(key, None)
+        new_config_any_of = new_config.get('resources', {}).pop('any_of', [])
+
         replica_infos = serve_state.get_replica_infos(self._service_name)
         for info in replica_infos:
             if info.version < version and not info.is_terminal:
                 # Assume user does not change the yaml file on the controller.
-                old_task_yaml_path = serve_utils.generate_task_yaml_file_name(
+                old_yaml_content = serve_state.get_yaml_content(
                     self._service_name, info.version)
-                old_config = common_utils.read_yaml(
-                    os.path.expanduser(old_task_yaml_path))
-                for key in ['service']:
-                    old_config.pop(key)
+                old_config = yaml_utils.safe_load(old_yaml_content)
+                for key in ['service', 'pool', '_user_specified_yaml']:
+                    old_config.pop(key, None)
                 # Bump the replica version if all fields except for service
                 # are the same.
                 # Compare the any_of field in an order-insensitive way, so
                 # that differing only in the random ordering of the any_of
                 # entries does not count as a config change.
                 old_config_any_of = old_config.get('resources',
                                                    {}).pop('any_of', [])
-                new_config_any_of = new_config.get('resources',
-                                                   {}).pop('any_of', [])
-                if set(old_config_any_of) != set(new_config_any_of):
+
+                if (resources_utils.normalize_any_of_resources_config(
+                        old_config_any_of) != resources_utils.
+                        normalize_any_of_resources_config(new_config_any_of)):
+                    logger.info('Replica config changed (any_of), skipping. '
+                                f'old: {old_config_any_of}, '
+                                f'new: {new_config_any_of}')
                     continue
                 # File mounts should both be empty, as update always
                 # creates new buckets if they are not empty.
@@ -1341,6 +1522,10 @@ class SkyPilotReplicaManager(ReplicaManager):
                 info.version = version
                 serve_state.add_or_update_replica(self._service_name,
                                                   info.replica_id, info)
+            else:
+                logger.info('Replica config changed (rest), skipping. '
+                            f'old: {old_config}, '
+                            f'new: {new_config}')
 
     def _get_version_spec(self, version: int) -> 'service_spec.SkyServiceSpec':
         spec = serve_state.get_spec(self._service_name, version)
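The reuse check in `_update_version` loads the old and new YAML, strips the service-level keys (`service`, `pool`, `_user_specified_yaml`), compares `resources.any_of` order-insensitively, and only then bumps matching replicas in place. A self-contained approximation using PyYAML, where `normalize_any_of` is a stand-in for `resources_utils.normalize_any_of_resources_config`:

```python
import yaml


def configs_match(old_yaml: str, new_yaml: str) -> bool:
    """Whether two task YAMLs differ only in service-level fields (sketch)."""

    def normalize_any_of(any_of):
        # Order-insensitive comparison of candidate resource configs:
        # serialize each candidate deterministically, then sort.
        return sorted(
            yaml.safe_dump(cfg, sort_keys=True) for cfg in any_of)

    old = yaml.safe_load(old_yaml) or {}
    new = yaml.safe_load(new_yaml) or {}
    for key in ('service', 'pool', '_user_specified_yaml'):
        old.pop(key, None)
        new.pop(key, None)
    old_any_of = old.get('resources', {}).pop('any_of', [])
    new_any_of = new.get('resources', {}).pop('any_of', [])
    if normalize_any_of(old_any_of) != normalize_any_of(new_any_of):
        return False
    # Everything else (entrypoint, file_mounts, plain resources, ...) must
    # match exactly for the replica to be reused.
    return old == new
```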