skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,9 +1,9 @@
  """ReplicaManager: handles the creation and deletion of endpoint replicas."""
  import dataclasses
  import functools
- import multiprocessing
  from multiprocessing import pool as mp_pool
  import os
+ import pathlib
  import threading
  import time
  import traceback
@@ -15,14 +15,12 @@ import filelock
  import requests

  from sky import backends
- from sky import core
  from sky import exceptions
- from sky import execution
  from sky import global_user_state
  from sky import sky_logging
  from sky import task as task_lib
  from sky.backends import backend_utils
- from sky.jobs import scheduler as jobs_scheduler
+ from sky.client import sdk
  from sky.serve import constants as serve_constants
  from sky.serve import serve_state
  from sky.serve import serve_utils
@@ -32,14 +30,18 @@ from sky.skylet import constants
  from sky.skylet import job_lib
  from sky.usage import usage_lib
  from sky.utils import common_utils
+ from sky.utils import context
  from sky.utils import controller_utils
  from sky.utils import env_options
  from sky.utils import resources_utils
  from sky.utils import status_lib
+ from sky.utils import thread_utils
  from sky.utils import ux_utils
  from sky.utils import yaml_utils

  if typing.TYPE_CHECKING:
+     import logging
+
      from sky.serve import service_spec

  logger = sky_logging.init_logger(__name__)
@@ -48,6 +50,7 @@ _JOB_STATUS_FETCH_INTERVAL = 30
  _PROCESS_POOL_REFRESH_INTERVAL = 20
  _RETRY_INIT_GAP_SECONDS = 60
  _DEFAULT_DRAIN_SECONDS = 120
+ _WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS = 15

  # TODO(tian): Backward compatibility. Remove this after 3 minor release, i.e.
  # 0.13.0. We move the ProcessStatus to common_utils.ProcessStatus in #6666, but
@@ -59,9 +62,15 @@ ProcessStatus = common_utils.ProcessStatus

  # TODO(tian): Combine this with
  # sky/spot/recovery_strategy.py::StrategyExecutor::launch
+ # Use context.contextual to enable per-launch output redirection.
+ @context.contextual
  def launch_cluster(replica_id: int,
-                    service_task_yaml_path: str,
+                    yaml_content: str,
                     cluster_name: str,
+                    log_file: str,
+                    replica_to_request_id: thread_utils.ThreadSafeDict[int, str],
+                    replica_to_launch_cancelled: thread_utils.ThreadSafeDict[
+                        int, bool],
                     resources_override: Optional[Dict[str, Any]] = None,
                     retry_until_up: bool = True,
                     max_retry: int = 3) -> None:
@@ -75,14 +84,16 @@ def launch_cluster(replica_id: int,
      or some error happened before provisioning and will happen again
      if retry.
      """
+     ctx = context.get()
+     assert ctx is not None, 'Context is not initialized'
+     ctx.redirect_log(pathlib.Path(log_file))
+
      if resources_override is not None:
          logger.info(f'Scaling up replica (id: {replica_id}) cluster '
                      f'{cluster_name} with resources override: '
                      f'{resources_override}')
      try:
-         config = yaml_utils.read_yaml(
-             os.path.expanduser(service_task_yaml_path))
-         task = task_lib.Task.from_yaml_config(config)
+         task = task_lib.Task.from_yaml_str(yaml_content)
          if resources_override is not None:
              resources = task.resources
              overrided_resources = [
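
The `@context.contextual` decorator and `ctx.redirect_log(...)` calls above replace the old `ux_utils.RedirectOutputForProcess` wrapper: replicas are now launched from threads of the controller process rather than child processes, so each launch has to route its own output to its own log file. A minimal sketch of the idea using stdlib `contextvars` (hypothetical helper names; SkyPilot's actual `sky.utils.context` API differs in detail):

    # Sketch: per-thread log routing via contextvars. `contextual`,
    # `redirect_log` and `log` are illustrative stand-ins, not sky APIs.
    import contextvars
    import pathlib
    import threading

    _log_file = contextvars.ContextVar('log_file')

    def contextual(func):
        # Run func in a copied Context so ContextVar writes made inside
        # one call never leak into other threads or calls.
        def wrapper(*args, **kwargs):
            return contextvars.copy_context().run(func, *args, **kwargs)
        return wrapper

    def redirect_log(path: pathlib.Path) -> None:
        _log_file.set(path)

    def log(msg: str) -> None:
        # Writes to whatever file this call's own context registered.
        with open(_log_file.get(), 'a', encoding='utf-8') as f:
            f.write(msg + '\n')

    @contextual
    def worker(replica_id: int) -> None:
        redirect_log(pathlib.Path(f'/tmp/replica_{replica_id}.log'))
        log(f'launching replica {replica_id}')

    for i in range(2):
        threading.Thread(target=worker, args=(i,)).start()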
@@ -99,16 +110,31 @@
          raise RuntimeError(
              f'Failed to launch the sky serve replica cluster {cluster_name} '
              'due to failing to initialize sky.Task from yaml file.') from e
+
+     def _check_is_cancelled() -> bool:
+         is_cancelled = replica_to_launch_cancelled.get(replica_id, False)
+         if is_cancelled:
+             logger.info(f'Replica {replica_id} launch cancelled.')
+             # Pop the value to indicate that the signal was received.
+             replica_to_launch_cancelled.pop(replica_id)
+         return is_cancelled
+
      retry_cnt = 0
      backoff = common_utils.Backoff(_RETRY_INIT_GAP_SECONDS)
      while True:
          retry_cnt += 1
          try:
+             if _check_is_cancelled():
+                 return
              usage_lib.messages.usage.set_internal()
-             execution.launch(task,
-                              cluster_name,
-                              retry_until_up=retry_until_up,
-                              _is_launched_by_sky_serve_controller=True)
+             request_id = sdk.launch(task,
+                                     cluster_name,
+                                     retry_until_up=retry_until_up,
+                                     _is_launched_by_sky_serve_controller=True)
+             logger.info(f'Replica cluster {cluster_name} launch requested '
+                         f'with request_id: {request_id}.')
+             replica_to_request_id[replica_id] = request_id
+             sdk.stream_and_get(request_id)
              logger.info(f'Replica cluster {cluster_name} launched.')
          except (exceptions.InvalidClusterNameError,
                  exceptions.NoCloudAccessError,
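
The switch from the in-process `execution.launch(...)` call to `sdk.launch(...)` plus `sdk.stream_and_get(...)` moves replica provisioning onto the client SDK's asynchronous request model: `sdk.launch` submits the work to the API server and returns a request id immediately, while `sdk.stream_and_get` streams the request's logs and blocks until it completes. Recording the id in `replica_to_request_id` is what later lets `_terminate_replica` abort an in-flight launch via `sdk.api_cancel`. The same pattern in ordinary client code, as a hedged sketch (exact signatures vary across SkyPilot versions):

    # Sketch of the async request pattern; treat signatures as indicative.
    import sky
    from sky.client import sdk

    task = sky.Task(run='echo hello')
    request_id = sdk.launch(task, cluster_name='my-cluster')
    print(f'launch submitted, request id: {request_id}')
    try:
        sdk.stream_and_get(request_id)  # stream logs, block until done
    except KeyboardInterrupt:
        sdk.api_cancel(request_id)      # the request outlives the client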
@@ -133,22 +159,44 @@
          else:  # No exception, the launch succeeds.
              return

-         terminate_cluster(cluster_name)
+         # Cleanup the request id and the failed cluster.
+         replica_to_request_id.pop(replica_id)
+         # If it is cancelled, no need to terminate the cluster. It will be
+         # handled by the termination thread.
+         if _check_is_cancelled():
+             return
+         terminate_cluster(cluster_name, log_file=log_file)
+
          if retry_cnt >= max_retry:
              raise RuntimeError('Failed to launch the sky serve replica cluster '
                                 f'{cluster_name} after {max_retry} retries.')
+
          gap_seconds = backoff.current_backoff()
          logger.info('Retrying to launch the sky serve replica cluster '
                      f'in {gap_seconds:.1f} seconds.')
-         time.sleep(gap_seconds)
+         start_backoff = time.time()
+         # Check if it is cancelled every 0.1 seconds.
+         while time.time() - start_backoff < gap_seconds:
+             if _check_is_cancelled():
+                 return
+             time.sleep(0.1)


  # TODO(tian): Combine this with
  # sky/spot/recovery_strategy.py::terminate_cluster
+ @context.contextual
  def terminate_cluster(cluster_name: str,
+                       log_file: str,
                        replica_drain_delay_seconds: int = 0,
                        max_retry: int = 3) -> None:
      """Terminate the sky serve replica cluster."""
+     # Setup logging redirection.
+     ctx = context.get()
+     assert ctx is not None, 'Context is not initialized'
+     ctx.redirect_log(pathlib.Path(log_file))
+
+     logger.info(f'Terminating replica cluster {cluster_name} with '
+                 f'replica_drain_delay_seconds: {replica_drain_delay_seconds}')
      time.sleep(replica_drain_delay_seconds)
      retry_cnt = 0
      backoff = common_utils.Backoff()
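
Replacing the single `time.sleep(gap_seconds)` with a 0.1-second polling loop bounds how long a backoff wait can ignore a pending cancellation. The same idea as a standalone helper (a sketch; `should_cancel` is any caller-supplied predicate):

    # Sketch: an interruptible backoff sleep, mirroring the loop above.
    import time
    from typing import Callable

    def cancellable_sleep(seconds: float,
                          should_cancel: Callable[[], bool],
                          poll_interval: float = 0.1) -> bool:
        # Returns True if cancelled early, False if the delay elapsed.
        deadline = time.time() + seconds
        while time.time() < deadline:
            if should_cancel():
                return True
            time.sleep(poll_interval)
        return False

`threading.Event.wait(timeout)` would avoid polling altogether, but the diff keeps a dict-backed flag because popping the key doubles as an acknowledgement back to the controller.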
@@ -156,7 +204,10 @@ def terminate_cluster(cluster_name: str,
          retry_cnt += 1
          try:
              usage_lib.messages.usage.set_internal()
-             core.down(cluster_name)
+             logger.info(f'Sending down request to cluster {cluster_name}')
+             request_id = sdk.down(cluster_name)
+             sdk.stream_and_get(request_id)
+             logger.info(f'Replica cluster {cluster_name} terminated.')
              return
          except ValueError:
              # The cluster is already terminated.
@@ -176,9 +227,9 @@
          time.sleep(gap_seconds)


- def _get_resources_ports(service_task_yaml_path: str) -> str:
+ def _get_resources_ports(yaml_content: str) -> str:
      """Get the resources ports used by the task."""
-     task = task_lib.Task.from_yaml(service_task_yaml_path)
+     task = task_lib.Task.from_yaml_str(yaml_content)
      # Already checked all ports are valid in sky.serve.core.up
      assert task.resources, task
      assert task.service is not None, task
@@ -188,7 +239,7 @@ def _get_resources_ports(service_task_yaml_path: str) -> str:
      return task.service.ports


- def _should_use_spot(service_task_yaml_path: str,
+ def _should_use_spot(yaml_content: str,
                       resource_override: Optional[Dict[str, Any]]) -> bool:
      """Get whether the task should use spot."""
      if resource_override is not None:
@@ -196,7 +247,7 @@
          if use_spot_override is not None:
              assert isinstance(use_spot_override, bool)
              return use_spot_override
-     task = task_lib.Task.from_yaml(service_task_yaml_path)
+     task = task_lib.Task.from_yaml_str(yaml_content)
      spot_use_resources = [
          resources for resources in task.resources if resources.use_spot
      ]
@@ -364,16 +415,16 @@ class ReplicaStatusProperty:
              return serve_state.ReplicaStatus.UNKNOWN
          if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
              # sky.launch failed
-             # The down process has not been started if it reaches here,
+             # The down thread has not been started if it reaches here,
              # due to the `if self.sky_down_status is not None`` check above.
-             # However, it should have been started by _refresh_process_pool.
+             # However, it should have been started by _refresh_thread_pool.
              # If not started, this means some bug prevent sky.down from
              # executing. It is also a potential resource leak, so we mark
              # it as FAILED_CLEANUP.
              return serve_state.ReplicaStatus.FAILED_CLEANUP
          if self.user_app_failed:
              # Failed on user setup/run
-             # Same as above, the down process should have been started.
+             # Same as above, the down thread should have been started.
              return serve_state.ReplicaStatus.FAILED_CLEANUP
          if self.service_ready_now:
              # Service is ready
@@ -423,11 +474,12 @@ class ReplicaInfo:
          based on the cluster name.
          """
          if cluster_record is None:
-             cluster_record = global_user_state.get_cluster_from_name(
+             handle = global_user_state.get_handle_from_cluster_name(
                  self.cluster_name)
-         if cluster_record is None:
+         else:
+             handle = cluster_record['handle']
+         if handle is None:
              return None
-         handle = cluster_record['handle']
          assert isinstance(handle, backends.CloudVmRayResourceHandle)
          return handle

@@ -444,6 +496,12 @@ class ReplicaInfo:
          handle = self.handle()
          if handle is None:
              return None
+         if self.replica_port == '-':
+             # This is a pool replica so there is no endpoint and it's filled
+             # with this dummy value. We return None here so that we can
+             # get the active ready replicas and perform autoscaling. Otherwise,
+             # would error out when trying to get the endpoint.
+             return None
          replica_port_int = int(self.replica_port)
          try:
              endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
@@ -471,7 +529,7 @@ class ReplicaInfo:
                       with_handle: bool,
                       with_url: bool = True) -> Dict[str, Any]:
          cluster_record = global_user_state.get_cluster_from_name(
-             self.cluster_name)
+             self.cluster_name, include_user_info=False, summary_response=True)
          info_dict = {
              'replica_id': self.replica_id,
              'name': self.cluster_name,
@@ -489,8 +547,8 @@
              info_dict['cloud'] = repr(handle.launched_resources.cloud)
              info_dict['region'] = handle.launched_resources.region
              info_dict['resources_str'] = (
-                 resources_utils.get_readable_resources_repr(handle,
-                                                             simplify=True))
+                 resources_utils.get_readable_resources_repr(
+                     handle, simplified_only=True)[0])
          return info_dict

      def __repr__(self) -> str:
@@ -619,8 +677,8 @@
  class ReplicaManager:
      """Each replica manager monitors one service."""

-     def __init__(self, service_name: str,
-                  spec: 'service_spec.SkyServiceSpec') -> None:
+     def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
+                  version: int) -> None:
          self.lock = threading.Lock()
          self._next_replica_id: int = 1
          self._service_name: str = service_name
@@ -636,9 +694,9 @@
                      f'Readiness header keys: {header_keys}')

          # Newest version among the currently provisioned and launched replicas
-         self.latest_version: int = serve_constants.INITIAL_VERSION
+         self.latest_version: int = version
          # Oldest version among the currently provisioned and launched replicas
-         self.least_recent_version: int = serve_constants.INITIAL_VERSION
+         self.least_recent_version: int = version

      def _consecutive_failure_threshold_timeout(self) -> int:
          """The timeout for the consecutive failure threshold in seconds.
@@ -674,8 +732,8 @@ class SkyPilotReplicaManager(ReplicaManager):
      """Replica Manager for SkyPilot clusters.

      It will run three daemon to monitor the status of the replicas:
-     (1) _process_pool_refresher: Refresh the launch/down process pool
-         to monitor the progress of the launch/down process.
+     (1) _thread_pool_refresher: Refresh the launch/down thread pool
+         to monitor the progress of the launch/down thread.
      (2) _job_status_fetcher: Fetch the job status of the service to
          monitor the status of the service jobs.
      (3) _replica_prober: Do readiness probe to the replicas to monitor
@@ -683,24 +741,24 @@
      """

      def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
-                  service_task_yaml_path: str) -> None:
-         super().__init__(service_name, spec)
-         self.service_task_yaml_path = service_task_yaml_path
-         task = task_lib.Task.from_yaml(service_task_yaml_path)
+                  version: int) -> None:
+         super().__init__(service_name, spec, version)
+         self.yaml_content = serve_state.get_yaml_content(service_name, version)
+         task = task_lib.Task.from_yaml_str(self.yaml_content)
          self._spot_placer: Optional[spot_placer.SpotPlacer] = (
              spot_placer.SpotPlacer.from_task(spec, task))
-         # TODO(tian): Store launch/down pid in the replica table, to make the
-         # manager more persistent. Current blocker is that we need to manually
-         # poll the Process (by join or is_launch), otherwise, it will never
-         # finish and become a zombie process. Probably we could use
-         # psutil.Process(p.pid).status() == psutil.STATUS_ZOMBIE to check
-         # such cases.
-         self._launch_process_pool: serve_utils.ThreadSafeDict[
-             int, multiprocessing.Process] = serve_utils.ThreadSafeDict()
-         self._down_process_pool: serve_utils.ThreadSafeDict[
-             int, multiprocessing.Process] = serve_utils.ThreadSafeDict()
-
-         threading.Thread(target=self._process_pool_refresher).start()
+         # TODO(tian): Store launch/down request id in the replica table, to make
+         # the manager more persistent.
+         self._launch_thread_pool: thread_utils.ThreadSafeDict[
+             int, thread_utils.SafeThread] = thread_utils.ThreadSafeDict()
+         self._replica_to_request_id: thread_utils.ThreadSafeDict[
+             int, str] = thread_utils.ThreadSafeDict()
+         self._replica_to_launch_cancelled: thread_utils.ThreadSafeDict[
+             int, bool] = thread_utils.ThreadSafeDict()
+         self._down_thread_pool: thread_utils.ThreadSafeDict[
+             int, thread_utils.SafeThread] = thread_utils.ThreadSafeDict()
+
+         threading.Thread(target=self._thread_pool_refresher).start()
          threading.Thread(target=self._job_status_fetcher).start()
          threading.Thread(target=self._replica_prober).start()

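A `multiprocessing.Process` reported failure through `exitcode`; a plain thread has no equivalent, which is presumably why the new `thread_utils.SafeThread` captures any uncaught exception and surfaces it (as `t.format_exc` in the refresher below), with `ThreadSafeDict` serving as a lock-guarded mapping shared between the manager and its launch/down threads. A minimal sketch of both primitives under those assumptions (not the actual `sky.utils.thread_utils` implementation):

    # Sketch of the two primitives this diff relies on; illustrative only.
    import threading
    import traceback
    from typing import Any, Dict, Generic, Optional, TypeVar

    K = TypeVar('K')
    V = TypeVar('V')

    class SafeThread(threading.Thread):
        # A thread that keeps its uncaught exception as a string,
        # playing the role the process exitcode used to play.
        def __init__(self, *args: Any, **kwargs: Any) -> None:
            super().__init__(*args, **kwargs)
            self.format_exc: Optional[str] = None

        def run(self) -> None:
            try:
                super().run()
            except Exception:  # pylint: disable=broad-except
                self.format_exc = traceback.format_exc()

    class ThreadSafeDict(Generic[K, V]):
        # A dict with a lock held around every operation.
        def __init__(self) -> None:
            self._dict: Dict[K, V] = {}
            self._lock = threading.Lock()

        def __setitem__(self, key: K, value: V) -> None:
            with self._lock:
                self._dict[key] = value

        def __contains__(self, key: K) -> bool:
            with self._lock:
                return key in self._dict

        def get(self, key: K, default: Optional[V] = None) -> Optional[V]:
            with self._lock:
                return self._dict.get(key, default)

        def pop(self, key: K, default: Optional[V] = None) -> Optional[V]:
            with self._lock:
                return self._dict.pop(key, default)

        def items(self):
            with self._lock:
                return list(self._dict.items())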
@@ -710,14 +768,14 @@
      def _recover_replica_operations(self):
          """Let's see are there something to do for ReplicaManager in a
          recovery run"""
-         assert (not self._launch_process_pool and not self._down_process_pool
-                ), 'We should not have any running processes in a recovery run'
+         assert (not self._launch_thread_pool and not self._down_thread_pool
+                ), 'We should not have any running threads in a recovery run'

          # There is a FIFO queue with capacity _MAX_NUM_LAUNCH for
          # _launch_replica.
          # We prioritize PROVISIONING replicas since they were previously
          # launched but may have been interrupted and need to be restarted.
-         # This is why we process PENDING replicas only after PROVISIONING
+         # This is why we handle PENDING replicas only after PROVISIONING
          # replicas.
          to_up_replicas = serve_state.get_replicas_at_status(
              self._service_name, serve_state.ReplicaStatus.PROVISIONING)
@@ -754,8 +812,8 @@
          replica_id: int,
          resources_override: Optional[Dict[str, Any]] = None,
      ) -> None:
-         if replica_id in self._launch_process_pool:
-             logger.warning(f'Launch process for replica {replica_id} '
+         if replica_id in self._launch_thread_pool:
+             logger.warning(f'Launch thread for replica {replica_id} '
                             'already exists. Skipping.')
              return
          logger.info(f'Launching replica {replica_id}...')
@@ -763,8 +821,7 @@
              self._service_name, replica_id)
          log_file_name = serve_utils.generate_replica_launch_log_file_name(
              self._service_name, replica_id)
-         use_spot = _should_use_spot(self.service_task_yaml_path,
-                                     resources_override)
+         use_spot = _should_use_spot(self.yaml_content, resources_override)
          retry_until_up = True
          location = None
          if use_spot and self._spot_placer is not None:
@@ -787,22 +844,21 @@
              location = self._spot_placer.select_next_location(
                  current_spot_locations)
              resources_override.update(location.to_dict())
-         p = multiprocessing.Process(
-             target=ux_utils.RedirectOutputForProcess(
-                 launch_cluster,
-                 log_file_name,
-             ).run,
-             args=(replica_id, self.service_task_yaml_path, cluster_name,
-                   resources_override, retry_until_up),
+         t = thread_utils.SafeThread(
+             target=launch_cluster,
+             args=(replica_id, self.yaml_content, cluster_name, log_file_name,
+                   self._replica_to_request_id,
+                   self._replica_to_launch_cancelled, resources_override,
+                   retry_until_up),
          )
-         replica_port = _get_resources_ports(self.service_task_yaml_path)
+         replica_port = _get_resources_ports(self.yaml_content)

          info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
                             location, self.latest_version, resources_override)
          serve_state.add_or_update_replica(self._service_name, replica_id, info)
-         # Don't start right now; we will start it later in _refresh_process_pool
+         # Don't start right now; we will start it later in _refresh_thread_pool
          # to avoid too many sky.launch running at the same time.
-         self._launch_process_pool[replica_id] = p
+         self._launch_thread_pool[replica_id] = t

      @with_lock
      def scale_up(self,
@@ -810,10 +866,11 @@
          self._launch_replica(self._next_replica_id, resources_override)
          self._next_replica_id += 1

-     def _handle_sky_down_finish(self, info: ReplicaInfo, exitcode: int) -> None:
-         if exitcode != 0:
-             logger.error(f'Down process for replica {info.replica_id} '
-                          f'exited abnormally with code {exitcode}.')
+     def _handle_sky_down_finish(self, info: ReplicaInfo,
+                                 format_exc: Optional[str]) -> None:
+         if format_exc is not None:
+             logger.error(f'Down thread for replica {info.replica_id} '
+                          f'exited abnormally with exception {format_exc}.')
              info.status_property.sky_down_status = (
                  common_utils.ProcessStatus.FAILED)
          else:
@@ -872,7 +929,7 @@
                  'the logs should always be synced down. '
                  'So that the user can see the logs to debug.')

-         if replica_id in self._launch_process_pool:
+         if replica_id in self._launch_thread_pool:
              info = serve_state.get_replica_info_from_id(self._service_name,
                                                          replica_id)
              assert info is not None
@@ -880,17 +937,47 @@
                  common_utils.ProcessStatus.INTERRUPTED)
              serve_state.add_or_update_replica(self._service_name, replica_id,
                                                info)
-             launch_process = self._launch_process_pool[replica_id]
-             if launch_process.is_alive():
-                 assert launch_process.pid is not None
-                 launch_process.terminate()
-                 launch_process.join()
-                 logger.info(f'Interrupted launch process for replica {replica_id} '
-                             'and deleted the cluster.')
-             del self._launch_process_pool[replica_id]
-
-         if replica_id in self._down_process_pool:
-             logger.warning(f'Terminate process for replica {replica_id} '
+             launch_thread = self._launch_thread_pool[replica_id]
+             if launch_thread.is_alive():
+                 self._replica_to_launch_cancelled[replica_id] = True
+                 start_wait_time = time.time()
+                 timeout_reached = False
+                 while True:
+                     # Launch request id found. cancel it.
+                     if replica_id in self._replica_to_request_id:
+                         request_id = self._replica_to_request_id[replica_id]
+                         sdk.api_cancel(request_id)
+                         break
+                     if replica_id not in self._replica_to_launch_cancelled:
+                         # Indicates that the cancellation was received.
+                         break
+                     if not launch_thread.is_alive():
+                         # It's possible that the launch thread immediately
+                         # finished after we check. Exit the loop now.
+                         break
+                     if (time.time() - start_wait_time >
+                             _WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS):
+                         timeout_reached = True
+                         break
+                     time.sleep(0.1)
+                 if timeout_reached:
+                     logger.warning(
+                         'Failed to cancel launch request for replica '
+                         f'{replica_id} after '
+                         f'{_WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS} seconds. '
+                         'Force waiting the launch thread to finish.')
+                 else:
+                     logger.info('Interrupted launch thread for replica '
+                                 f'{replica_id} and deleted the cluster.')
+                 launch_thread.join()
+             else:
+                 logger.info(f'Launch thread for replica {replica_id} '
+                             'already finished. Delete the cluster now.')
+             self._launch_thread_pool.pop(replica_id)
+             self._replica_to_request_id.pop(replica_id)
+
+         if replica_id in self._down_thread_pool:
+             logger.warning(f'Terminate thread for replica {replica_id} '
                             'already exists. Skipping.')
              return

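The interruption path above changes character: a subprocess could simply be killed with `Process.terminate()`, but threads cannot be killed, so cancellation becomes a cooperative handshake. The controller sets `replica_to_launch_cancelled[replica_id]`, then waits up to `_WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS` for one of three outcomes: a request id shows up (cancel it server-side with `sdk.api_cancel`), the launch thread pops the flag (acknowledged), or the thread exits on its own; if none happens in time, all it can do is join. The handshake in isolation (condensed, hypothetical sketch):

    # Condensed sketch of the cancel handshake; names are illustrative.
    import time

    cancelled = {}    # replica_id -> True, set by the controller
    request_ids = {}  # replica_id -> request id, set by the launch thread

    def cancel_replica(replica_id, thread, timeout=15.0):
        cancelled[replica_id] = True
        deadline = time.time() + timeout
        while time.time() < deadline:
            if replica_id in request_ids:
                # Launch already submitted: cancel server-side instead
                # (sdk.api_cancel(request_id) in the diff) and stop waiting.
                break
            if replica_id not in cancelled:
                break  # Worker popped the flag: cancellation acknowledged.
            if not thread.is_alive():
                break  # Worker exited on its own.
            time.sleep(0.1)
        # A thread cannot be force-killed; the last resort is to join.
        thread.join()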
@@ -955,22 +1042,22 @@
          # If the cluster does not exist, it means either the cluster never
          # exists (e.g., the cluster is scaled down before it gets a chance to
          # provision) or the cluster is preempted and cleaned up by the status
-         # refresh. In this case, we skip spawning a new down process to save
+         # refresh. In this case, we skip spawning a new down thread to save
          # controller resources.
-         if global_user_state.get_cluster_from_name(info.cluster_name) is None:
-             self._handle_sky_down_finish(info, exitcode=0)
+         if not global_user_state.cluster_with_name_exists(info.cluster_name):
+             self._handle_sky_down_finish(info, format_exc=None)
              return

-         # Otherwise, start the process to terminate the cluster.
-         p = multiprocessing.Process(
-             target=ux_utils.RedirectOutputForProcess(terminate_cluster,
-                                                      log_file_name, 'a').run,
-             args=(info.cluster_name, replica_drain_delay_seconds),
+         # Otherwise, start the thread to terminate the cluster.
+         t = thread_utils.SafeThread(
+             target=terminate_cluster,
+             args=(info.cluster_name, log_file_name,
+                   replica_drain_delay_seconds),
          )
          info.status_property.sky_down_status = (
              common_utils.ProcessStatus.SCHEDULED)
          serve_state.add_or_update_replica(self._service_name, replica_id, info)
-         self._down_process_pool[replica_id] = p
+         self._down_thread_pool[replica_id] = t

      @with_lock
      def scale_down(self, replica_id: int, purge: bool = False) -> None:
@@ -1035,55 +1122,54 @@
      #################################

      @with_lock
-     def _refresh_process_pool(self) -> None:
-         """Refresh the launch/down process pool.
+     def _refresh_thread_pool(self) -> None:
+         """Refresh the launch/down thread pool.

-         This function will checks all sky.launch and sky.down process on
+         This function will checks all sky.launch and sky.down thread on
          the fly. If any of them finished, it will update the status of the
          corresponding replica.
          """
          # To avoid `dictionary changed size during iteration` error.
-         launch_process_pool_snapshot = list(self._launch_process_pool.items())
-         for replica_id, p in launch_process_pool_snapshot:
-             if p.is_alive():
+         launch_thread_pool_snapshot = list(self._launch_thread_pool.items())
+         for replica_id, t in launch_thread_pool_snapshot:
+             if t.is_alive():
                  continue
              with filelock.FileLock(controller_utils.get_resources_lock_path()):
                  info = serve_state.get_replica_info_from_id(
                      self._service_name, replica_id)
                  assert info is not None, replica_id
                  error_in_sky_launch = False
-                 schedule_next_jobs = False
                  if info.status == serve_state.ReplicaStatus.PENDING:
                      # sky.launch not started yet
-                     if controller_utils.can_provision():
-                         p.start()
+                     if controller_utils.can_provision(self._is_pool):
+                         t.start()
                          info.status_property.sky_launch_status = (
                              common_utils.ProcessStatus.RUNNING)
                  else:
                      # sky.launch finished
-                     # TODO(tian): Try-catch in process, and have an enum return
+                     # TODO(tian): Try-catch in thread, and have an enum return
                      # value to indicate which type of failure happened.
                      # Currently we only have user code failure since the
                      # retry_until_up flag is set to True, but it will be helpful
                      # when we enable user choose whether to retry or not.
                      logger.info(
-                         f'Launch process for replica {replica_id} finished.')
-                     del self._launch_process_pool[replica_id]
-                     if p.exitcode != 0:
+                         f'Launch thread for replica {replica_id} finished.')
+                     self._launch_thread_pool.pop(replica_id)
+                     self._replica_to_request_id.pop(replica_id)
+                     if t.format_exc is not None:
                          logger.warning(
-                             f'Launch process for replica {replica_id} '
-                             f'exited abnormally with code {p.exitcode}.'
-                             ' Terminating...')
+                             f'Launch thread for replica {replica_id} '
+                             f'exited abnormally with exception '
+                             f'{t.format_exc}. Terminating...')
                          info.status_property.sky_launch_status = (
                              common_utils.ProcessStatus.FAILED)
                          error_in_sky_launch = True
                      else:
                          info.status_property.sky_launch_status = (
                              common_utils.ProcessStatus.SUCCEEDED)
-                         schedule_next_jobs = True
                      if self._spot_placer is not None and info.is_spot:
                          # TODO(tian): Currently, we set the location to
-                         # preemptive if the launch process failed. This is
+                         # preemptive if the launch thread failed. This is
                          # because if the error is not related to the
                          # availability of the location, then all locations
                          # should failed for same reason. So it does not matter
@@ -1093,26 +1179,22 @@ class SkyPilotReplicaManager(ReplicaManager):
                          # availability of the location later.
                          location = info.get_spot_location()
                          assert location is not None
-                         if p.exitcode != 0:
+                         if t.format_exc is not None:
                              self._spot_placer.set_preemptive(location)
                              info.status_property.failed_spot_availability = True
                          else:
                              self._spot_placer.set_active(location)
                  serve_state.add_or_update_replica(self._service_name,
                                                    replica_id, info)
-                 if schedule_next_jobs and self._is_pool:
-                     jobs_scheduler.maybe_schedule_next_jobs()
                  if error_in_sky_launch:
                      # Teardown after update replica info since
                      # _terminate_replica will update the replica info too.
                      self._terminate_replica(replica_id,
                                              sync_down_logs=True,
                                              replica_drain_delay_seconds=0)
-             # Try schedule next job after acquiring the lock.
-             jobs_scheduler.maybe_schedule_next_jobs()
-         down_process_pool_snapshot = list(self._down_process_pool.items())
-         for replica_id, p in down_process_pool_snapshot:
-             if p.is_alive():
+         down_thread_pool_snapshot = list(self._down_thread_pool.items())
+         for replica_id, t in down_thread_pool_snapshot:
+             if t.is_alive():
                  continue
              info = serve_state.get_replica_info_from_id(self._service_name,
                                                          replica_id)
@@ -1120,17 +1202,17 @@
              if (info.status_property.sky_down_status ==
                      common_utils.ProcessStatus.SCHEDULED):
                  # sky.down not started yet
-                 if controller_utils.can_terminate():
-                     p.start()
+                 if controller_utils.can_terminate(self._is_pool):
+                     t.start()
                      info.status_property.sky_down_status = (
                          common_utils.ProcessStatus.RUNNING)
                      serve_state.add_or_update_replica(self._service_name,
                                                        replica_id, info)
              else:
                  logger.info(
-                     f'Terminate process for replica {replica_id} finished.')
-                 del self._down_process_pool[replica_id]
-                 self._handle_sky_down_finish(info, exitcode=p.exitcode)
+                     f'Terminate thread for replica {replica_id} finished.')
+                 self._down_thread_pool.pop(replica_id)
+                 self._handle_sky_down_finish(info, format_exc=t.format_exc)

          # Clean old version
          replica_infos = serve_state.get_replica_infos(self._service_name)
@@ -1140,25 +1222,25 @@
          if self.least_recent_version < current_least_recent_version:
              for version in range(self.least_recent_version,
                                   current_least_recent_version):
-                 task_yaml = serve_utils.generate_task_yaml_file_name(
+                 yaml_content = serve_utils.get_yaml_content(
                      self._service_name, version)
                  # Delete old version metadata.
                  serve_state.delete_version(self._service_name, version)
                  # Delete storage buckets of older versions.
-                 service.cleanup_storage(task_yaml)
+                 service.cleanup_storage(yaml_content)
              # newest version will be cleaned in serve down
              self.least_recent_version = current_least_recent_version

-     def _process_pool_refresher(self) -> None:
-         """Periodically refresh the launch/down process pool."""
+     def _thread_pool_refresher(self) -> None:
+         """Periodically refresh the launch/down thread pool."""
          while True:
-             logger.debug('Refreshing process pool.')
+             logger.debug('Refreshing thread pool.')
              try:
-                 self._refresh_process_pool()
+                 self._refresh_thread_pool()
              except Exception as e:  # pylint: disable=broad-except
                  # No matter what error happens, we should keep the
-                 # process pool refresher running.
-                 logger.error('Error in process pool refresher: '
+                 # thread pool refresher running.
+                 logger.error('Error in thread pool refresher: '
                               f'{common_utils.format_exception(e)}')
                  with ux_utils.enable_traceback():
                      logger.error(f' Traceback: {traceback.format_exc()}')
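
Beyond the process-to-thread renames, `_thread_pool_refresher` keeps the usual resilient-daemon shape: everything inside the loop is caught so that one failed refresh cannot kill the monitoring thread. The skeleton, generically (a sketch with a caller-supplied `refresh_once`):

    # Sketch: a refresher daemon that outlives any single failure.
    import logging
    import time
    import traceback

    logger = logging.getLogger(__name__)
    REFRESH_INTERVAL_SECONDS = 20  # cf. _PROCESS_POOL_REFRESH_INTERVAL

    def run_refresher(refresh_once) -> None:
        while True:
            try:
                refresh_once()
            except Exception:  # pylint: disable=broad-except
                # Log and continue; the daemon must keep running.
                logger.error('Error in refresher:\n%s', traceback.format_exc())
            time.sleep(REFRESH_INTERVAL_SECONDS)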
@@ -1386,11 +1468,9 @@
              logger.error(f'Invalid version: {version}, '
                           f'latest version: {self.latest_version}')
              return
-         service_task_yaml_path = serve_utils.generate_task_yaml_file_name(
-             self._service_name, version)
-         serve_state.add_or_update_version(self._service_name, version, spec)
+         yaml_content = serve_state.get_yaml_content(self._service_name, version)
          self.latest_version = version
-         self.service_task_yaml_path = service_task_yaml_path
+         self.yaml_content = yaml_content
          self._update_mode = update_mode

          # Reuse all replicas that have the same config as the new version
@@ -1398,8 +1478,7 @@
          # the latest version. This can significantly improve the speed
          # for updating an existing service with only config changes to the
          # service specs, e.g. scale down the service.
-         new_config = yaml_utils.read_yaml(
-             os.path.expanduser(service_task_yaml_path))
+         new_config = yaml_utils.safe_load(yaml_content)
          # Always create new replicas and scale down old ones when file_mounts
          # are not empty.
          if new_config.get('file_mounts', None) != {}:
@@ -1412,11 +1491,9 @@
          for info in replica_infos:
              if info.version < version and not info.is_terminal:
                  # Assume user does not change the yaml file on the controller.
-                 old_service_task_yaml_path = (
-                     serve_utils.generate_task_yaml_file_name(
-                         self._service_name, info.version))
-                 old_config = yaml_utils.read_yaml(
-                     os.path.expanduser(old_service_task_yaml_path))
+                 old_yaml_content = serve_state.get_yaml_content(
+                     self._service_name, info.version)
+                 old_config = yaml_utils.safe_load(old_yaml_content)
                  for key in ['service', 'pool', '_user_specified_yaml']:
                      old_config.pop(key, None)
                  # Bump replica version if all fields except for service are