skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546):
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -10,6 +10,7 @@ from sky.server import common as server_common
10
10
  from sky.server import stream_utils
11
11
  from sky.server.requests import executor
12
12
  from sky.server.requests import payloads
13
+ from sky.server.requests import request_names
13
14
  from sky.server.requests import requests as api_requests
14
15
  from sky.skylet import constants
15
16
  from sky.utils import common
@@ -23,9 +24,9 @@ async def up(
23
24
  request: fastapi.Request,
24
25
  up_body: payloads.ServeUpBody,
25
26
  ) -> None:
26
- executor.schedule_request(
27
+ await executor.schedule_request_async(
27
28
  request_id=request.state.request_id,
28
- request_name='serve.up',
29
+ request_name=request_names.RequestName.SERVE_UP,
29
30
  request_body=up_body,
30
31
  func=core.up,
31
32
  schedule_type=api_requests.ScheduleType.LONG,
@@ -38,9 +39,9 @@ async def update(
38
39
  request: fastapi.Request,
39
40
  update_body: payloads.ServeUpdateBody,
40
41
  ) -> None:
41
- executor.schedule_request(
42
+ await executor.schedule_request_async(
42
43
  request_id=request.state.request_id,
43
- request_name='serve.update',
44
+ request_name=request_names.RequestName.SERVE_UPDATE,
44
45
  request_body=update_body,
45
46
  func=core.update,
46
47
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -53,9 +54,9 @@ async def down(
53
54
  request: fastapi.Request,
54
55
  down_body: payloads.ServeDownBody,
55
56
  ) -> None:
56
- executor.schedule_request(
57
+ await executor.schedule_request_async(
57
58
  request_id=request.state.request_id,
58
- request_name='serve.down',
59
+ request_name=request_names.RequestName.SERVE_DOWN,
59
60
  request_body=down_body,
60
61
  func=core.down,
61
62
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -68,9 +69,9 @@ async def terminate_replica(
68
69
  request: fastapi.Request,
69
70
  terminate_replica_body: payloads.ServeTerminateReplicaBody,
70
71
  ) -> None:
71
- executor.schedule_request(
72
+ await executor.schedule_request_async(
72
73
  request_id=request.state.request_id,
73
- request_name='serve.terminate_replica',
74
+ request_name=request_names.RequestName.SERVE_TERMINATE_REPLICA,
74
75
  request_body=terminate_replica_body,
75
76
  func=core.terminate_replica,
76
77
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -83,9 +84,9 @@ async def status(
83
84
  request: fastapi.Request,
84
85
  status_body: payloads.ServeStatusBody,
85
86
  ) -> None:
86
- executor.schedule_request(
87
+ await executor.schedule_request_async(
87
88
  request_id=request.state.request_id,
88
- request_name='serve.status',
89
+ request_name=request_names.RequestName.SERVE_STATUS,
89
90
  request_body=status_body,
90
91
  func=core.status,
91
92
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -98,21 +99,23 @@ async def tail_logs(
98
99
  request: fastapi.Request, log_body: payloads.ServeLogsBody,
99
100
  background_tasks: fastapi.BackgroundTasks
100
101
  ) -> fastapi.responses.StreamingResponse:
101
- executor.schedule_request(
102
+ executor.check_request_thread_executor_available()
103
+ request_task = await executor.prepare_request_async(
102
104
  request_id=request.state.request_id,
103
- request_name='serve.logs',
105
+ request_name=request_names.RequestName.SERVE_LOGS,
104
106
  request_body=log_body,
105
107
  func=core.tail_logs,
106
108
  schedule_type=api_requests.ScheduleType.SHORT,
107
109
  request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
108
110
  )
109
-
110
- request_task = api_requests.get_request(request.state.request_id)
111
-
112
- return stream_utils.stream_response(
111
+ task = executor.execute_request_in_coroutine(request_task)
112
+ # Cancel the coroutine after the request is done or client disconnects
113
+ background_tasks.add_task(task.cancel)
114
+ return stream_utils.stream_response_for_long_request(
113
115
  request_id=request_task.request_id,
114
116
  logs_path=request_task.log_path,
115
117
  background_tasks=background_tasks,
118
+ kill_request_on_disconnect=False,
116
119
  )
117
120
 
118
121
 
@@ -130,9 +133,9 @@ async def download_logs(
130
133
  # We should reuse the original request body, so that the env vars, such as
131
134
  # user hash, are kept the same.
132
135
  download_logs_body.local_dir = str(logs_dir_on_api_server)
133
- executor.schedule_request(
136
+ await executor.schedule_request_async(
134
137
  request_id=request.state.request_id,
135
- request_name='serve.sync_down_logs',
138
+ request_name=request_names.RequestName.SERVE_SYNC_DOWN_LOGS,
136
139
  request_body=download_logs_body,
137
140
  func=core.sync_down_logs,
138
141
  schedule_type=api_requests.ScheduleType.SHORT,
sky/serve/service.py CHANGED
@@ -13,12 +13,13 @@ from typing import Dict
13
13
 
14
14
  import filelock
15
15
 
16
- from sky import authentication
17
16
  from sky import exceptions
17
+ from sky import global_user_state
18
18
  from sky import sky_logging
19
19
  from sky import task as task_lib
20
20
  from sky.backends import backend_utils
21
21
  from sky.backends import cloud_vm_ray_backend
22
+ from sky.data import data_utils
22
23
  from sky.serve import constants
23
24
  from sky.serve import controller
24
25
  from sky.serve import load_balancer
@@ -26,8 +27,11 @@ from sky.serve import replica_managers
26
27
  from sky.serve import serve_state
27
28
  from sky.serve import serve_utils
28
29
  from sky.skylet import constants as skylet_constants
30
+ from sky.utils import auth_utils
29
31
  from sky.utils import common_utils
32
+ from sky.utils import controller_utils
30
33
  from sky.utils import subprocess_utils
34
+ from sky.utils import thread_utils
31
35
  from sky.utils import ux_utils
32
36
 
33
37
  # Use the explicit logger name so that the logger is under the
@@ -62,17 +66,19 @@ def _handle_signal(service_name: str) -> None:
62
66
  raise error_type(f'User signal received: {user_signal.value}')
63
67
 
64
68
 
65
- def cleanup_storage(task_yaml: str) -> bool:
69
+ def cleanup_storage(yaml_content: str) -> bool:
66
70
  """Clean up the storage for the service.
67
71
 
68
72
  Args:
69
- task_yaml: The task yaml file.
73
+ yaml_content: The yaml content of the service.
70
74
 
71
75
  Returns:
72
76
  True if the storage is cleaned up successfully, False otherwise.
73
77
  """
78
+ failed = False
79
+
74
80
  try:
75
- task = task_lib.Task.from_yaml(task_yaml)
81
+ task = task_lib.Task.from_yaml_str(yaml_content)
76
82
  backend = cloud_vm_ray_backend.CloudVmRayBackend()
77
83
  # Need to re-construct storage object in the controller process
78
84
  # because when SkyPilot API server machine sends the yaml config to the
@@ -86,54 +92,125 @@ def cleanup_storage(task_yaml: str) -> bool:
86
92
  f'{common_utils.format_exception(e)}')
87
93
  with ux_utils.enable_traceback():
88
94
  logger.error(f' Traceback: {traceback.format_exc()}')
89
- return False
90
- return True
95
+ failed = True
96
+
97
+ # Clean up any files mounted from the local disk, such as two-hop file
98
+ # mounts.
99
+ for file_mount in (task.file_mounts or {}).values():
100
+ try:
101
+ if not data_utils.is_cloud_store_url(file_mount):
102
+ path = os.path.expanduser(file_mount)
103
+ if os.path.isdir(path):
104
+ shutil.rmtree(path)
105
+ else:
106
+ os.remove(path)
107
+ except Exception as e: # pylint: disable=broad-except
108
+ logger.error(f'Failed to clean up file mount {file_mount}: {e}')
109
+ with ux_utils.enable_traceback():
110
+ logger.error(f' Traceback: {traceback.format_exc()}')
111
+ failed = True
112
+
113
+ return not failed
91
114
 
92
115
 
93
- def _cleanup(service_name: str) -> bool:
116
+ # NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
117
+ # because we killed all the processes (controller & replica manager) before
118
+ # calling this function.
119
+ def _cleanup(service_name: str, pool: bool) -> bool:
94
120
  """Clean up all service related resources, i.e. replicas and storage."""
121
+ # Cleanup the HA recovery script first as it is possible that some error
122
+ # was raised when we construct the task object (e.g.,
123
+ # sky.exceptions.ResourcesUnavailableError).
124
+ serve_state.remove_ha_recovery_script(service_name)
95
125
  failed = False
96
126
  replica_infos = serve_state.get_replica_infos(service_name)
97
- info2proc: Dict[replica_managers.ReplicaInfo,
98
- multiprocessing.Process] = dict()
127
+ info2thr: Dict[replica_managers.ReplicaInfo,
128
+ thread_utils.SafeThread] = dict()
129
+ # NOTE(dev): This relies on `sky/serve/serve_utils.py::
130
+ # generate_replica_cluster_name`. Change it if you change the function.
131
+ existing_cluster_names = global_user_state.get_cluster_names_start_with(
132
+ service_name)
99
133
  for info in replica_infos:
100
- p = multiprocessing.Process(target=replica_managers.terminate_cluster,
101
- args=(info.cluster_name,))
102
- p.start()
103
- info2proc[info] = p
134
+ if info.cluster_name not in existing_cluster_names:
135
+ logger.info(f'Cluster {info.cluster_name} for replica '
136
+ f'{info.replica_id} not found. Might be a failed '
137
+ 'cluster. Skipping.')
138
+ continue
139
+
140
+ log_file_name = serve_utils.generate_replica_log_file_name(
141
+ service_name, info.replica_id)
142
+ t = thread_utils.SafeThread(target=replica_managers.terminate_cluster,
143
+ args=(info.cluster_name, log_file_name))
144
+ info2thr[info] = t
104
145
  # Set replica status to `SHUTTING_DOWN`
105
146
  info.status_property.sky_launch_status = (
106
- replica_managers.ProcessStatus.SUCCEEDED)
147
+ replica_managers.common_utils.ProcessStatus.SUCCEEDED)
107
148
  info.status_property.sky_down_status = (
108
- replica_managers.ProcessStatus.RUNNING)
149
+ replica_managers.common_utils.ProcessStatus.SCHEDULED)
109
150
  serve_state.add_or_update_replica(service_name, info.replica_id, info)
110
- logger.info(f'Terminating replica {info.replica_id} ...')
111
- for info, p in info2proc.items():
112
- p.join()
113
- if p.exitcode == 0:
114
- serve_state.remove_replica(service_name, info.replica_id)
115
- logger.info(f'Replica {info.replica_id} terminated successfully.')
116
- else:
117
- # Set replica status to `FAILED_CLEANUP`
118
- info.status_property.sky_down_status = (
119
- replica_managers.ProcessStatus.FAILED)
120
- serve_state.add_or_update_replica(service_name, info.replica_id,
121
- info)
122
- failed = True
123
- logger.error(f'Replica {info.replica_id} failed to terminate.')
124
- versions = serve_state.get_service_versions(service_name)
125
- serve_state.remove_service_versions(service_name)
151
+ logger.info(f'Scheduling to terminate replica {info.replica_id} ...')
152
+
153
+ def _set_to_failed_cleanup(info: replica_managers.ReplicaInfo) -> None:
154
+ nonlocal failed
155
+ # Set replica status to `FAILED_CLEANUP`
156
+ info.status_property.sky_down_status = (
157
+ replica_managers.common_utils.ProcessStatus.FAILED)
158
+ serve_state.add_or_update_replica(service_name, info.replica_id, info)
159
+ failed = True
160
+ logger.error(f'Replica {info.replica_id} failed to terminate.')
161
+
162
+ # Please refer to sky/serve/replica_managers.py::_refresh_process_pool.
163
+ # TODO(tian): Refactor to use the same logic and code.
164
+ while info2thr:
165
+ snapshot = list(info2thr.items())
166
+ for info, t in snapshot:
167
+ if t.is_alive():
168
+ continue
169
+ if (info.status_property.sky_down_status ==
170
+ replica_managers.common_utils.ProcessStatus.SCHEDULED):
171
+ if controller_utils.can_terminate(pool):
172
+ try:
173
+ t.start()
174
+ except Exception as e: # pylint: disable=broad-except
175
+ _set_to_failed_cleanup(info)
176
+ logger.error(f'Failed to start thread for replica '
177
+ f'{info.replica_id}: {e}')
178
+ del info2thr[info]
179
+ else:
180
+ info.status_property.sky_down_status = (
181
+ common_utils.ProcessStatus.RUNNING)
182
+ serve_state.add_or_update_replica(
183
+ service_name, info.replica_id, info)
184
+ else:
185
+ logger.info('Terminate thread for replica '
186
+ f'{info.replica_id} finished.')
187
+ t.join()
188
+ del info2thr[info]
189
+ if t.format_exc is None:
190
+ serve_state.remove_replica(service_name, info.replica_id)
191
+ logger.info(
192
+ f'Replica {info.replica_id} terminated successfully.')
193
+ else:
194
+ _set_to_failed_cleanup(info)
195
+ time.sleep(3)
126
196
 
127
197
  def cleanup_version_storage(version: int) -> bool:
128
- task_yaml: str = serve_utils.generate_task_yaml_file_name(
129
- service_name, version)
198
+ yaml_content = serve_state.get_yaml_content(service_name, version)
199
+ if yaml_content is None:
200
+ logger.warning(f'No yaml content found for version {version}')
201
+ return True
130
202
  logger.info(f'Cleaning up storage for version {version}, '
131
- f'task_yaml: {task_yaml}')
132
- return cleanup_storage(task_yaml)
203
+ f'yaml_content: {yaml_content}')
204
+ return cleanup_storage(yaml_content)
133
205
 
206
+ versions = serve_state.get_service_versions(service_name)
134
207
  if not all(map(cleanup_version_storage, versions)):
135
208
  failed = True
136
209
 
210
+ # Cleanup version metadata after all storages are cleaned up, otherwise
211
+ # the get_yaml_content will return None as all versions are deleted.
212
+ serve_state.delete_all_versions(service_name)
213
+
137
214
  return failed
138
215
 
139
216
 
@@ -152,73 +229,79 @@ def _cleanup_task_run_script(job_id: int) -> None:
152
229
  logger.warning(f'Task run script {this_task_run_script} not found')
153
230
 
154
231
 
155
- def _start(service_name: str, tmp_task_yaml: str, job_id: int):
232
+ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
156
233
  """Starts the service.
157
234
  This includes the controller and load balancer.
158
235
  """
159
236
  # Generate ssh key pair to avoid race condition when multiple sky.launch
160
237
  # are executed at the same time.
161
- authentication.get_or_generate_keys()
238
+ auth_utils.get_or_generate_keys()
162
239
 
163
- # Initialize database record for the service.
164
- task = task_lib.Task.from_yaml(tmp_task_yaml)
165
- # Already checked before submit to controller.
166
- assert task.service is not None, task
167
- service_spec = task.service
168
-
169
- def is_recovery_mode(service_name: str) -> bool:
170
- """Check if service exists in database to determine recovery mode.
171
- """
172
- service = serve_state.get_service_from_name(service_name)
173
- return service is not None
174
-
175
- is_recovery = is_recovery_mode(service_name)
240
+ service = serve_state.get_service_from_name(service_name)
241
+ is_recovery = service is not None
176
242
  logger.info(f'It is a {"first" if not is_recovery else "recovery"} run')
177
243
 
244
+ def _read_yaml_content(yaml_path: str) -> str:
245
+ with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
246
+ return f.read()
247
+
178
248
  if is_recovery:
179
- version = serve_state.get_latest_version(service_name)
180
- if version is None:
181
- raise ValueError(f'No version found for service {service_name}')
249
+ yaml_content = service['yaml_content']
250
+ # Backward compatibility for old service records that
251
+ # do not dump the yaml content to the version database.
252
+ # TODO(tian): Remove this after 2 minor releases, i.e. 0.13.0.
253
+ if yaml_content is None:
254
+ yaml_content = _read_yaml_content(tmp_task_yaml)
182
255
  else:
183
- version = constants.INITIAL_VERSION
184
- # Add initial version information to the service state.
185
- serve_state.add_or_update_version(service_name, version, service_spec)
256
+ yaml_content = _read_yaml_content(tmp_task_yaml)
257
+
258
+ # Initialize database record for the service.
259
+ task = task_lib.Task.from_yaml_str(yaml_content)
260
+ # Already checked before submit to controller.
261
+ assert task.service is not None, task
262
+ service_spec = task.service
186
263
 
187
264
  service_dir = os.path.expanduser(
188
265
  serve_utils.generate_remote_service_dir_name(service_name))
189
- task_yaml = serve_utils.generate_task_yaml_file_name(service_name, version)
190
266
 
191
267
  if not is_recovery:
192
- if (len(serve_state.get_services()) >=
193
- serve_utils.get_num_service_threshold()):
194
- cleanup_storage(tmp_task_yaml)
195
- with ux_utils.print_exception_no_traceback():
196
- raise RuntimeError('Max number of services reached.')
197
- success = serve_state.add_service(
198
- service_name,
199
- controller_job_id=job_id,
200
- policy=service_spec.autoscaling_policy_str(),
201
- requested_resources_str=backend_utils.get_task_resources_str(task),
202
- load_balancing_policy=service_spec.load_balancing_policy,
203
- status=serve_state.ServiceStatus.CONTROLLER_INIT,
204
- tls_encrypted=service_spec.tls_credential is not None)
268
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
269
+ if not controller_utils.can_start_new_process(task.service.pool):
270
+ cleanup_storage(yaml_content)
271
+ with ux_utils.print_exception_no_traceback():
272
+ raise RuntimeError(
273
+ constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
274
+ success = serve_state.add_service(
275
+ service_name,
276
+ controller_job_id=job_id,
277
+ policy=service_spec.autoscaling_policy_str(),
278
+ requested_resources_str=backend_utils.get_task_resources_str(
279
+ task),
280
+ load_balancing_policy=service_spec.load_balancing_policy,
281
+ status=serve_state.ServiceStatus.CONTROLLER_INIT,
282
+ tls_encrypted=service_spec.tls_credential is not None,
283
+ pool=service_spec.pool,
284
+ controller_pid=os.getpid(),
285
+ entrypoint=entrypoint)
205
286
  # Directly throw an error here. See sky/serve/api.py::up
206
287
  # for more details.
207
288
  if not success:
208
- cleanup_storage(tmp_task_yaml)
289
+ cleanup_storage(yaml_content)
209
290
  with ux_utils.print_exception_no_traceback():
210
291
  raise ValueError(f'Service {service_name} already exists.')
211
292
 
212
293
  # Create the service working directory.
213
294
  os.makedirs(service_dir, exist_ok=True)
214
295
 
215
- # Copy the tmp task yaml file to the final task yaml file.
216
- # This is for the service name conflict case. The _execute will
217
- # sync file mounts first and then realized a name conflict. We
218
- # don't want the new file mounts to overwrite the old one, so we
219
- # sync to a tmp file first and then copy it to the final name
220
- # if there is no name conflict.
221
- shutil.copy(tmp_task_yaml, task_yaml)
296
+ version = constants.INITIAL_VERSION
297
+ # Add initial version information to the service state.
298
+ serve_state.add_or_update_version(service_name, version, service_spec,
299
+ yaml_content)
300
+ else:
301
+ version = serve_state.get_latest_version(service_name)
302
+ if version is None:
303
+ raise ValueError(f'No version found for service {service_name}')
304
+ serve_state.update_service_controller_pid(service_name, os.getpid())
222
305
 
223
306
  controller_process = None
224
307
  load_balancer_process = None
@@ -249,7 +332,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
249
332
  controller_host = _get_controller_host()
250
333
  controller_process = multiprocessing.Process(
251
334
  target=controller.run_controller,
252
- args=(service_name, service_spec, task_yaml, controller_host,
335
+ args=(service_name, service_spec, version, controller_host,
253
336
  controller_port))
254
337
  controller_process.start()
255
338
 
@@ -271,14 +354,18 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
271
354
  # TODO(tian): Probably we could enable multiple ports specified in
272
355
  # service spec and we could start multiple load balancers.
273
356
  # After that, we will have a mapping from replica port to endpoint.
274
- load_balancer_process = multiprocessing.Process(
275
- target=ux_utils.RedirectOutputForProcess(
276
- load_balancer.run_load_balancer,
277
- load_balancer_log_file).run,
278
- args=(controller_addr, load_balancer_port,
279
- service_spec.load_balancing_policy,
280
- service_spec.tls_credential))
281
- load_balancer_process.start()
357
+ # NOTE(tian): We don't need the load balancer for pool.
358
+ # Skip the load balancer process for pool.
359
+ if not service_spec.pool:
360
+ load_balancer_process = multiprocessing.Process(
361
+ target=ux_utils.RedirectOutputForProcess(
362
+ load_balancer.run_load_balancer,
363
+ load_balancer_log_file).run,
364
+ args=(controller_addr, load_balancer_port,
365
+ service_spec.load_balancing_policy,
366
+ service_spec.tls_credential,
367
+ service_spec.target_qps_per_replica))
368
+ load_balancer_process.start()
282
369
 
283
370
  if not is_recovery:
284
371
  serve_state.set_service_load_balancer_port(
@@ -303,7 +390,19 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
303
390
  for process in process_to_kill:
304
391
  process.join()
305
392
 
306
- failed = _cleanup(service_name)
393
+ # Catch any exception here so that it does not kill the service monitoring
394
+ # process. If that happened, the service would not only fail to clean
395
+ # up, but also could not be terminated in the future, as no process
396
+ # would handle the user signal anymore. So we catch any error
397
+ # and set the status to FAILED_CLEANUP instead.
398
+ try:
399
+ failed = _cleanup(service_name, service_spec.pool)
400
+ except Exception as e: # pylint: disable=broad-except
401
+ logger.error(f'Failed to clean up service {service_name}: {e}')
402
+ with ux_utils.enable_traceback():
403
+ logger.error(f' Traceback: {traceback.format_exc()}')
404
+ failed = True
405
+
307
406
  if failed:
308
407
  serve_state.set_service_status_and_active_versions(
309
408
  service_name, serve_state.ServiceStatus.FAILED_CLEANUP)
@@ -333,8 +432,12 @@ if __name__ == '__main__':
333
432
  required=True,
334
433
  type=int,
335
434
  help='Job id for the service job.')
435
+ parser.add_argument('--entrypoint',
436
+ type=str,
437
+ help='Entrypoint to launch the service',
438
+ required=True)
336
439
  args = parser.parse_args()
337
440
  # We start process with 'spawn', because 'fork' could result in weird
338
441
  # behaviors; 'spawn' is also cross-platform.
339
442
  multiprocessing.set_start_method('spawn', force=True)
340
- _start(args.service_name, args.task_yaml, args.job_id)
443
+ _start(args.service_name, args.task_yaml, args.job_id, args.entrypoint)