skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/serve/server/server.py CHANGED
@@ -10,6 +10,7 @@ from sky.server import common as server_common
10
10
  from sky.server import stream_utils
11
11
  from sky.server.requests import executor
12
12
  from sky.server.requests import payloads
13
+ from sky.server.requests import request_names
13
14
  from sky.server.requests import requests as api_requests
14
15
  from sky.skylet import constants
15
16
  from sky.utils import common
@@ -23,9 +24,9 @@ async def up(
23
24
  request: fastapi.Request,
24
25
  up_body: payloads.ServeUpBody,
25
26
  ) -> None:
26
- executor.schedule_request(
27
+ await executor.schedule_request_async(
27
28
  request_id=request.state.request_id,
28
- request_name='serve.up',
29
+ request_name=request_names.RequestName.SERVE_UP,
29
30
  request_body=up_body,
30
31
  func=core.up,
31
32
  schedule_type=api_requests.ScheduleType.LONG,
@@ -38,9 +39,9 @@ async def update(
38
39
  request: fastapi.Request,
39
40
  update_body: payloads.ServeUpdateBody,
40
41
  ) -> None:
41
- executor.schedule_request(
42
+ await executor.schedule_request_async(
42
43
  request_id=request.state.request_id,
43
- request_name='serve.update',
44
+ request_name=request_names.RequestName.SERVE_UPDATE,
44
45
  request_body=update_body,
45
46
  func=core.update,
46
47
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -53,9 +54,9 @@ async def down(
53
54
  request: fastapi.Request,
54
55
  down_body: payloads.ServeDownBody,
55
56
  ) -> None:
56
- executor.schedule_request(
57
+ await executor.schedule_request_async(
57
58
  request_id=request.state.request_id,
58
- request_name='serve.down',
59
+ request_name=request_names.RequestName.SERVE_DOWN,
59
60
  request_body=down_body,
60
61
  func=core.down,
61
62
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -68,9 +69,9 @@ async def terminate_replica(
68
69
  request: fastapi.Request,
69
70
  terminate_replica_body: payloads.ServeTerminateReplicaBody,
70
71
  ) -> None:
71
- executor.schedule_request(
72
+ await executor.schedule_request_async(
72
73
  request_id=request.state.request_id,
73
- request_name='serve.terminate_replica',
74
+ request_name=request_names.RequestName.SERVE_TERMINATE_REPLICA,
74
75
  request_body=terminate_replica_body,
75
76
  func=core.terminate_replica,
76
77
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -83,9 +84,9 @@ async def status(
83
84
  request: fastapi.Request,
84
85
  status_body: payloads.ServeStatusBody,
85
86
  ) -> None:
86
- executor.schedule_request(
87
+ await executor.schedule_request_async(
87
88
  request_id=request.state.request_id,
88
- request_name='serve.status',
89
+ request_name=request_names.RequestName.SERVE_STATUS,
89
90
  request_body=status_body,
90
91
  func=core.status,
91
92
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -98,21 +99,23 @@ async def tail_logs(
98
99
  request: fastapi.Request, log_body: payloads.ServeLogsBody,
99
100
  background_tasks: fastapi.BackgroundTasks
100
101
  ) -> fastapi.responses.StreamingResponse:
101
- executor.schedule_request(
102
+ executor.check_request_thread_executor_available()
103
+ request_task = await executor.prepare_request_async(
102
104
  request_id=request.state.request_id,
103
- request_name='serve.logs',
105
+ request_name=request_names.RequestName.SERVE_LOGS,
104
106
  request_body=log_body,
105
107
  func=core.tail_logs,
106
108
  schedule_type=api_requests.ScheduleType.SHORT,
107
109
  request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
108
110
  )
109
-
110
- request_task = api_requests.get_request(request.state.request_id)
111
-
112
- return stream_utils.stream_response(
111
+ task = executor.execute_request_in_coroutine(request_task)
112
+ # Cancel the coroutine after the request is done or client disconnects
113
+ background_tasks.add_task(task.cancel)
114
+ return stream_utils.stream_response_for_long_request(
113
115
  request_id=request_task.request_id,
114
116
  logs_path=request_task.log_path,
115
117
  background_tasks=background_tasks,
118
+ kill_request_on_disconnect=False,
116
119
  )
117
120
 
118
121
 
@@ -130,9 +133,9 @@ async def download_logs(
130
133
  # We should reuse the original request body, so that the env vars, such as
131
134
  # user hash, are kept the same.
132
135
  download_logs_body.local_dir = str(logs_dir_on_api_server)
133
- executor.schedule_request(
136
+ await executor.schedule_request_async(
134
137
  request_id=request.state.request_id,
135
- request_name='serve.sync_down_logs',
138
+ request_name=request_names.RequestName.SERVE_SYNC_DOWN_LOGS,
136
139
  request_body=download_logs_body,
137
140
  func=core.sync_down_logs,
138
141
  schedule_type=api_requests.ScheduleType.SHORT,
sky/serve/service.py CHANGED
@@ -13,12 +13,13 @@ from typing import Dict
13
13
 
14
14
  import filelock
15
15
 
16
- from sky import authentication
17
16
  from sky import exceptions
17
+ from sky import global_user_state
18
18
  from sky import sky_logging
19
19
  from sky import task as task_lib
20
20
  from sky.backends import backend_utils
21
21
  from sky.backends import cloud_vm_ray_backend
22
+ from sky.data import data_utils
22
23
  from sky.serve import constants
23
24
  from sky.serve import controller
24
25
  from sky.serve import load_balancer
@@ -26,7 +27,9 @@ from sky.serve import replica_managers
26
27
  from sky.serve import serve_state
27
28
  from sky.serve import serve_utils
28
29
  from sky.skylet import constants as skylet_constants
30
+ from sky.utils import auth_utils
29
31
  from sky.utils import common_utils
32
+ from sky.utils import controller_utils
30
33
  from sky.utils import subprocess_utils
31
34
  from sky.utils import ux_utils
32
35
 
@@ -71,6 +74,8 @@ def cleanup_storage(task_yaml: str) -> bool:
71
74
  Returns:
72
75
  True if the storage is cleaned up successfully, False otherwise.
73
76
  """
77
+ failed = False
78
+
74
79
  try:
75
80
  task = task_lib.Task.from_yaml(task_yaml)
76
81
  backend = cloud_vm_ray_backend.CloudVmRayBackend()
@@ -86,41 +91,105 @@ def cleanup_storage(task_yaml: str) -> bool:
86
91
  f'{common_utils.format_exception(e)}')
87
92
  with ux_utils.enable_traceback():
88
93
  logger.error(f' Traceback: {traceback.format_exc()}')
89
- return False
90
- return True
94
+ failed = True
95
+
96
+ # Clean up any files mounted from the local disk, such as two-hop file
97
+ # mounts.
98
+ for file_mount in (task.file_mounts or {}).values():
99
+ try:
100
+ if not data_utils.is_cloud_store_url(file_mount):
101
+ path = os.path.expanduser(file_mount)
102
+ if os.path.isdir(path):
103
+ shutil.rmtree(path)
104
+ else:
105
+ os.remove(path)
106
+ except Exception as e: # pylint: disable=broad-except
107
+ logger.error(f'Failed to clean up file mount {file_mount}: {e}')
108
+ with ux_utils.enable_traceback():
109
+ logger.error(f' Traceback: {traceback.format_exc()}')
110
+ failed = True
91
111
 
112
+ return not failed
92
113
 
114
+
115
+ # NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
116
+ # because we killed all the processes (controller & replica manager) before
117
+ # calling this function.
93
118
  def _cleanup(service_name: str) -> bool:
94
119
  """Clean up all service related resources, i.e. replicas and storage."""
120
+ # Cleanup the HA recovery script first as it is possible that some error
121
+ # was raised when we construct the task object (e.g.,
122
+ # sky.exceptions.ResourcesUnavailableError).
123
+ serve_state.remove_ha_recovery_script(service_name)
95
124
  failed = False
96
125
  replica_infos = serve_state.get_replica_infos(service_name)
97
126
  info2proc: Dict[replica_managers.ReplicaInfo,
98
127
  multiprocessing.Process] = dict()
128
+ # NOTE(dev): This relies on `sky/serve/serve_utils.py::
129
+ # generate_replica_cluster_name`. Change it if you change the function.
130
+ existing_cluster_names = global_user_state.get_cluster_names_start_with(
131
+ service_name)
99
132
  for info in replica_infos:
133
+ if info.cluster_name not in existing_cluster_names:
134
+ logger.info(f'Cluster {info.cluster_name} for replica '
135
+ f'{info.replica_id} not found. Might be a failed '
136
+ 'cluster. Skipping.')
137
+ continue
100
138
  p = multiprocessing.Process(target=replica_managers.terminate_cluster,
101
139
  args=(info.cluster_name,))
102
- p.start()
103
140
  info2proc[info] = p
104
141
  # Set replica status to `SHUTTING_DOWN`
105
142
  info.status_property.sky_launch_status = (
106
- replica_managers.ProcessStatus.SUCCEEDED)
143
+ replica_managers.common_utils.ProcessStatus.SUCCEEDED)
107
144
  info.status_property.sky_down_status = (
108
- replica_managers.ProcessStatus.RUNNING)
145
+ replica_managers.common_utils.ProcessStatus.SCHEDULED)
109
146
  serve_state.add_or_update_replica(service_name, info.replica_id, info)
110
- logger.info(f'Terminating replica {info.replica_id} ...')
111
- for info, p in info2proc.items():
112
- p.join()
113
- if p.exitcode == 0:
114
- serve_state.remove_replica(service_name, info.replica_id)
115
- logger.info(f'Replica {info.replica_id} terminated successfully.')
116
- else:
117
- # Set replica status to `FAILED_CLEANUP`
118
- info.status_property.sky_down_status = (
119
- replica_managers.ProcessStatus.FAILED)
120
- serve_state.add_or_update_replica(service_name, info.replica_id,
121
- info)
122
- failed = True
123
- logger.error(f'Replica {info.replica_id} failed to terminate.')
147
+ logger.info(f'Scheduling to terminate replica {info.replica_id} ...')
148
+
149
+ def _set_to_failed_cleanup(info: replica_managers.ReplicaInfo) -> None:
150
+ nonlocal failed
151
+ # Set replica status to `FAILED_CLEANUP`
152
+ info.status_property.sky_down_status = (
153
+ replica_managers.common_utils.ProcessStatus.FAILED)
154
+ serve_state.add_or_update_replica(service_name, info.replica_id, info)
155
+ failed = True
156
+ logger.error(f'Replica {info.replica_id} failed to terminate.')
157
+
158
+ # Please reference to sky/serve/replica_managers.py::_refresh_process_pool.
159
+ # TODO(tian): Refactor to use the same logic and code.
160
+ while info2proc:
161
+ snapshot = list(info2proc.items())
162
+ for info, p in snapshot:
163
+ if p.is_alive():
164
+ continue
165
+ if (info.status_property.sky_down_status ==
166
+ replica_managers.common_utils.ProcessStatus.SCHEDULED):
167
+ if controller_utils.can_terminate():
168
+ try:
169
+ p.start()
170
+ except Exception as e: # pylint: disable=broad-except
171
+ _set_to_failed_cleanup(info)
172
+ logger.error(f'Failed to start process for replica '
173
+ f'{info.replica_id}: {e}')
174
+ del info2proc[info]
175
+ else:
176
+ info.status_property.sky_down_status = (
177
+ common_utils.ProcessStatus.RUNNING)
178
+ serve_state.add_or_update_replica(
179
+ service_name, info.replica_id, info)
180
+ else:
181
+ logger.info('Terminate process for replica '
182
+ f'{info.replica_id} finished.')
183
+ p.join()
184
+ del info2proc[info]
185
+ if p.exitcode == 0:
186
+ serve_state.remove_replica(service_name, info.replica_id)
187
+ logger.info(
188
+ f'Replica {info.replica_id} terminated successfully.')
189
+ else:
190
+ _set_to_failed_cleanup(info)
191
+ time.sleep(3)
192
+
124
193
  versions = serve_state.get_service_versions(service_name)
125
194
  serve_state.remove_service_versions(service_name)
126
195
 
@@ -152,13 +221,13 @@ def _cleanup_task_run_script(job_id: int) -> None:
152
221
  logger.warning(f'Task run script {this_task_run_script} not found')
153
222
 
154
223
 
155
- def _start(service_name: str, tmp_task_yaml: str, job_id: int):
224
+ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
156
225
  """Starts the service.
157
226
  This including the controller and load balancer.
158
227
  """
159
228
  # Generate ssh key pair to avoid race condition when multiple sky.launch
160
229
  # are executed at the same time.
161
- authentication.get_or_generate_keys()
230
+ auth_utils.get_or_generate_keys()
162
231
 
163
232
  # Initialize database record for the service.
164
233
  task = task_lib.Task.from_yaml(tmp_task_yaml)
@@ -186,22 +255,28 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
186
255
 
187
256
  service_dir = os.path.expanduser(
188
257
  serve_utils.generate_remote_service_dir_name(service_name))
189
- task_yaml = serve_utils.generate_task_yaml_file_name(service_name, version)
258
+ service_task_yaml = serve_utils.generate_task_yaml_file_name(
259
+ service_name, version)
190
260
 
191
261
  if not is_recovery:
192
- if (len(serve_state.get_services()) >=
193
- serve_utils.get_num_service_threshold()):
194
- cleanup_storage(tmp_task_yaml)
195
- with ux_utils.print_exception_no_traceback():
196
- raise RuntimeError('Max number of services reached.')
197
- success = serve_state.add_service(
198
- service_name,
199
- controller_job_id=job_id,
200
- policy=service_spec.autoscaling_policy_str(),
201
- requested_resources_str=backend_utils.get_task_resources_str(task),
202
- load_balancing_policy=service_spec.load_balancing_policy,
203
- status=serve_state.ServiceStatus.CONTROLLER_INIT,
204
- tls_encrypted=service_spec.tls_credential is not None)
262
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
263
+ if not controller_utils.can_start_new_process():
264
+ cleanup_storage(tmp_task_yaml)
265
+ with ux_utils.print_exception_no_traceback():
266
+ raise RuntimeError(
267
+ constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
268
+ success = serve_state.add_service(
269
+ service_name,
270
+ controller_job_id=job_id,
271
+ policy=service_spec.autoscaling_policy_str(),
272
+ requested_resources_str=backend_utils.get_task_resources_str(
273
+ task),
274
+ load_balancing_policy=service_spec.load_balancing_policy,
275
+ status=serve_state.ServiceStatus.CONTROLLER_INIT,
276
+ tls_encrypted=service_spec.tls_credential is not None,
277
+ pool=service_spec.pool,
278
+ controller_pid=os.getpid(),
279
+ entrypoint=entrypoint)
205
280
  # Directly throw an error here. See sky/serve/api.py::up
206
281
  # for more details.
207
282
  if not success:
@@ -218,7 +293,9 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
218
293
  # don't want the new file mounts to overwrite the old one, so we
219
294
  # sync to a tmp file first and then copy it to the final name
220
295
  # if there is no name conflict.
221
- shutil.copy(tmp_task_yaml, task_yaml)
296
+ shutil.copy(tmp_task_yaml, service_task_yaml)
297
+ else:
298
+ serve_state.update_service_controller_pid(service_name, os.getpid())
222
299
 
223
300
  controller_process = None
224
301
  load_balancer_process = None
@@ -249,8 +326,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
249
326
  controller_host = _get_controller_host()
250
327
  controller_process = multiprocessing.Process(
251
328
  target=controller.run_controller,
252
- args=(service_name, service_spec, task_yaml, controller_host,
253
- controller_port))
329
+ args=(service_name, service_spec, service_task_yaml,
330
+ controller_host, controller_port))
254
331
  controller_process.start()
255
332
 
256
333
  if not is_recovery:
@@ -271,14 +348,18 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
271
348
  # TODO(tian): Probably we could enable multiple ports specified in
272
349
  # service spec and we could start multiple load balancers.
273
350
  # After that, we will have a mapping from replica port to endpoint.
274
- load_balancer_process = multiprocessing.Process(
275
- target=ux_utils.RedirectOutputForProcess(
276
- load_balancer.run_load_balancer,
277
- load_balancer_log_file).run,
278
- args=(controller_addr, load_balancer_port,
279
- service_spec.load_balancing_policy,
280
- service_spec.tls_credential))
281
- load_balancer_process.start()
351
+ # NOTE(tian): We don't need the load balancer for cluster pool.
352
+ # Skip the load balancer process for cluster pool.
353
+ if not service_spec.pool:
354
+ load_balancer_process = multiprocessing.Process(
355
+ target=ux_utils.RedirectOutputForProcess(
356
+ load_balancer.run_load_balancer,
357
+ load_balancer_log_file).run,
358
+ args=(controller_addr, load_balancer_port,
359
+ service_spec.load_balancing_policy,
360
+ service_spec.tls_credential,
361
+ service_spec.target_qps_per_replica))
362
+ load_balancer_process.start()
282
363
 
283
364
  if not is_recovery:
284
365
  serve_state.set_service_load_balancer_port(
@@ -333,8 +414,12 @@ if __name__ == '__main__':
333
414
  required=True,
334
415
  type=int,
335
416
  help='Job id for the service job.')
417
+ parser.add_argument('--entrypoint',
418
+ type=str,
419
+ help='Entrypoint to launch the service',
420
+ required=True)
336
421
  args = parser.parse_args()
337
422
  # We start process with 'spawn', because 'fork' could result in weird
338
423
  # behaviors; 'spawn' is also cross-platform.
339
424
  multiprocessing.set_start_method('spawn', force=True)
340
- _start(args.service_name, args.task_yaml, args.job_id)
425
+ _start(args.service_name, args.task_yaml, args.job_id, args.entrypoint)
sky/serve/service_spec.py CHANGED
@@ -2,11 +2,9 @@
2
2
  import json
3
3
  import os
4
4
  import textwrap
5
- import typing
6
- from typing import Any, Dict, List, Optional
5
+ from typing import Any, Dict, List, Optional, Union
7
6
 
8
7
  from sky import serve
9
- from sky.adaptors import common as adaptors_common
10
8
  from sky.serve import constants
11
9
  from sky.serve import load_balancing_policies as lb_policies
12
10
  from sky.serve import serve_utils
@@ -14,11 +12,7 @@ from sky.serve import spot_placer as spot_placer_lib
14
12
  from sky.utils import common_utils
15
13
  from sky.utils import schemas
16
14
  from sky.utils import ux_utils
17
-
18
- if typing.TYPE_CHECKING:
19
- import yaml
20
- else:
21
- yaml = adaptors_common.LazyImport('yaml')
15
+ from sky.utils import yaml_utils
22
16
 
23
17
 
24
18
  class SkyServiceSpec:
@@ -33,7 +27,7 @@ class SkyServiceSpec:
33
27
  max_replicas: Optional[int] = None,
34
28
  num_overprovision: Optional[int] = None,
35
29
  ports: Optional[str] = None,
36
- target_qps_per_replica: Optional[float] = None,
30
+ target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None,
37
31
  post_data: Optional[Dict[str, Any]] = None,
38
32
  tls_credential: Optional[serve_utils.TLSCredential] = None,
39
33
  readiness_headers: Optional[Dict[str, str]] = None,
@@ -43,7 +37,33 @@ class SkyServiceSpec:
43
37
  upscale_delay_seconds: Optional[int] = None,
44
38
  downscale_delay_seconds: Optional[int] = None,
45
39
  load_balancing_policy: Optional[str] = None,
40
+ pool: Optional[bool] = None,
46
41
  ) -> None:
42
+ if pool:
43
+ for unsupported_field in [
44
+ 'max_replicas',
45
+ 'num_overprovision',
46
+ 'target_qps_per_replica',
47
+ 'upscale_delay_seconds',
48
+ 'downscale_delay_seconds',
49
+ 'base_ondemand_fallback_replicas',
50
+ 'dynamic_ondemand_fallback',
51
+ 'spot_placer',
52
+ 'load_balancing_policy',
53
+ 'ports',
54
+ 'post_data',
55
+ 'tls_credential',
56
+ 'readiness_headers',
57
+ ]:
58
+ if locals()[unsupported_field] is not None:
59
+ with ux_utils.print_exception_no_traceback():
60
+ raise ValueError(
61
+ f'{unsupported_field} is not supported for pool.')
62
+ if max_replicas is not None and max_replicas != min_replicas:
63
+ with ux_utils.print_exception_no_traceback():
64
+ raise ValueError('Autoscaling is not supported for pool '
65
+ 'for now.')
66
+
47
67
  if max_replicas is not None and max_replicas < min_replicas:
48
68
  with ux_utils.print_exception_no_traceback():
49
69
  raise ValueError('max_replicas must be greater than or '
@@ -83,7 +103,8 @@ class SkyServiceSpec:
83
103
  self._max_replicas: Optional[int] = max_replicas
84
104
  self._num_overprovision: Optional[int] = num_overprovision
85
105
  self._ports: Optional[str] = ports
86
- self._target_qps_per_replica: Optional[float] = target_qps_per_replica
106
+ self._target_qps_per_replica: Optional[Union[float, Dict[
107
+ str, float]]] = target_qps_per_replica
87
108
  self._post_data: Optional[Dict[str, Any]] = post_data
88
109
  self._tls_credential: Optional[serve_utils.TLSCredential] = (
89
110
  tls_credential)
@@ -96,6 +117,7 @@ class SkyServiceSpec:
96
117
  self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
97
118
  self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
98
119
  self._load_balancing_policy: Optional[str] = load_balancing_policy
120
+ self._pool: Optional[bool] = pool
99
121
 
100
122
  self._use_ondemand_fallback: bool = (
101
123
  self.dynamic_ondemand_fallback is not None and
@@ -115,7 +137,7 @@ class SkyServiceSpec:
115
137
 
116
138
  service_config: Dict[str, Any] = {}
117
139
 
118
- readiness_section = config['readiness_probe']
140
+ readiness_section = config.get('readiness_probe', '/')
119
141
  if isinstance(readiness_section, str):
120
142
  service_config['readiness_path'] = readiness_section
121
143
  initial_delay_seconds = None
@@ -157,8 +179,29 @@ class SkyServiceSpec:
157
179
  raise ValueError('Port must be between 1 and 65535.')
158
180
  service_config['ports'] = str(ports) if ports is not None else None
159
181
 
182
+ pool_config = config.get('pool', None)
183
+ if pool_config is not None:
184
+ service_config['pool'] = pool_config
185
+
160
186
  policy_section = config.get('replica_policy', None)
187
+ if policy_section is not None and pool_config:
188
+ with ux_utils.print_exception_no_traceback():
189
+ raise ValueError('Cannot specify `replica_policy` for cluster '
190
+ 'pool. Only `workers: <num>` is supported '
191
+ 'for cluster pool now.')
192
+
161
193
  simplified_policy_section = config.get('replicas', None)
194
+ workers_config = config.get('workers', None)
195
+ if simplified_policy_section is not None and workers_config is not None:
196
+ with ux_utils.print_exception_no_traceback():
197
+ raise ValueError('Cannot specify both `replicas` and `workers`.'
198
+ ' Please use one of them.')
199
+ if simplified_policy_section is not None and pool_config:
200
+ with ux_utils.print_exception_no_traceback():
201
+ raise ValueError('Cannot specify `replicas` for cluster pool. '
202
+ 'Please use `workers` instead.')
203
+ if simplified_policy_section is None:
204
+ simplified_policy_section = workers_config
162
205
  if policy_section is None or simplified_policy_section is not None:
163
206
  if simplified_policy_section is not None:
164
207
  min_replicas = simplified_policy_section
@@ -193,6 +236,26 @@ class SkyServiceSpec:
193
236
  service_config['load_balancing_policy'] = config.get(
194
237
  'load_balancing_policy', None)
195
238
 
239
+ # Validate instance-aware settings
240
+ target_qps_per_replica = service_config['target_qps_per_replica']
241
+ load_balancing_policy = service_config['load_balancing_policy']
242
+
243
+ if isinstance(target_qps_per_replica, dict):
244
+ if load_balancing_policy != 'instance_aware_least_load':
245
+ with ux_utils.print_exception_no_traceback():
246
+ raise ValueError(
247
+ 'When using dict type target_qps_per_replica, '
248
+ 'load_balancing_policy must be '
249
+ '"instance_aware_least_load".')
250
+
251
+ if load_balancing_policy == 'instance_aware_least_load':
252
+ if not isinstance(target_qps_per_replica, dict):
253
+ with ux_utils.print_exception_no_traceback():
254
+ raise ValueError(
255
+ 'When using "instance_aware_least_load" policy, '
256
+ 'target_qps_per_replica must be a '
257
+ 'dict mapping GPU types to QPS values.')
258
+
196
259
  tls_section = config.get('tls', None)
197
260
  if tls_section is not None:
198
261
  service_config['tls_credential'] = serve_utils.TLSCredential(
@@ -205,7 +268,7 @@ class SkyServiceSpec:
205
268
  @staticmethod
206
269
  def from_yaml(yaml_path: str) -> 'SkyServiceSpec':
207
270
  with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
208
- config = yaml.safe_load(f)
271
+ config = yaml_utils.safe_load(f)
209
272
 
210
273
  if isinstance(config, str):
211
274
  with ux_utils.print_exception_no_traceback():
@@ -239,6 +302,13 @@ class SkyServiceSpec:
239
302
  config[section] = dict()
240
303
  config[section][key] = value
241
304
 
305
+ add_if_not_none('pool', None, self._pool)
306
+
307
+ if self.pool:
308
+ # For pool, currently only `workers: <num>` is supported.
309
+ add_if_not_none('workers', None, self.min_replicas)
310
+ return config
311
+
242
312
  add_if_not_none('readiness_probe', 'path', self.readiness_path)
243
313
  add_if_not_none('readiness_probe', 'initial_delay_seconds',
244
314
  self.initial_delay_seconds)
@@ -306,10 +376,14 @@ class SkyServiceSpec:
306
376
  return ' '.join(policy_strs)
307
377
 
308
378
  def autoscaling_policy_str(self):
379
+ if self.pool:
380
+ # We only support fixed-size pool for now.
381
+ return f'Fixed-size ({self.min_replicas} workers)'
309
382
  # TODO(MaoZiming): Update policy_str
383
+ noun = 'worker' if self.pool else 'replica'
310
384
  min_plural = '' if self.min_replicas == 1 else 's'
311
385
  if self.max_replicas == self.min_replicas or self.max_replicas is None:
312
- return f'Fixed {self.min_replicas} replica{min_plural}'
386
+ return f'Fixed {self.min_replicas} {noun}{min_plural}'
313
387
  # Already checked in __init__.
314
388
  assert self.target_qps_per_replica is not None
315
389
  # TODO(tian): Refactor to contain more information
@@ -319,8 +393,8 @@ class SkyServiceSpec:
319
393
  overprovision_str = (
320
394
  f' with {self.num_overprovision} overprovisioned replicas')
321
395
  return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
322
- f'replica{max_plural}{overprovision_str} (target QPS per '
323
- f'replica: {self.target_qps_per_replica})')
396
+ f'{noun}{max_plural}{overprovision_str} (target QPS per '
397
+ f'{noun}: {self.target_qps_per_replica})')
324
398
 
325
399
  def set_ports(self, ports: str) -> None:
326
400
  self._ports = ports
@@ -332,6 +406,10 @@ class SkyServiceSpec:
332
406
  f'Certfile: {self.tls_credential.certfile}')
333
407
 
334
408
  def __repr__(self) -> str:
409
+ if self.pool:
410
+ return textwrap.dedent(f"""\
411
+ Worker policy: {self.autoscaling_policy_str()}
412
+ """)
335
413
  return textwrap.dedent(f"""\
336
414
  Readiness probe method: {self.probe_str()}
337
415
  Readiness initial delay seconds: {self.initial_delay_seconds}
@@ -372,7 +450,8 @@ class SkyServiceSpec:
372
450
  return self._ports
373
451
 
374
452
  @property
375
- def target_qps_per_replica(self) -> Optional[float]:
453
+ def target_qps_per_replica(
454
+ self) -> Optional[Union[float, Dict[str, float]]]:
376
455
  return self._target_qps_per_replica
377
456
 
378
457
  @property
@@ -420,3 +499,43 @@ class SkyServiceSpec:
420
499
  def load_balancing_policy(self) -> str:
421
500
  return lb_policies.LoadBalancingPolicy.make_policy_name(
422
501
  self._load_balancing_policy)
502
+
503
+ @property
504
+ def pool(self) -> bool:
505
+ # This can happen for backward compatibility.
506
+ if not hasattr(self, '_pool'):
507
+ return False
508
+ return bool(self._pool)
509
+
510
+ def copy(self, **override) -> 'SkyServiceSpec':
511
+ return SkyServiceSpec(
512
+ readiness_path=override.pop('readiness_path', self._readiness_path),
513
+ initial_delay_seconds=override.pop('initial_delay_seconds',
514
+ self._initial_delay_seconds),
515
+ readiness_timeout_seconds=override.pop(
516
+ 'readiness_timeout_seconds', self._readiness_timeout_seconds),
517
+ min_replicas=override.pop('min_replicas', self._min_replicas),
518
+ max_replicas=override.pop('max_replicas', self._max_replicas),
519
+ num_overprovision=override.pop('num_overprovision',
520
+ self._num_overprovision),
521
+ ports=override.pop('ports', self._ports),
522
+ target_qps_per_replica=override.pop('target_qps_per_replica',
523
+ self._target_qps_per_replica),
524
+ post_data=override.pop('post_data', self._post_data),
525
+ tls_credential=override.pop('tls_credential', self._tls_credential),
526
+ readiness_headers=override.pop('readiness_headers',
527
+ self._readiness_headers),
528
+ dynamic_ondemand_fallback=override.pop(
529
+ 'dynamic_ondemand_fallback', self._dynamic_ondemand_fallback),
530
+ base_ondemand_fallback_replicas=override.pop(
531
+ 'base_ondemand_fallback_replicas',
532
+ self._base_ondemand_fallback_replicas),
533
+ spot_placer=override.pop('spot_placer', self._spot_placer),
534
+ upscale_delay_seconds=override.pop('upscale_delay_seconds',
535
+ self._upscale_delay_seconds),
536
+ downscale_delay_seconds=override.pop('downscale_delay_seconds',
537
+ self._downscale_delay_seconds),
538
+ load_balancing_policy=override.pop('load_balancing_policy',
539
+ self._load_balancing_policy),
540
+ pool=override.pop('pool', self._pool),
541
+ )
sky/serve/spot_placer.py CHANGED
@@ -46,6 +46,8 @@ class Location:
46
46
 
47
47
  @classmethod
48
48
  def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
49
+ assert resources.cloud is not None, 'Cloud must be specified'
50
+ assert resources.region is not None, 'Region must be specified'
49
51
  return cls(resources.cloud, resources.region, resources.zone)
50
52
 
51
53
  def to_dict(self) -> Dict[str, Any]:
@@ -147,6 +149,7 @@ def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
147
149
  cloud_str = str(launchable.cloud)
148
150
  region = launchable.region
149
151
  zone = launchable.zone
152
+ assert region is not None, 'Region must be specified'
150
153
  if (cloud_str not in location_requirements and
151
154
  location_requirements):
152
155
  continue
File without changes