skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/serve/service_spec.py CHANGED
@@ -2,11 +2,9 @@
2
2
  import json
3
3
  import os
4
4
  import textwrap
5
- import typing
6
- from typing import Any, Dict, List, Optional
5
+ from typing import Any, Dict, List, Optional, Union
7
6
 
8
7
  from sky import serve
9
- from sky.adaptors import common as adaptors_common
10
8
  from sky.serve import constants
11
9
  from sky.serve import load_balancing_policies as lb_policies
12
10
  from sky.serve import serve_utils
@@ -14,11 +12,7 @@ from sky.serve import spot_placer as spot_placer_lib
14
12
  from sky.utils import common_utils
15
13
  from sky.utils import schemas
16
14
  from sky.utils import ux_utils
17
-
18
- if typing.TYPE_CHECKING:
19
- import yaml
20
- else:
21
- yaml = adaptors_common.LazyImport('yaml')
15
+ from sky.utils import yaml_utils
22
16
 
23
17
 
24
18
  class SkyServiceSpec:
@@ -33,7 +27,7 @@ class SkyServiceSpec:
33
27
  max_replicas: Optional[int] = None,
34
28
  num_overprovision: Optional[int] = None,
35
29
  ports: Optional[str] = None,
36
- target_qps_per_replica: Optional[float] = None,
30
+ target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None,
37
31
  post_data: Optional[Dict[str, Any]] = None,
38
32
  tls_credential: Optional[serve_utils.TLSCredential] = None,
39
33
  readiness_headers: Optional[Dict[str, str]] = None,
@@ -43,7 +37,33 @@ class SkyServiceSpec:
43
37
  upscale_delay_seconds: Optional[int] = None,
44
38
  downscale_delay_seconds: Optional[int] = None,
45
39
  load_balancing_policy: Optional[str] = None,
40
+ pool: Optional[bool] = None,
46
41
  ) -> None:
42
+ if pool:
43
+ for unsupported_field in [
44
+ 'max_replicas',
45
+ 'num_overprovision',
46
+ 'target_qps_per_replica',
47
+ 'upscale_delay_seconds',
48
+ 'downscale_delay_seconds',
49
+ 'base_ondemand_fallback_replicas',
50
+ 'dynamic_ondemand_fallback',
51
+ 'spot_placer',
52
+ 'load_balancing_policy',
53
+ 'ports',
54
+ 'post_data',
55
+ 'tls_credential',
56
+ 'readiness_headers',
57
+ ]:
58
+ if locals()[unsupported_field] is not None:
59
+ with ux_utils.print_exception_no_traceback():
60
+ raise ValueError(
61
+ f'{unsupported_field} is not supported for pool.')
62
+ if max_replicas is not None and max_replicas != min_replicas:
63
+ with ux_utils.print_exception_no_traceback():
64
+ raise ValueError('Autoscaling is not supported for pool '
65
+ 'for now.')
66
+
47
67
  if max_replicas is not None and max_replicas < min_replicas:
48
68
  with ux_utils.print_exception_no_traceback():
49
69
  raise ValueError('max_replicas must be greater than or '
@@ -83,7 +103,8 @@ class SkyServiceSpec:
83
103
  self._max_replicas: Optional[int] = max_replicas
84
104
  self._num_overprovision: Optional[int] = num_overprovision
85
105
  self._ports: Optional[str] = ports
86
- self._target_qps_per_replica: Optional[float] = target_qps_per_replica
106
+ self._target_qps_per_replica: Optional[Union[float, Dict[
107
+ str, float]]] = target_qps_per_replica
87
108
  self._post_data: Optional[Dict[str, Any]] = post_data
88
109
  self._tls_credential: Optional[serve_utils.TLSCredential] = (
89
110
  tls_credential)
@@ -96,6 +117,7 @@ class SkyServiceSpec:
96
117
  self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
97
118
  self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
98
119
  self._load_balancing_policy: Optional[str] = load_balancing_policy
120
+ self._pool: Optional[bool] = pool
99
121
 
100
122
  self._use_ondemand_fallback: bool = (
101
123
  self.dynamic_ondemand_fallback is not None and
@@ -115,7 +137,7 @@ class SkyServiceSpec:
115
137
 
116
138
  service_config: Dict[str, Any] = {}
117
139
 
118
- readiness_section = config['readiness_probe']
140
+ readiness_section = config.get('readiness_probe', '/')
119
141
  if isinstance(readiness_section, str):
120
142
  service_config['readiness_path'] = readiness_section
121
143
  initial_delay_seconds = None
@@ -157,8 +179,29 @@ class SkyServiceSpec:
157
179
  raise ValueError('Port must be between 1 and 65535.')
158
180
  service_config['ports'] = str(ports) if ports is not None else None
159
181
 
182
+ pool_config = config.get('pool', None)
183
+ if pool_config is not None:
184
+ service_config['pool'] = pool_config
185
+
160
186
  policy_section = config.get('replica_policy', None)
187
+ if policy_section is not None and pool_config:
188
+ with ux_utils.print_exception_no_traceback():
189
+ raise ValueError('Cannot specify `replica_policy` for cluster '
190
+ 'pool. Only `workers: <num>` is supported '
191
+ 'for pool now.')
192
+
161
193
  simplified_policy_section = config.get('replicas', None)
194
+ workers_config = config.get('workers', None)
195
+ if simplified_policy_section is not None and workers_config is not None:
196
+ with ux_utils.print_exception_no_traceback():
197
+ raise ValueError('Cannot specify both `replicas` and `workers`.'
198
+ ' Please use one of them.')
199
+ if simplified_policy_section is not None and pool_config:
200
+ with ux_utils.print_exception_no_traceback():
201
+ raise ValueError('Cannot specify `replicas` for pool. '
202
+ 'Please use `workers` instead.')
203
+ if simplified_policy_section is None:
204
+ simplified_policy_section = workers_config
162
205
  if policy_section is None or simplified_policy_section is not None:
163
206
  if simplified_policy_section is not None:
164
207
  min_replicas = simplified_policy_section
@@ -193,6 +236,26 @@ class SkyServiceSpec:
193
236
  service_config['load_balancing_policy'] = config.get(
194
237
  'load_balancing_policy', None)
195
238
 
239
+ # Validate instance-aware settings
240
+ target_qps_per_replica = service_config['target_qps_per_replica']
241
+ load_balancing_policy = service_config['load_balancing_policy']
242
+
243
+ if isinstance(target_qps_per_replica, dict):
244
+ if load_balancing_policy != 'instance_aware_least_load':
245
+ with ux_utils.print_exception_no_traceback():
246
+ raise ValueError(
247
+ 'When using dict type target_qps_per_replica, '
248
+ 'load_balancing_policy must be '
249
+ '"instance_aware_least_load".')
250
+
251
+ if load_balancing_policy == 'instance_aware_least_load':
252
+ if not isinstance(target_qps_per_replica, dict):
253
+ with ux_utils.print_exception_no_traceback():
254
+ raise ValueError(
255
+ 'When using "instance_aware_least_load" policy, '
256
+ 'target_qps_per_replica must be a '
257
+ 'dict mapping GPU types to QPS values.')
258
+
196
259
  tls_section = config.get('tls', None)
197
260
  if tls_section is not None:
198
261
  service_config['tls_credential'] = serve_utils.TLSCredential(
@@ -203,14 +266,13 @@ class SkyServiceSpec:
203
266
  return SkyServiceSpec(**service_config)
204
267
 
205
268
  @staticmethod
206
- def from_yaml(yaml_path: str) -> 'SkyServiceSpec':
207
- with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
208
- config = yaml.safe_load(f)
269
+ def from_yaml_str(yaml_str: str) -> 'SkyServiceSpec':
270
+ config = yaml_utils.safe_load(yaml_str)
209
271
 
210
272
  if isinstance(config, str):
211
273
  with ux_utils.print_exception_no_traceback():
212
274
  raise ValueError('YAML loaded as str, not as dict. '
213
- f'Is it correct? Path: {yaml_path}')
275
+ f'Is it correct? content:\n{yaml_str}')
214
276
 
215
277
  if config is None:
216
278
  config = {}
@@ -218,10 +280,16 @@ class SkyServiceSpec:
218
280
  if 'service' not in config:
219
281
  with ux_utils.print_exception_no_traceback():
220
282
  raise ValueError('Service YAML must have a "service" section. '
221
- f'Is it correct? Path: {yaml_path}')
283
+ f'Is it correct? content:\n{yaml_str}')
222
284
 
223
285
  return SkyServiceSpec.from_yaml_config(config['service'])
224
286
 
287
@staticmethod
def from_yaml(yaml_path: str) -> 'SkyServiceSpec':
    """Build a service spec from the YAML file at `yaml_path`.

    The path is user-expanded (`~`), the file is read as UTF-8 text, and
    the text is handed to `from_yaml_str` for parsing and validation.
    """
    expanded_path = os.path.expanduser(yaml_path)
    with open(expanded_path, 'r', encoding='utf-8') as yaml_file:
        yaml_text = yaml_file.read()
    return SkyServiceSpec.from_yaml_str(yaml_text)
292
+
225
293
  def to_yaml_config(self) -> Dict[str, Any]:
226
294
  config: Dict[str, Any] = {}
227
295
 
@@ -239,6 +307,13 @@ class SkyServiceSpec:
239
307
  config[section] = dict()
240
308
  config[section][key] = value
241
309
 
310
+ add_if_not_none('pool', None, self._pool)
311
+
312
+ if self.pool:
313
+ # For pool, currently only `workers: <num>` is supported.
314
+ add_if_not_none('workers', None, self.min_replicas)
315
+ return config
316
+
242
317
  add_if_not_none('readiness_probe', 'path', self.readiness_path)
243
318
  add_if_not_none('readiness_probe', 'initial_delay_seconds',
244
319
  self.initial_delay_seconds)
@@ -306,10 +381,14 @@ class SkyServiceSpec:
306
381
  return ' '.join(policy_strs)
307
382
 
308
383
  def autoscaling_policy_str(self):
384
+ if self.pool:
385
+ # We only support fixed-size pool for now.
386
+ return f'Fixed-size ({self.min_replicas} workers)'
309
387
  # TODO(MaoZiming): Update policy_str
388
+ noun = 'worker' if self.pool else 'replica'
310
389
  min_plural = '' if self.min_replicas == 1 else 's'
311
390
  if self.max_replicas == self.min_replicas or self.max_replicas is None:
312
- return f'Fixed {self.min_replicas} replica{min_plural}'
391
+ return f'Fixed {self.min_replicas} {noun}{min_plural}'
313
392
  # Already checked in __init__.
314
393
  assert self.target_qps_per_replica is not None
315
394
  # TODO(tian): Refactor to contain more information
@@ -319,8 +398,8 @@ class SkyServiceSpec:
319
398
  overprovision_str = (
320
399
  f' with {self.num_overprovision} overprovisioned replicas')
321
400
  return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
322
- f'replica{max_plural}{overprovision_str} (target QPS per '
323
- f'replica: {self.target_qps_per_replica})')
401
+ f'{noun}{max_plural}{overprovision_str} (target QPS per '
402
+ f'{noun}: {self.target_qps_per_replica})')
324
403
 
325
404
  def set_ports(self, ports: str) -> None:
326
405
  self._ports = ports
@@ -332,6 +411,10 @@ class SkyServiceSpec:
332
411
  f'Certfile: {self.tls_credential.certfile}')
333
412
 
334
413
  def __repr__(self) -> str:
414
+ if self.pool:
415
+ return textwrap.dedent(f"""\
416
+ Worker policy: {self.autoscaling_policy_str()}
417
+ """)
335
418
  return textwrap.dedent(f"""\
336
419
  Readiness probe method: {self.probe_str()}
337
420
  Readiness initial delay seconds: {self.initial_delay_seconds}
@@ -372,7 +455,8 @@ class SkyServiceSpec:
372
455
  return self._ports
373
456
 
374
457
  @property
375
- def target_qps_per_replica(self) -> Optional[float]:
458
+ def target_qps_per_replica(
459
+ self) -> Optional[Union[float, Dict[str, float]]]:
376
460
  return self._target_qps_per_replica
377
461
 
378
462
  @property
@@ -420,3 +504,43 @@ class SkyServiceSpec:
420
504
  def load_balancing_policy(self) -> str:
421
505
  return lb_policies.LoadBalancingPolicy.make_policy_name(
422
506
  self._load_balancing_policy)
507
+
508
+ @property
509
+ def pool(self) -> bool:
510
+ # This can happen for backward compatibility.
511
+ if not hasattr(self, '_pool'):
512
+ return False
513
+ return bool(self._pool)
514
+
515
def copy(self, **override) -> 'SkyServiceSpec':
    """Return a new spec with the same settings, optionally overridden.

    Args:
        **override: Constructor arguments to replace, keyed by the
            `SkyServiceSpec.__init__` parameter name. Keys that do not
            match a constructor parameter are ignored.

    Returns:
        A freshly constructed `SkyServiceSpec`; constructing it re-runs
        the validation in `__init__`.
    """
    return SkyServiceSpec(
        readiness_path=override.pop('readiness_path', self._readiness_path),
        initial_delay_seconds=override.pop('initial_delay_seconds',
                                           self._initial_delay_seconds),
        readiness_timeout_seconds=override.pop(
            'readiness_timeout_seconds', self._readiness_timeout_seconds),
        min_replicas=override.pop('min_replicas', self._min_replicas),
        max_replicas=override.pop('max_replicas', self._max_replicas),
        num_overprovision=override.pop('num_overprovision',
                                       self._num_overprovision),
        ports=override.pop('ports', self._ports),
        target_qps_per_replica=override.pop('target_qps_per_replica',
                                            self._target_qps_per_replica),
        post_data=override.pop('post_data', self._post_data),
        tls_credential=override.pop('tls_credential', self._tls_credential),
        readiness_headers=override.pop('readiness_headers',
                                       self._readiness_headers),
        dynamic_ondemand_fallback=override.pop(
            'dynamic_ondemand_fallback', self._dynamic_ondemand_fallback),
        base_ondemand_fallback_replicas=override.pop(
            'base_ondemand_fallback_replicas',
            self._base_ondemand_fallback_replicas),
        spot_placer=override.pop('spot_placer', self._spot_placer),
        upscale_delay_seconds=override.pop('upscale_delay_seconds',
                                           self._upscale_delay_seconds),
        downscale_delay_seconds=override.pop('downscale_delay_seconds',
                                             self._downscale_delay_seconds),
        load_balancing_policy=override.pop('load_balancing_policy',
                                           self._load_balancing_policy),
        # `_pool` can be missing on instances created before the field
        # existed (the `pool` property guards against the same case), so
        # fall back to None instead of raising AttributeError.
        pool=override.pop('pool', getattr(self, '_pool', None)),
    )
sky/serve/spot_placer.py CHANGED
@@ -46,6 +46,8 @@ class Location:
46
46
 
47
47
  @classmethod
48
48
  def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
49
+ assert resources.cloud is not None, 'Cloud must be specified'
50
+ assert resources.region is not None, 'Region must be specified'
49
51
  return cls(resources.cloud, resources.region, resources.zone)
50
52
 
51
53
  def to_dict(self) -> Dict[str, Any]:
@@ -147,6 +149,7 @@ def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
147
149
  cloud_str = str(launchable.cloud)
148
150
  region = launchable.region
149
151
  zone = launchable.zone
152
+ assert region is not None, 'Region must be specified'
150
153
  if (cloud_str not in location_requirements and
151
154
  location_requirements):
152
155
  continue
File without changes
@@ -0,0 +1,50 @@
1
+ """Authentication module."""
2
+ import json
3
+ from typing import Optional
4
+
5
+ import fastapi
6
+
7
+ from sky import models
8
+ from sky import sky_logging
9
+ from sky.skylet import constants
10
+
11
+ logger = sky_logging.init_logger(__name__)
12
+
13
+
14
+ # TODO(hailong): Remove this function and use request.state.auth_user instead.
15
async def override_user_info_in_request_body(request: fastapi.Request,
                                             auth_user: Optional[models.User]):
    """Inject the authenticated user's id and name into the request body.

    The JSON body's `env_vars` mapping (created if absent) gets
    `constants.USER_ID_ENV_VAR` and `constants.USER_ENV_VAR` set from
    `auth_user`, and the cached request body is replaced with the
    re-serialized JSON.

    The request is left untouched when the path starts with `/upload`,
    when `auth_user` is None, when the body is empty, when the body is not
    parseable JSON, or when the parsed JSON is not an object.

    Args:
        request: The incoming request whose body may be rewritten.
        auth_user: The authenticated user, or None if unauthenticated.
    """
    # Skip for upload requests to avoid consuming the body prematurely, which
    # will break the streaming upload.
    if request.url.path.startswith('/upload'):
        return
    if auth_user is None:
        return

    body = await request.body()
    if not body:
        return

    try:
        original_json = await request.json()
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        logger.error(f'Error parsing request JSON: {e}')
        return

    # A valid JSON document need not be an object (e.g. a list or a number);
    # there is nowhere to put `env_vars` in that case, so leave the body
    # untouched instead of failing with a TypeError below.
    if not isinstance(original_json, dict):
        logger.warning(
            f'Request body is not a JSON object '
            f'for request {request.state.request_id}. '
            'Skipping user info injection into body.')
        return

    logger.debug(f'Overriding user for {request.state.request_id}: '
                 f'{auth_user.name}, {auth_user.id}')
    # Reuse the existing `env_vars` value, or install an empty dict.
    env_vars = original_json.setdefault('env_vars', {})
    if isinstance(env_vars, dict):
        env_vars[constants.USER_ID_ENV_VAR] = auth_user.id
        env_vars[constants.USER_ENV_VAR] = auth_user.name
    else:
        logger.warning(
            f'"env_vars" in request body is not a dictionary '
            f'for request {request.state.request_id}. '
            'Skipping user info injection into body.')
    request._body = json.dumps(original_json).encode('utf-8')  # pylint: disable=protected-access
@@ -0,0 +1,38 @@
1
+ """Shared loopback detection utilities for auth middlewares."""
2
+
3
+ import ipaddress
4
+
5
+ import fastapi
6
+
7
+ from sky import sky_logging
8
+
9
+ logger = sky_logging.init_logger(__name__)
10
+
11
+ COMMON_PROXY_HEADERS = [
12
+ 'X-Forwarded-For', 'Forwarded', 'X-Real-IP', 'X-Client-IP',
13
+ 'X-Forwarded-Host', 'X-Forwarded-Proto'
14
+ ]
15
+
16
+
17
+ def _is_loopback_ip(ip_str: str) -> bool:
18
+ """Check if an IP address is a loopback address."""
19
+ try:
20
+ ip = ipaddress.ip_address(ip_str)
21
+ return ip.is_loopback
22
+ except ValueError:
23
+ return False
24
+
25
+
26
def is_loopback_request(request: fastapi.Request) -> bool:
    """Determine if a request is coming from localhost.

    A request counts as loopback only when the client host is `localhost`
    or a loopback IP address and none of `COMMON_PROXY_HEADERS` carries a
    value.
    """
    client = request.client
    if client is None:
        return False

    host = client.host
    if host != 'localhost' and not _is_loopback_ip(host):
        return False

    # Additional checks: ensure no forwarding headers are present.
    # If there are any, assume this traffic went through a proxy.
    for header in COMMON_PROXY_HEADERS:
        if request.headers.get(header):
            return False
    return True
@@ -0,0 +1,202 @@
1
+ """Authentication based on oauth2-proxy."""
2
+
3
+ import asyncio
4
+ import hashlib
5
+ import http
6
+ import os
7
+ import traceback
8
+ from typing import Optional
9
+ import urllib
10
+
11
+ import aiohttp
12
+ import fastapi
13
+ import starlette.middleware.base
14
+
15
+ from sky import global_user_state
16
+ from sky import models
17
+ from sky import sky_logging
18
+ from sky.jobs import utils as managed_job_utils
19
+ from sky.server import middleware_utils
20
+ from sky.server.auth import authn
21
+ from sky.server.auth import loopback
22
+ from sky.users import permission
23
+ from sky.utils import common_utils
24
+
25
+ logger = sky_logging.init_logger(__name__)
26
+
27
+ # We do not support setting these in config.yaml because:
28
+ # 1. config.yaml can be updated dynamically, but auth middleware does not
29
+ # support hot reload yet.
30
+ # 2. If we introduce hot reload for auth middleware, bad config might
31
+ # invalidate all authenticated sessions and thus cannot be rolled back
32
+ # by API users.
33
+ # TODO(aylei): we should introduce server.yaml for static server admin config,
34
+ # which is more structured than multiple environment variables and can be less
35
+ # confusing to users.
36
+ OAUTH2_PROXY_BASE_URL_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_BASE_URL'
37
+ OAUTH2_PROXY_ENABLED_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_ENABLED'
38
+
39
+
40
@middleware_utils.websocket_aware
class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware to handle authentication by delegating to OAuth2 Proxy.

    The middleware is a no-op unless the environment variable named by
    `OAUTH2_PROXY_ENABLED_ENV_VAR` is exactly 'true'. When enabled, the
    variable named by `OAUTH2_PROXY_BASE_URL_ENV_VAR` must hold the base URL
    of the oauth2-proxy service.
    """

    def __init__(self, *args, **kwargs):
        """Read the enable flag and proxy base URL from the environment.

        Raises:
            ValueError: If the middleware is enabled but no base URL is set.
        """
        super().__init__(*args, **kwargs)
        self.enabled: bool = (os.getenv(OAUTH2_PROXY_ENABLED_ENV_VAR,
                                        'false') == 'true')
        self.proxy_base: str = ''
        if self.enabled:
            proxy_base = os.getenv(OAUTH2_PROXY_BASE_URL_ENV_VAR)
            if not proxy_base:
                raise ValueError('OAuth2 Proxy is enabled but base_url is not '
                                 'set')
            # Drop trailing slashes so URLs can be built with a single '/'.
            self.proxy_base = proxy_base.rstrip('/')

    async def dispatch(self, request: fastapi.Request, call_next):
        """Route the request: pass through, forward to proxy, or authenticate."""
        if not self.enabled:
            return await call_next(request)

        # Forward /oauth2/* to oauth2-proxy, including /oauth2/start and
        # /oauth2/callback.
        if request.url.path.startswith('/oauth2'):
            return await self.forward_to_oauth2_proxy(request)

        return await self.authenticate(request, call_next)

    async def forward_to_oauth2_proxy(self, request: fastapi.Request):
        """Forward requests to oauth2-proxy service.

        The method, headers, body, cookies and query parameters of the
        incoming request are replayed against the proxy, and the proxy's
        status, headers, body and cookies are relayed back. Redirects are
        not followed so that they reach the client.

        Returns:
            The relayed response, or a 502 JSON response if the proxy cannot
            be reached.
        """
        logger.debug(f'forwarding to oauth2-proxy: {request.url.path}')
        path = request.url.path.lstrip('/')
        target_url = f'{self.proxy_base}/{path}'
        body = await request.body()
        async with aiohttp.ClientSession() as session:
            try:
                forwarded_headers = dict(request.headers)
                async with session.request(
                        method=request.method,
                        url=target_url,
                        headers=forwarded_headers,
                        data=body,
                        cookies=request.cookies,
                        params=request.query_params,
                        allow_redirects=False,
                ) as response:
                    response_body = await response.read()
                    # NOTE(review): the proxy's headers are relayed verbatim;
                    # confirm this is still correct if the proxy ever sends
                    # a compressed or chunked response.
                    fastapi_response = fastapi.responses.Response(
                        content=response_body,
                        status_code=response.status,
                        headers=dict(response.headers),
                    )
                    # Forward cookies from OAuth2 proxy response to client
                    for cookie_name, cookie in response.cookies.items():
                        fastapi_response.set_cookie(
                            key=cookie_name,
                            value=cookie.value,
                            max_age=cookie.get('max-age'),
                            expires=cookie.get('expires'),
                            path=cookie.get('path', '/'),
                            domain=cookie.get('domain'),
                            secure=cookie.get('secure', False),
                            httponly=cookie.get('httponly', False),
                        )
                    return fastapi_response
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                logger.error(f'Error forwarding to OAuth2 proxy: {e}')
                return fastapi.responses.JSONResponse(
                    status_code=http.HTTPStatus.BAD_GATEWAY,
                    content={'detail': 'oauth2-proxy service unavailable'})

    async def authenticate(self, request: fastapi.Request, call_next):
        """Authenticate the request unless it is already trusted.

        Requests that already carry `request.state.auth_user`, and loopback
        requests in consolidation mode, are passed through. Everything else
        is checked against oauth2-proxy.

        Returns:
            The downstream response, a redirect or error produced by
            `_authenticate`, or a 502 JSON response if the proxy cannot be
            reached.
        """
        if request.state.auth_user is not None:
            # Already authenticated
            return await call_next(request)

        if managed_job_utils.is_consolidation_mode(
        ) and loopback.is_loopback_request(request):
            return await call_next(request)

        async with aiohttp.ClientSession() as session:
            try:
                return await self._authenticate(request, call_next, session)
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                logger.error(f'Error communicating with OAuth2 proxy: {e}'
                             f'{traceback.format_exc()}')
                return fastapi.responses.JSONResponse(
                    status_code=http.HTTPStatus.BAD_GATEWAY,
                    content={'detail': 'oauth2-proxy service unavailable'})

    async def _authenticate(self, request: fastapi.Request, call_next,
                            session: aiohttp.ClientSession):
        """Ask oauth2-proxy's `/oauth2/auth` endpoint to vet the request.

        A 202 response is treated as authenticated: the user is recorded,
        attached to `request.state.auth_user`, injected into the request
        body, and the request continues. A 401 response redirects to the
        sign-in flow, except for `/api/health`, which is allowed through as
        anonymous. Any other status is relayed as a JSON error.
        """
        forwarded_headers = {}
        auth_url = f'{self.proxy_base}/oauth2/auth'
        forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
        forwarded_headers['Host'] = request.url.hostname
        logger.debug(f'authenticate request: {auth_url}, '
                     f'headers: {forwarded_headers}')

        async with session.request(
                method='GET',
                url=auth_url,
                headers=forwarded_headers,
                cookies=request.cookies,
                timeout=aiohttp.ClientTimeout(total=10),
                allow_redirects=False,
        ) as auth_response:

            if auth_response.status == http.HTTPStatus.ACCEPTED:
                # User is authenticated, extract user info from headers
                auth_user = self.get_auth_user(auth_response)
                if not auth_user:
                    return fastapi.responses.JSONResponse(
                        status_code=http.HTTPStatus.INTERNAL_SERVER_ERROR,
                        content={
                            'detail':
                                'oauth2-proxy is enabled but did not '
                                'return user info, check your oauth2-proxy '
                                'setup.'
                        })
                newly_added = global_user_state.add_or_update_user(auth_user)
                if newly_added:
                    permission.permission_service.add_user_if_not_exists(
                        auth_user.id)
                request.state.auth_user = auth_user
                await authn.override_user_info_in_request_body(
                    request, auth_user)
                return await call_next(request)
            elif auth_response.status == http.HTTPStatus.UNAUTHORIZED:
                # For /api/health, we should allow unauthenticated requests to
                # not break healthz check.
                # TODO(aylei): remove this to an aggregated login middleware
                # in favor of the unified authentication.
                if request.url.path.startswith('/api/health'):
                    request.state.anonymous_user = True
                    return await call_next(request)

                # TODO(aylei): in unified authentication, the redirection
                # or rejection should be done after all the authentication
                # methods are performed.
                # Not authenticated, redirect to sign-in
                redirect_path = request.url.path
                if request.url.query:
                    redirect_path += f'?{request.url.query}'
                rd = urllib.parse.quote(redirect_path)
                signin_url = (f'{request.base_url}oauth2/start?'
                              f'rd={rd}')
                return fastapi.responses.RedirectResponse(url=signin_url)
            else:
                # `text()` is a coroutine method: it must be awaited to get
                # the body. Undecodable bytes are replaced so that logging
                # the error cannot itself fail on decoding.
                error_body = await auth_response.text(errors='replace')
                logger.error('oauth2-proxy returned unexpected status '
                             f'{auth_response.status}: {error_body}')
                return fastapi.responses.JSONResponse(
                    status_code=auth_response.status,
                    content={'detail': 'oauth2-proxy error'})

    def get_auth_user(
            self, response: aiohttp.ClientResponse) -> Optional[models.User]:
        """Extract user info from OAuth2 proxy response headers.

        Returns:
            A user whose name is the `X-Auth-Request-Email` header value and
            whose id is the MD5 hex digest of that value truncated to
            `common_utils.USER_HASH_LENGTH`, or None if the header is
            missing or empty.
        """
        email_header = response.headers.get('X-Auth-Request-Email')
        if email_header:
            user_hash = hashlib.md5(email_header.encode()).hexdigest(
            )[:common_utils.USER_HASH_LENGTH]
            return models.User(id=user_hash, name=email_header)
        return None