skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -10,6 +10,7 @@ from sky.server import common as server_common
10
10
  from sky.server import stream_utils
11
11
  from sky.server.requests import executor
12
12
  from sky.server.requests import payloads
13
+ from sky.server.requests import request_names
13
14
  from sky.server.requests import requests as api_requests
14
15
  from sky.skylet import constants
15
16
  from sky.utils import common
@@ -23,9 +24,9 @@ async def up(
23
24
  request: fastapi.Request,
24
25
  up_body: payloads.ServeUpBody,
25
26
  ) -> None:
26
- executor.schedule_request(
27
+ await executor.schedule_request_async(
27
28
  request_id=request.state.request_id,
28
- request_name='serve.up',
29
+ request_name=request_names.RequestName.SERVE_UP,
29
30
  request_body=up_body,
30
31
  func=core.up,
31
32
  schedule_type=api_requests.ScheduleType.LONG,
@@ -38,9 +39,9 @@ async def update(
38
39
  request: fastapi.Request,
39
40
  update_body: payloads.ServeUpdateBody,
40
41
  ) -> None:
41
- executor.schedule_request(
42
+ await executor.schedule_request_async(
42
43
  request_id=request.state.request_id,
43
- request_name='serve.update',
44
+ request_name=request_names.RequestName.SERVE_UPDATE,
44
45
  request_body=update_body,
45
46
  func=core.update,
46
47
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -53,9 +54,9 @@ async def down(
53
54
  request: fastapi.Request,
54
55
  down_body: payloads.ServeDownBody,
55
56
  ) -> None:
56
- executor.schedule_request(
57
+ await executor.schedule_request_async(
57
58
  request_id=request.state.request_id,
58
- request_name='serve.down',
59
+ request_name=request_names.RequestName.SERVE_DOWN,
59
60
  request_body=down_body,
60
61
  func=core.down,
61
62
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -68,9 +69,9 @@ async def terminate_replica(
68
69
  request: fastapi.Request,
69
70
  terminate_replica_body: payloads.ServeTerminateReplicaBody,
70
71
  ) -> None:
71
- executor.schedule_request(
72
+ await executor.schedule_request_async(
72
73
  request_id=request.state.request_id,
73
- request_name='serve.terminate_replica',
74
+ request_name=request_names.RequestName.SERVE_TERMINATE_REPLICA,
74
75
  request_body=terminate_replica_body,
75
76
  func=core.terminate_replica,
76
77
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -83,9 +84,9 @@ async def status(
83
84
  request: fastapi.Request,
84
85
  status_body: payloads.ServeStatusBody,
85
86
  ) -> None:
86
- executor.schedule_request(
87
+ await executor.schedule_request_async(
87
88
  request_id=request.state.request_id,
88
- request_name='serve.status',
89
+ request_name=request_names.RequestName.SERVE_STATUS,
89
90
  request_body=status_body,
90
91
  func=core.status,
91
92
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -98,22 +99,23 @@ async def tail_logs(
98
99
  request: fastapi.Request, log_body: payloads.ServeLogsBody,
99
100
  background_tasks: fastapi.BackgroundTasks
100
101
  ) -> fastapi.responses.StreamingResponse:
101
- executor.schedule_request(
102
+ executor.check_request_thread_executor_available()
103
+ request_task = await executor.prepare_request_async(
102
104
  request_id=request.state.request_id,
103
- request_name='serve.logs',
105
+ request_name=request_names.RequestName.SERVE_LOGS,
104
106
  request_body=log_body,
105
107
  func=core.tail_logs,
106
108
  schedule_type=api_requests.ScheduleType.SHORT,
107
109
  request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
108
110
  )
109
-
110
- request_task = await api_requests.get_request_async(request.state.request_id
111
- )
112
-
113
- return stream_utils.stream_response(
111
+ task = executor.execute_request_in_coroutine(request_task)
112
+ # Cancel the coroutine after the request is done or client disconnects
113
+ background_tasks.add_task(task.cancel)
114
+ return stream_utils.stream_response_for_long_request(
114
115
  request_id=request_task.request_id,
115
116
  logs_path=request_task.log_path,
116
117
  background_tasks=background_tasks,
118
+ kill_request_on_disconnect=False,
117
119
  )
118
120
 
119
121
 
@@ -131,9 +133,9 @@ async def download_logs(
131
133
  # We should reuse the original request body, so that the env vars, such as
132
134
  # user hash, are kept the same.
133
135
  download_logs_body.local_dir = str(logs_dir_on_api_server)
134
- executor.schedule_request(
136
+ await executor.schedule_request_async(
135
137
  request_id=request.state.request_id,
136
- request_name='serve.sync_down_logs',
138
+ request_name=request_names.RequestName.SERVE_SYNC_DOWN_LOGS,
137
139
  request_body=download_logs_body,
138
140
  func=core.sync_down_logs,
139
141
  schedule_type=api_requests.ScheduleType.SHORT,
sky/serve/service.py CHANGED
@@ -13,7 +13,6 @@ from typing import Dict
13
13
 
14
14
  import filelock
15
15
 
16
- from sky import authentication
17
16
  from sky import exceptions
18
17
  from sky import global_user_state
19
18
  from sky import sky_logging
@@ -21,7 +20,6 @@ from sky import task as task_lib
21
20
  from sky.backends import backend_utils
22
21
  from sky.backends import cloud_vm_ray_backend
23
22
  from sky.data import data_utils
24
- from sky.jobs import scheduler as jobs_scheduler
25
23
  from sky.serve import constants
26
24
  from sky.serve import controller
27
25
  from sky.serve import load_balancer
@@ -29,9 +27,11 @@ from sky.serve import replica_managers
29
27
  from sky.serve import serve_state
30
28
  from sky.serve import serve_utils
31
29
  from sky.skylet import constants as skylet_constants
30
+ from sky.utils import auth_utils
32
31
  from sky.utils import common_utils
33
32
  from sky.utils import controller_utils
34
33
  from sky.utils import subprocess_utils
34
+ from sky.utils import thread_utils
35
35
  from sky.utils import ux_utils
36
36
 
37
37
  # Use the explicit logger name so that the logger is under the
@@ -66,11 +66,11 @@ def _handle_signal(service_name: str) -> None:
66
66
  raise error_type(f'User signal received: {user_signal.value}')
67
67
 
68
68
 
69
- def cleanup_storage(task_yaml: str) -> bool:
69
+ def cleanup_storage(yaml_content: str) -> bool:
70
70
  """Clean up the storage for the service.
71
71
 
72
72
  Args:
73
- task_yaml: The task yaml file.
73
+ yaml_content: The yaml content of the service.
74
74
 
75
75
  Returns:
76
76
  True if the storage is cleaned up successfully, False otherwise.
@@ -78,7 +78,7 @@ def cleanup_storage(task_yaml: str) -> bool:
78
78
  failed = False
79
79
 
80
80
  try:
81
- task = task_lib.Task.from_yaml(task_yaml)
81
+ task = task_lib.Task.from_yaml_str(yaml_content)
82
82
  backend = cloud_vm_ray_backend.CloudVmRayBackend()
83
83
  # Need to re-construct storage object in the controller process
84
84
  # because when SkyPilot API server machine sends the yaml config to the
@@ -116,7 +116,7 @@ def cleanup_storage(task_yaml: str) -> bool:
116
116
  # NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
117
117
  # because we killed all the processes (controller & replica manager) before
118
118
  # calling this function.
119
- def _cleanup(service_name: str) -> bool:
119
+ def _cleanup(service_name: str, pool: bool) -> bool:
120
120
  """Clean up all service related resources, i.e. replicas and storage."""
121
121
  # Cleanup the HA recovery script first as it is possible that some error
122
122
  # was raised when we construct the task object (e.g.,
@@ -124,8 +124,8 @@ def _cleanup(service_name: str) -> bool:
124
124
  serve_state.remove_ha_recovery_script(service_name)
125
125
  failed = False
126
126
  replica_infos = serve_state.get_replica_infos(service_name)
127
- info2proc: Dict[replica_managers.ReplicaInfo,
128
- multiprocessing.Process] = dict()
127
+ info2thr: Dict[replica_managers.ReplicaInfo,
128
+ thread_utils.SafeThread] = dict()
129
129
  # NOTE(dev): This relies on `sky/serve/serve_utils.py::
130
130
  # generate_replica_cluster_name`. Change it if you change the function.
131
131
  existing_cluster_names = global_user_state.get_cluster_names_start_with(
@@ -136,9 +136,12 @@ def _cleanup(service_name: str) -> bool:
136
136
  f'{info.replica_id} not found. Might be a failed '
137
137
  'cluster. Skipping.')
138
138
  continue
139
- p = multiprocessing.Process(target=replica_managers.terminate_cluster,
140
- args=(info.cluster_name,))
141
- info2proc[info] = p
139
+
140
+ log_file_name = serve_utils.generate_replica_log_file_name(
141
+ service_name, info.replica_id)
142
+ t = thread_utils.SafeThread(target=replica_managers.terminate_cluster,
143
+ args=(info.cluster_name, log_file_name))
144
+ info2thr[info] = t
142
145
  # Set replica status to `SHUTTING_DOWN`
143
146
  info.status_property.sky_launch_status = (
144
147
  replica_managers.common_utils.ProcessStatus.SUCCEEDED)
@@ -158,32 +161,32 @@ def _cleanup(service_name: str) -> bool:
158
161
 
159
162
  # Please reference to sky/serve/replica_managers.py::_refresh_process_pool.
160
163
  # TODO(tian): Refactor to use the same logic and code.
161
- while info2proc:
162
- snapshot = list(info2proc.items())
163
- for info, p in snapshot:
164
- if p.is_alive():
164
+ while info2thr:
165
+ snapshot = list(info2thr.items())
166
+ for info, t in snapshot:
167
+ if t.is_alive():
165
168
  continue
166
169
  if (info.status_property.sky_down_status ==
167
170
  replica_managers.common_utils.ProcessStatus.SCHEDULED):
168
- if controller_utils.can_terminate():
171
+ if controller_utils.can_terminate(pool):
169
172
  try:
170
- p.start()
173
+ t.start()
171
174
  except Exception as e: # pylint: disable=broad-except
172
175
  _set_to_failed_cleanup(info)
173
- logger.error(f'Failed to start process for replica '
176
+ logger.error(f'Failed to start thread for replica '
174
177
  f'{info.replica_id}: {e}')
175
- del info2proc[info]
178
+ del info2thr[info]
176
179
  else:
177
180
  info.status_property.sky_down_status = (
178
181
  common_utils.ProcessStatus.RUNNING)
179
182
  serve_state.add_or_update_replica(
180
183
  service_name, info.replica_id, info)
181
184
  else:
182
- logger.info('Terminate process for replica '
185
+ logger.info('Terminate thread for replica '
183
186
  f'{info.replica_id} finished.')
184
- p.join()
185
- del info2proc[info]
186
- if p.exitcode == 0:
187
+ t.join()
188
+ del info2thr[info]
189
+ if t.format_exc is None:
187
190
  serve_state.remove_replica(service_name, info.replica_id)
188
191
  logger.info(
189
192
  f'Replica {info.replica_id} terminated successfully.')
@@ -191,19 +194,23 @@ def _cleanup(service_name: str) -> bool:
191
194
  _set_to_failed_cleanup(info)
192
195
  time.sleep(3)
193
196
 
194
- versions = serve_state.get_service_versions(service_name)
195
- serve_state.remove_service_versions(service_name)
196
-
197
197
  def cleanup_version_storage(version: int) -> bool:
198
- task_yaml: str = serve_utils.generate_task_yaml_file_name(
199
- service_name, version)
198
+ yaml_content = serve_state.get_yaml_content(service_name, version)
199
+ if yaml_content is None:
200
+ logger.warning(f'No yaml content found for version {version}')
201
+ return True
200
202
  logger.info(f'Cleaning up storage for version {version}, '
201
- f'task_yaml: {task_yaml}')
202
- return cleanup_storage(task_yaml)
203
+ f'yaml_content: {yaml_content}')
204
+ return cleanup_storage(yaml_content)
203
205
 
206
+ versions = serve_state.get_service_versions(service_name)
204
207
  if not all(map(cleanup_version_storage, versions)):
205
208
  failed = True
206
209
 
210
+ # Cleanup version metadata after all storages are cleaned up; otherwise
211
+ # get_yaml_content will return None, as all versions are already deleted.
212
+ serve_state.delete_all_versions(service_name)
213
+
207
214
  return failed
208
215
 
209
216
 
@@ -228,41 +235,39 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
228
235
  """
229
236
  # Generate ssh key pair to avoid race condition when multiple sky.launch
230
237
  # are executed at the same time.
231
- authentication.get_or_generate_keys()
238
+ auth_utils.get_or_generate_keys()
232
239
 
233
- # Initialize database record for the service.
234
- task = task_lib.Task.from_yaml(tmp_task_yaml)
235
- # Already checked before submit to controller.
236
- assert task.service is not None, task
237
- service_spec = task.service
238
-
239
- def is_recovery_mode(service_name: str) -> bool:
240
- """Check if service exists in database to determine recovery mode.
241
- """
242
- service = serve_state.get_service_from_name(service_name)
243
- return service is not None
244
-
245
- is_recovery = is_recovery_mode(service_name)
240
+ service = serve_state.get_service_from_name(service_name)
241
+ is_recovery = service is not None
246
242
  logger.info(f'It is a {"first" if not is_recovery else "recovery"} run')
247
243
 
244
+ def _read_yaml_content(yaml_path: str) -> str:
245
+ with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
246
+ return f.read()
247
+
248
248
  if is_recovery:
249
- version = serve_state.get_latest_version(service_name)
250
- if version is None:
251
- raise ValueError(f'No version found for service {service_name}')
249
+ yaml_content = service['yaml_content']
250
+ # Backward compatibility for old service records that
251
+ # do not dump the yaml content to the version database.
252
+ # TODO(tian): Remove this after 2 minor releases, i.e. 0.13.0.
253
+ if yaml_content is None:
254
+ yaml_content = _read_yaml_content(tmp_task_yaml)
252
255
  else:
253
- version = constants.INITIAL_VERSION
254
- # Add initial version information to the service state.
255
- serve_state.add_or_update_version(service_name, version, service_spec)
256
+ yaml_content = _read_yaml_content(tmp_task_yaml)
257
+
258
+ # Initialize database record for the service.
259
+ task = task_lib.Task.from_yaml_str(yaml_content)
260
+ # Already checked before submit to controller.
261
+ assert task.service is not None, task
262
+ service_spec = task.service
256
263
 
257
264
  service_dir = os.path.expanduser(
258
265
  serve_utils.generate_remote_service_dir_name(service_name))
259
- service_task_yaml = serve_utils.generate_task_yaml_file_name(
260
- service_name, version)
261
266
 
262
267
  if not is_recovery:
263
268
  with filelock.FileLock(controller_utils.get_resources_lock_path()):
264
- if not controller_utils.can_start_new_process():
265
- cleanup_storage(tmp_task_yaml)
269
+ if not controller_utils.can_start_new_process(task.service.pool):
270
+ cleanup_storage(yaml_content)
266
271
  with ux_utils.print_exception_no_traceback():
267
272
  raise RuntimeError(
268
273
  constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
@@ -278,25 +283,24 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
278
283
  pool=service_spec.pool,
279
284
  controller_pid=os.getpid(),
280
285
  entrypoint=entrypoint)
281
- jobs_scheduler.maybe_schedule_next_jobs()
282
286
  # Directly throw an error here. See sky/serve/api.py::up
283
287
  # for more details.
284
288
  if not success:
285
- cleanup_storage(tmp_task_yaml)
289
+ cleanup_storage(yaml_content)
286
290
  with ux_utils.print_exception_no_traceback():
287
291
  raise ValueError(f'Service {service_name} already exists.')
288
292
 
289
293
  # Create the service working directory.
290
294
  os.makedirs(service_dir, exist_ok=True)
291
295
 
292
- # Copy the tmp task yaml file to the final task yaml file.
293
- # This is for the service name conflict case. The _execute will
294
- # sync file mounts first and then realized a name conflict. We
295
- # don't want the new file mounts to overwrite the old one, so we
296
- # sync to a tmp file first and then copy it to the final name
297
- # if there is no name conflict.
298
- shutil.copy(tmp_task_yaml, service_task_yaml)
296
+ version = constants.INITIAL_VERSION
297
+ # Add initial version information to the service state.
298
+ serve_state.add_or_update_version(service_name, version, service_spec,
299
+ yaml_content)
299
300
  else:
301
+ version = serve_state.get_latest_version(service_name)
302
+ if version is None:
303
+ raise ValueError(f'No version found for service {service_name}')
300
304
  serve_state.update_service_controller_pid(service_name, os.getpid())
301
305
 
302
306
  controller_process = None
@@ -328,8 +332,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
328
332
  controller_host = _get_controller_host()
329
333
  controller_process = multiprocessing.Process(
330
334
  target=controller.run_controller,
331
- args=(service_name, service_spec, service_task_yaml,
332
- controller_host, controller_port))
335
+ args=(service_name, service_spec, version, controller_host,
336
+ controller_port))
333
337
  controller_process.start()
334
338
 
335
339
  if not is_recovery:
@@ -350,8 +354,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
350
354
  # TODO(tian): Probably we could enable multiple ports specified in
351
355
  # service spec and we could start multiple load balancers.
352
356
  # After that, we will have a mapping from replica port to endpoint.
353
- # NOTE(tian): We don't need the load balancer for cluster pool.
354
- # Skip the load balancer process for cluster pool.
357
+ # NOTE(tian): We don't need the load balancer for pool.
358
+ # Skip the load balancer process for pool.
355
359
  if not service_spec.pool:
356
360
  load_balancer_process = multiprocessing.Process(
357
361
  target=ux_utils.RedirectOutputForProcess(
@@ -386,7 +390,19 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
386
390
  for process in process_to_kill:
387
391
  process.join()
388
392
 
389
- failed = _cleanup(service_name)
393
+ # Catch any exception here so that it does not kill the service
394
+ # monitoring process. Otherwise, the service would not only fail to
395
+ # clean up, but also could not be terminated in the future, as no
396
+ # process would handle the user signal anymore. We catch any error
397
+ # and set the service status to FAILED_CLEANUP instead.
398
+ try:
399
+ failed = _cleanup(service_name, service_spec.pool)
400
+ except Exception as e: # pylint: disable=broad-except
401
+ logger.error(f'Failed to clean up service {service_name}: {e}')
402
+ with ux_utils.enable_traceback():
403
+ logger.error(f' Traceback: {traceback.format_exc()}')
404
+ failed = True
405
+
390
406
  if failed:
391
407
  serve_state.set_service_status_and_active_versions(
392
408
  service_name, serve_state.ServiceStatus.FAILED_CLEANUP)
sky/serve/service_spec.py CHANGED
@@ -188,7 +188,7 @@ class SkyServiceSpec:
188
188
  with ux_utils.print_exception_no_traceback():
189
189
  raise ValueError('Cannot specify `replica_policy` for cluster '
190
190
  'pool. Only `workers: <num>` is supported '
191
- 'for cluster pool now.')
191
+ 'for pool now.')
192
192
 
193
193
  simplified_policy_section = config.get('replicas', None)
194
194
  workers_config = config.get('workers', None)
@@ -198,7 +198,7 @@ class SkyServiceSpec:
198
198
  ' Please use one of them.')
199
199
  if simplified_policy_section is not None and pool_config:
200
200
  with ux_utils.print_exception_no_traceback():
201
- raise ValueError('Cannot specify `replicas` for cluster pool. '
201
+ raise ValueError('Cannot specify `replicas` for pool. '
202
202
  'Please use `workers` instead.')
203
203
  if simplified_policy_section is None:
204
204
  simplified_policy_section = workers_config
@@ -266,14 +266,13 @@ class SkyServiceSpec:
266
266
  return SkyServiceSpec(**service_config)
267
267
 
268
268
  @staticmethod
269
- def from_yaml(yaml_path: str) -> 'SkyServiceSpec':
270
- with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
271
- config = yaml_utils.safe_load(f)
269
+ def from_yaml_str(yaml_str: str) -> 'SkyServiceSpec':
270
+ config = yaml_utils.safe_load(yaml_str)
272
271
 
273
272
  if isinstance(config, str):
274
273
  with ux_utils.print_exception_no_traceback():
275
274
  raise ValueError('YAML loaded as str, not as dict. '
276
- f'Is it correct? Path: {yaml_path}')
275
+ f'Is it correct? content:\n{yaml_str}')
277
276
 
278
277
  if config is None:
279
278
  config = {}
@@ -281,10 +280,16 @@ class SkyServiceSpec:
281
280
  if 'service' not in config:
282
281
  with ux_utils.print_exception_no_traceback():
283
282
  raise ValueError('Service YAML must have a "service" section. '
284
- f'Is it correct? Path: {yaml_path}')
283
+ f'Is it correct? content:\n{yaml_str}')
285
284
 
286
285
  return SkyServiceSpec.from_yaml_config(config['service'])
287
286
 
287
+ @staticmethod
288
+ def from_yaml(yaml_path: str) -> 'SkyServiceSpec':
289
+ with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
290
+ yaml_content = f.read()
291
+ return SkyServiceSpec.from_yaml_str(yaml_content)
292
+
288
293
  def to_yaml_config(self) -> Dict[str, Any]:
289
294
  config: Dict[str, Any] = {}
290
295
 
@@ -506,3 +511,36 @@ class SkyServiceSpec:
506
511
  if not hasattr(self, '_pool'):
507
512
  return False
508
513
  return bool(self._pool)
514
+
515
+ def copy(self, **override) -> 'SkyServiceSpec':
516
+ return SkyServiceSpec(
517
+ readiness_path=override.pop('readiness_path', self._readiness_path),
518
+ initial_delay_seconds=override.pop('initial_delay_seconds',
519
+ self._initial_delay_seconds),
520
+ readiness_timeout_seconds=override.pop(
521
+ 'readiness_timeout_seconds', self._readiness_timeout_seconds),
522
+ min_replicas=override.pop('min_replicas', self._min_replicas),
523
+ max_replicas=override.pop('max_replicas', self._max_replicas),
524
+ num_overprovision=override.pop('num_overprovision',
525
+ self._num_overprovision),
526
+ ports=override.pop('ports', self._ports),
527
+ target_qps_per_replica=override.pop('target_qps_per_replica',
528
+ self._target_qps_per_replica),
529
+ post_data=override.pop('post_data', self._post_data),
530
+ tls_credential=override.pop('tls_credential', self._tls_credential),
531
+ readiness_headers=override.pop('readiness_headers',
532
+ self._readiness_headers),
533
+ dynamic_ondemand_fallback=override.pop(
534
+ 'dynamic_ondemand_fallback', self._dynamic_ondemand_fallback),
535
+ base_ondemand_fallback_replicas=override.pop(
536
+ 'base_ondemand_fallback_replicas',
537
+ self._base_ondemand_fallback_replicas),
538
+ spot_placer=override.pop('spot_placer', self._spot_placer),
539
+ upscale_delay_seconds=override.pop('upscale_delay_seconds',
540
+ self._upscale_delay_seconds),
541
+ downscale_delay_seconds=override.pop('downscale_delay_seconds',
542
+ self._downscale_delay_seconds),
543
+ load_balancing_policy=override.pop('load_balancing_policy',
544
+ self._load_balancing_policy),
545
+ pool=override.pop('pool', self._pool),
546
+ )
@@ -0,0 +1,38 @@
1
+ """Shared loopback detection utilities for auth middlewares."""
2
+
3
+ import ipaddress
4
+
5
+ import fastapi
6
+
7
+ from sky import sky_logging
8
+
9
+ logger = sky_logging.init_logger(__name__)
10
+
11
+ COMMON_PROXY_HEADERS = [
12
+ 'X-Forwarded-For', 'Forwarded', 'X-Real-IP', 'X-Client-IP',
13
+ 'X-Forwarded-Host', 'X-Forwarded-Proto'
14
+ ]
15
+
16
+
17
+ def _is_loopback_ip(ip_str: str) -> bool:
18
+ """Check if an IP address is a loopback address."""
19
+ try:
20
+ ip = ipaddress.ip_address(ip_str)
21
+ return ip.is_loopback
22
+ except ValueError:
23
+ return False
24
+
25
+
26
+ def is_loopback_request(request: fastapi.Request) -> bool:
27
+ """Determine if a request is coming from localhost."""
28
+ if request.client is None:
29
+ return False
30
+
31
+ client_host = request.client.host
32
+ if client_host == 'localhost' or _is_loopback_ip(client_host):
33
+ # Additional checks: ensure no forwarding headers are present.
34
+ # If there are any, assume this traffic went through a proxy.
35
+ return not any(
36
+ request.headers.get(header) for header in COMMON_PROXY_HEADERS)
37
+
38
+ return False
@@ -15,7 +15,10 @@ import starlette.middleware.base
15
15
  from sky import global_user_state
16
16
  from sky import models
17
17
  from sky import sky_logging
18
+ from sky.jobs import utils as managed_job_utils
19
+ from sky.server import middleware_utils
18
20
  from sky.server.auth import authn
21
+ from sky.server.auth import loopback
19
22
  from sky.users import permission
20
23
  from sky.utils import common_utils
21
24
 
@@ -34,11 +37,12 @@ OAUTH2_PROXY_BASE_URL_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_BASE_URL'
34
37
  OAUTH2_PROXY_ENABLED_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_ENABLED'
35
38
 
36
39
 
40
+ @middleware_utils.websocket_aware
37
41
  class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
38
42
  """Middleware to handle authentication by delegating to OAuth2 Proxy."""
39
43
 
40
- def __init__(self, application: fastapi.FastAPI):
41
- super().__init__(application)
44
+ def __init__(self, *args, **kwargs):
45
+ super().__init__(*args, **kwargs)
42
46
  self.enabled: bool = (os.getenv(OAUTH2_PROXY_ENABLED_ENV_VAR,
43
47
  'false') == 'true')
44
48
  self.proxy_base: str = ''
@@ -108,6 +112,10 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
108
112
  # Already authenticated
109
113
  return await call_next(request)
110
114
 
115
+ if managed_job_utils.is_consolidation_mode(
116
+ ) and loopback.is_loopback_request(request):
117
+ return await call_next(request)
118
+
111
119
  async with aiohttp.ClientSession() as session:
112
120
  try:
113
121
  return await self._authenticate(request, call_next, session)
@@ -120,13 +128,10 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
120
128
 
121
129
  async def _authenticate(self, request: fastapi.Request, call_next,
122
130
  session: aiohttp.ClientSession):
123
- forwarded_headers = dict(request.headers)
131
+ forwarded_headers = {}
124
132
  auth_url = f'{self.proxy_base}/oauth2/auth'
125
133
  forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
126
- # Remove content-length and content-type headers and drop request body
127
- # to reduce the auth overhead.
128
- forwarded_headers.pop('content-length', None)
129
- forwarded_headers.pop('content-type', None)
134
+ forwarded_headers['Host'] = request.url.hostname
130
135
  logger.debug(f'authenticate request: {auth_url}, '
131
136
  f'headers: {forwarded_headers}')
132
137