skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/server/server.py CHANGED
@@ -1,20 +1,20 @@
1
1
  """REST API for managed jobs."""
2
- import os
2
+
3
+ import pathlib
3
4
 
4
5
  import fastapi
5
- import httpx
6
6
 
7
7
  from sky import sky_logging
8
+ from sky.jobs import utils as managed_jobs_utils
8
9
  from sky.jobs.server import core
9
- from sky.jobs.server import dashboard_utils
10
10
  from sky.server import common as server_common
11
11
  from sky.server import stream_utils
12
12
  from sky.server.requests import executor
13
13
  from sky.server.requests import payloads
14
+ from sky.server.requests import request_names
14
15
  from sky.server.requests import requests as api_requests
15
16
  from sky.skylet import constants
16
17
  from sky.utils import common
17
- from sky.utils import common_utils
18
18
 
19
19
  logger = sky_logging.init_logger(__name__)
20
20
 
@@ -24,22 +24,36 @@ router = fastapi.APIRouter()
24
24
  @router.post('/launch')
25
25
  async def launch(request: fastapi.Request,
26
26
  jobs_launch_body: payloads.JobsLaunchBody) -> None:
27
- executor.schedule_request(
27
+ # In consolidation mode, the jobs controller will use sky.launch on the same
28
+ # API server to launch the underlying job cluster. If you start running many
29
+ # jobs.launch requests, some may be blocked for a long time by sky.launch
30
+ # requests triggered by earlier jobs, which leads to confusing behavior as
31
+ # the jobs.launch requests trickle through. Also, since we don't have to
32
+ # actually launch a jobs controller sky cluster, the jobs.launch request is
33
+ # much quicker in consolidation mode. So we avoid the issue by just using
34
+ # the short executor instead - then jobs.launch will not be blocked by
35
+ # sky.launch.
36
+ consolidation_mode = managed_jobs_utils.is_consolidation_mode()
37
+ schedule_type = (api_requests.ScheduleType.SHORT
38
+ if consolidation_mode else api_requests.ScheduleType.LONG)
39
+ await executor.schedule_request_async(
28
40
  request_id=request.state.request_id,
29
- request_name='jobs.launch',
41
+ request_name=request_names.RequestName.JOBS_LAUNCH,
30
42
  request_body=jobs_launch_body,
31
43
  func=core.launch,
32
- schedule_type=api_requests.ScheduleType.LONG,
44
+ schedule_type=schedule_type,
33
45
  request_cluster_name=common.JOB_CONTROLLER_NAME,
34
46
  )
35
47
 
36
48
 
49
+ # For backwards compatibility
50
+ # TODO(hailong): Remove before 0.12.0.
37
51
  @router.post('/queue')
38
52
  async def queue(request: fastapi.Request,
39
53
  jobs_queue_body: payloads.JobsQueueBody) -> None:
40
- executor.schedule_request(
54
+ await executor.schedule_request_async(
41
55
  request_id=request.state.request_id,
42
- request_name='jobs.queue',
56
+ request_name=request_names.RequestName.JOBS_QUEUE,
43
57
  request_body=jobs_queue_body,
44
58
  func=core.queue,
45
59
  schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
@@ -48,12 +62,27 @@ async def queue(request: fastapi.Request,
48
62
  )
49
63
 
50
64
 
65
+ @router.post('/queue/v2')
66
+ async def queue_v2(request: fastapi.Request,
67
+ jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
68
+ await executor.schedule_request_async(
69
+ request_id=request.state.request_id,
70
+ request_name=request_names.RequestName.JOBS_QUEUE_V2,
71
+ request_body=jobs_queue_body_v2,
72
+ func=core.queue_v2_api,
73
+ schedule_type=(api_requests.ScheduleType.LONG
74
+ if jobs_queue_body_v2.refresh else
75
+ api_requests.ScheduleType.SHORT),
76
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
77
+ )
78
+
79
+
51
80
  @router.post('/cancel')
52
81
  async def cancel(request: fastapi.Request,
53
82
  jobs_cancel_body: payloads.JobsCancelBody) -> None:
54
- executor.schedule_request(
83
+ await executor.schedule_request_async(
55
84
  request_id=request.state.request_id,
56
- request_name='jobs.cancel',
85
+ request_name=request_names.RequestName.JOBS_CANCEL,
57
86
  request_body=jobs_cancel_body,
58
87
  func=core.cancel,
59
88
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -66,27 +95,39 @@ async def logs(
66
95
  request: fastapi.Request, jobs_logs_body: payloads.JobsLogsBody,
67
96
  background_tasks: fastapi.BackgroundTasks
68
97
  ) -> fastapi.responses.StreamingResponse:
69
- executor.schedule_request(
98
+ schedule_type = api_requests.ScheduleType.SHORT
99
+ if jobs_logs_body.refresh:
100
+ # When refresh is specified, the job controller might be restarted,
101
+ # which takes longer time to finish. We schedule it to long executor.
102
+ schedule_type = api_requests.ScheduleType.LONG
103
+ if schedule_type == api_requests.ScheduleType.SHORT:
104
+ executor.check_request_thread_executor_available()
105
+ request_task = await executor.prepare_request_async(
70
106
  request_id=request.state.request_id,
71
- request_name='jobs.logs',
107
+ request_name=request_names.RequestName.JOBS_LOGS,
72
108
  request_body=jobs_logs_body,
73
109
  func=core.tail_logs,
74
- # TODO(aylei): We have tail logs scheduled as SHORT request, because it
75
- # should be responsive. However, it can be long running if the user's
76
- # job keeps running, and we should avoid it taking the SHORT worker
77
- # indefinitely.
78
- # When refresh is True we schedule it as LONG because a controller
79
- # restart might be needed.
80
- schedule_type=api_requests.ScheduleType.LONG
81
- if jobs_logs_body.refresh else api_requests.ScheduleType.SHORT,
110
+ schedule_type=schedule_type,
82
111
  request_cluster_name=common.JOB_CONTROLLER_NAME,
83
112
  )
84
- request_task = api_requests.get_request(request.state.request_id)
85
-
86
- return stream_utils.stream_response(
113
+ kill_request_on_disconnect = False
114
+ if schedule_type == api_requests.ScheduleType.SHORT:
115
+ # For short request, run in the coroutine to avoid blocking
116
+ # short workers.
117
+ task = executor.execute_request_in_coroutine(request_task)
118
+ # Cancel the coroutine after the request is done or client disconnects
119
+ background_tasks.add_task(task.cancel)
120
+ else:
121
+ executor.schedule_prepared_request(request_task)
122
+ # When runs in long executor process, we should kill the request on
123
+ # disconnect to cancel the running routine.
124
+ kill_request_on_disconnect = True
125
+
126
+ return stream_utils.stream_response_for_long_request(
87
127
  request_id=request_task.request_id,
88
128
  logs_path=request_task.log_path,
89
129
  background_tasks=background_tasks,
130
+ kill_request_on_disconnect=kill_request_on_disconnect,
90
131
  )
91
132
 
92
133
 
@@ -101,9 +142,9 @@ async def download_logs(
101
142
  # We should reuse the original request body, so that the env vars, such as
102
143
  # user hash, are kept the same.
103
144
  jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
104
- executor.schedule_request(
145
+ await executor.schedule_request_async(
105
146
  request_id=request.state.request_id,
106
- request_name='jobs.download_logs',
147
+ request_name=request_names.RequestName.JOBS_DOWNLOAD_LOGS,
107
148
  request_body=jobs_download_logs_body,
108
149
  func=core.download_logs,
109
150
  schedule_type=api_requests.ScheduleType.LONG
@@ -112,92 +153,92 @@ async def download_logs(
112
153
  )
113
154
 
114
155
 
115
- @router.get('/dashboard')
116
- async def dashboard(request: fastapi.Request,
117
- user_hash: str) -> fastapi.Response:
118
- # TODO(cooperc): Support showing only jobs for a specific user.
119
-
120
- # FIX(zhwu/cooperc/eric): Fix log downloading (assumes global
121
- # /download_log/xx route)
122
-
123
- # Note: before #4717, each user had their own controller, and thus their own
124
- # dashboard. Now, all users share the same controller, so this isn't really
125
- # necessary. TODO(cooperc): clean up.
126
-
127
- # TODO: Put this in an executor to avoid blocking the main server thread.
128
- # It can take a long time if it needs to check the controller status.
129
-
130
- # Find the port for the dashboard of the user
131
- os.environ[constants.USER_ID_ENV_VAR] = user_hash
132
- server_common.reload_for_new_request(client_entrypoint=None,
133
- client_command=None,
134
- using_remote_api_server=False)
135
- logger.info(f'Starting dashboard for user hash: {user_hash}')
136
-
137
- with dashboard_utils.get_dashboard_lock_for_user(user_hash):
138
- max_retries = 3
139
- for attempt in range(max_retries):
140
- port, pid = dashboard_utils.get_dashboard_session(user_hash)
141
- if port == 0 or attempt > 0:
142
- # Let the client know that we are waiting for starting the
143
- # dashboard.
144
- try:
145
- port, pid = core.start_dashboard_forwarding()
146
- except Exception as e: # pylint: disable=broad-except
147
- # We catch all exceptions to gracefully handle unknown
148
- # errors and raise an HTTPException to the client.
149
- msg = (
150
- 'Dashboard failed to start: '
151
- f'{common_utils.format_exception(e, use_bracket=True)}')
152
- logger.error(msg)
153
- raise fastapi.HTTPException(status_code=503, detail=msg)
154
- dashboard_utils.add_dashboard_session(user_hash, port, pid)
155
-
156
- # Assuming the dashboard is forwarded to localhost on the API server
157
- dashboard_url = f'http://localhost:{port}'
158
- try:
159
- # Ping the dashboard to check if it's still running
160
- async with httpx.AsyncClient() as client:
161
- response = await client.request('GET',
162
- dashboard_url,
163
- timeout=5)
164
- if response.is_success:
165
- break # Connection successful, proceed with the request
166
- # Raise an HTTPException here which will be caught by the
167
- # following except block to retry with new connection
168
- response.raise_for_status()
169
- except Exception as e: # pylint: disable=broad-except
170
- # We catch all exceptions to gracefully handle unknown
171
- # errors and retry or raise an HTTPException to the client.
172
- # Assume an exception indicates that the dashboard connection
173
- # is stale - remove it so that a new one is created.
174
- dashboard_utils.remove_dashboard_session(user_hash)
175
- msg = (
176
- f'Dashboard connection attempt {attempt + 1} failed with '
177
- f'{common_utils.format_exception(e, use_bracket=True)}')
178
- logger.info(msg)
179
- if attempt == max_retries - 1:
180
- raise fastapi.HTTPException(status_code=503, detail=msg)
181
-
182
- # Create a client session to forward the request
183
- try:
184
- async with httpx.AsyncClient() as client:
185
- # Make the request and get the response
186
- response = await client.request(
187
- method='GET',
188
- url=f'{dashboard_url}',
189
- headers=request.headers.raw,
190
- )
191
-
192
- # Create a new response with the content already read
193
- content = await response.aread()
194
- return fastapi.Response(
195
- content=content,
196
- status_code=response.status_code,
197
- headers=dict(response.headers),
198
- media_type=response.headers.get('content-type'))
199
- except Exception as e:
200
- msg = (f'Failed to forward request to dashboard: '
201
- f'{common_utils.format_exception(e, use_bracket=True)}')
202
- logger.error(msg)
203
- raise fastapi.HTTPException(status_code=502, detail=msg)
156
+ @router.post('/pool_apply')
157
+ async def pool_apply(request: fastapi.Request,
158
+ jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
159
+ await executor.schedule_request_async(
160
+ request_id=request.state.request_id,
161
+ request_name=request_names.RequestName.JOBS_POOL_APPLY,
162
+ request_body=jobs_pool_apply_body,
163
+ func=core.pool_apply,
164
+ schedule_type=api_requests.ScheduleType.LONG,
165
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
166
+ )
167
+
168
+
169
+ @router.post('/pool_down')
170
+ async def pool_down(request: fastapi.Request,
171
+ jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
172
+ await executor.schedule_request_async(
173
+ request_id=request.state.request_id,
174
+ request_name=request_names.RequestName.JOBS_POOL_DOWN,
175
+ request_body=jobs_pool_down_body,
176
+ func=core.pool_down,
177
+ schedule_type=api_requests.ScheduleType.SHORT,
178
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
179
+ )
180
+
181
+
182
+ @router.post('/pool_status')
183
+ async def pool_status(
184
+ request: fastapi.Request,
185
+ jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
186
+ await executor.schedule_request_async(
187
+ request_id=request.state.request_id,
188
+ request_name=request_names.RequestName.JOBS_POOL_STATUS,
189
+ request_body=jobs_pool_status_body,
190
+ func=core.pool_status,
191
+ schedule_type=api_requests.ScheduleType.SHORT,
192
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
193
+ )
194
+
195
+
196
+ @router.post('/pool_logs')
197
+ async def pool_tail_logs(
198
+ request: fastapi.Request, log_body: payloads.JobsPoolLogsBody,
199
+ background_tasks: fastapi.BackgroundTasks
200
+ ) -> fastapi.responses.StreamingResponse:
201
+ await executor.schedule_request_async(
202
+ request_id=request.state.request_id,
203
+ request_name=request_names.RequestName.JOBS_POOL_LOGS,
204
+ request_body=log_body,
205
+ func=core.pool_tail_logs,
206
+ schedule_type=api_requests.ScheduleType.SHORT,
207
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
208
+ )
209
+
210
+ request_task = await api_requests.get_request_async(
211
+ request.state.request_id, fields=['request_id'])
212
+
213
+ return stream_utils.stream_response_for_long_request(
214
+ request_id=request_task.request_id,
215
+ # req.log_path is derived from request_id,
216
+ # so it's ok to just grab the request_id in the above query.
217
+ logs_path=request_task.log_path,
218
+ background_tasks=background_tasks,
219
+ kill_request_on_disconnect=True,
220
+ )
221
+
222
+
223
+ @router.post('/pool_sync-down-logs')
224
+ async def pool_download_logs(
225
+ request: fastapi.Request,
226
+ download_logs_body: payloads.JobsPoolDownloadLogsBody,
227
+ ) -> None:
228
+ user_hash = download_logs_body.env_vars[constants.USER_ID_ENV_VAR]
229
+ timestamp = sky_logging.get_run_timestamp()
230
+ logs_dir_on_api_server = (
231
+ pathlib.Path(server_common.api_server_user_logs_dir_prefix(user_hash)) /
232
+ 'pool' / f'{download_logs_body.pool_name}_{timestamp}')
233
+ logs_dir_on_api_server.mkdir(parents=True, exist_ok=True)
234
+ # We should reuse the original request body, so that the env vars, such as
235
+ # user hash, are kept the same.
236
+ download_logs_body.local_dir = str(logs_dir_on_api_server)
237
+ await executor.schedule_request_async(
238
+ request_id=request.state.request_id,
239
+ request_name=request_names.RequestName.JOBS_POOL_SYNC_DOWN_LOGS,
240
+ request_body=download_logs_body,
241
+ func=core.pool_sync_down_logs,
242
+ schedule_type=api_requests.ScheduleType.SHORT,
243
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
244
+ )
@@ -0,0 +1,136 @@
1
+ """Utility functions for managed jobs."""
2
+ import typing
3
+
4
+ from sky import backends
5
+ from sky import exceptions
6
+ from sky import sky_logging
7
+ from sky.adaptors import common as adaptors_common
8
+ from sky.backends import backend_utils
9
+ from sky.backends import cloud_vm_ray_backend
10
+ from sky.jobs import utils as managed_job_utils
11
+ from sky.skylet import constants as skylet_constants
12
+ from sky.utils import controller_utils
13
+
14
+ logger = sky_logging.init_logger(__name__)
15
+
16
+ if typing.TYPE_CHECKING:
17
+ from sky.schemas.generated import managed_jobsv1_pb2
18
+ else:
19
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
20
+ 'sky.schemas.generated.managed_jobsv1_pb2')
21
+
22
+ _MANAGED_JOB_FIELDS_TO_GET = [
23
+ 'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
24
+ 'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
25
+ ]
26
+
27
+
28
+ def check_version_mismatch_and_non_terminal_jobs() -> None:
29
+ """Check if controller has version mismatch and non-terminal jobs exist.
30
+ Raises:
31
+ ValueError: If there's a version mismatch and non-terminal jobs exist.
32
+ sky.exceptions.ClusterNotUpError: If the controller is not accessible.
33
+ """
34
+ # Get the current local SKYLET_VERSION
35
+ local_version = skylet_constants.SKYLET_VERSION
36
+
37
+ # Get controller handle (works the same in both normal and
38
+ # consolidation mode)
39
+ jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
40
+ handle = backend_utils.is_controller_accessible(
41
+ controller=jobs_controller_type,
42
+ stopped_message='Jobs controller is not running.')
43
+
44
+ backend = backend_utils.get_backend_from_handle(handle)
45
+ assert isinstance(backend, backends.CloudVmRayBackend)
46
+
47
+ use_legacy = not handle.is_grpc_enabled_with_flag
48
+
49
+ if not use_legacy:
50
+ try:
51
+ version_request = managed_jobsv1_pb2.GetVersionRequest()
52
+ version_response = backend_utils.invoke_skylet_with_retries(
53
+ lambda: cloud_vm_ray_backend.SkyletClient(
54
+ handle.get_grpc_channel(
55
+ )).get_managed_job_controller_version(version_request))
56
+ controller_version = version_response.controller_version
57
+
58
+ job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
59
+ skip_finished=True,
60
+ fields=managed_jobsv1_pb2.Fields(
61
+ fields=_MANAGED_JOB_FIELDS_TO_GET),
62
+ )
63
+ job_table_response = backend_utils.invoke_skylet_with_retries(
64
+ lambda: cloud_vm_ray_backend.SkyletClient(
65
+ handle.get_grpc_channel()).get_managed_job_table(
66
+ job_table_request))
67
+ jobs = managed_job_utils.decode_managed_job_protos(
68
+ job_table_response.jobs)
69
+ except exceptions.SkyletMethodNotImplementedError:
70
+ use_legacy = True
71
+
72
+ if use_legacy:
73
+ # Get controller version and raw job table
74
+ code = managed_job_utils.ManagedJobCodeGen.get_version()
75
+
76
+ returncode, output, stderr = backend.run_on_head(handle,
77
+ code,
78
+ require_outputs=True,
79
+ stream_logs=False,
80
+ separate_stderr=True)
81
+
82
+ if returncode != 0:
83
+ logger.error(output + stderr)
84
+ raise ValueError('Failed to check controller version with '
85
+ f'returncode: {returncode}.\n{output + stderr}')
86
+
87
+ # Parse the output to extract controller version (split only on first
88
+ # newline)
89
+ output_parts = output.strip().split('\n', 1)
90
+
91
+ # Extract controller version from first line
92
+ if not output_parts[0].startswith('controller_version:'):
93
+ raise ValueError(
94
+ f'Expected controller version in first line, got: {output}')
95
+
96
+ controller_version = output_parts[0].split(':', 1)[1]
97
+
98
+ code = managed_job_utils.ManagedJobCodeGen.get_job_table(
99
+ skip_finished=True, fields=_MANAGED_JOB_FIELDS_TO_GET)
100
+ returncode, job_table_payload, stderr = backend.run_on_head(
101
+ handle,
102
+ code,
103
+ require_outputs=True,
104
+ stream_logs=False,
105
+ separate_stderr=True)
106
+
107
+ if returncode != 0:
108
+ logger.error(job_table_payload + stderr)
109
+ raise ValueError('Failed to fetch managed jobs with returncode: '
110
+ f'{returncode}.\n{job_table_payload + stderr}')
111
+
112
+ jobs, _, _, _, _ = (
113
+ managed_job_utils.load_managed_job_queue(job_table_payload))
114
+
115
+ # Process locally: check version match and filter non-terminal jobs
116
+ version_matches = (controller_version == local_version or
117
+ int(controller_version) > 17)
118
+ non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
119
+ has_non_terminal_jobs = len(non_terminal_jobs) > 0
120
+
121
+ if not version_matches and has_non_terminal_jobs:
122
+ # Format job table locally using the same method as queue()
123
+ formatted_job_table = managed_job_utils.format_job_table(
124
+ non_terminal_jobs,
125
+ pool_status=None,
126
+ show_all=False,
127
+ show_user=False)
128
+
129
+ error_msg = (
130
+ f'Controller SKYLET_VERSION ({controller_version}) does not match '
131
+ f'current version ({local_version}), and there are non-terminal '
132
+ 'jobs on the controller. Please wait for all jobs to complete or '
133
+ 'cancel them before launching new jobs with the updated version.'
134
+ f'\n\nCurrent non-terminal jobs:\n{formatted_job_table}')
135
+
136
+ raise ValueError(error_msg)