skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/__init__.py CHANGED
@@ -5,7 +5,13 @@ from sky.jobs.client.sdk import cancel
5
5
  from sky.jobs.client.sdk import dashboard
6
6
  from sky.jobs.client.sdk import download_logs
7
7
  from sky.jobs.client.sdk import launch
8
+ from sky.jobs.client.sdk import pool_apply
9
+ from sky.jobs.client.sdk import pool_down
10
+ from sky.jobs.client.sdk import pool_status
11
+ from sky.jobs.client.sdk import pool_sync_down_logs
12
+ from sky.jobs.client.sdk import pool_tail_logs
8
13
  from sky.jobs.client.sdk import queue
14
+ from sky.jobs.client.sdk import queue_v2
9
15
  from sky.jobs.client.sdk import tail_logs
10
16
  from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
11
17
  from sky.jobs.constants import JOBS_CONTROLLER_LOGS_DIR
@@ -33,6 +39,7 @@ __all__ = [
33
39
  'cancel',
34
40
  'launch',
35
41
  'queue',
42
+ 'queue_v2',
36
43
  'tail_logs',
37
44
  'dashboard',
38
45
  'download_logs',
sky/jobs/client/sdk.py CHANGED
@@ -1,8 +1,7 @@
1
1
  """SDK functions for managed jobs."""
2
2
  import json
3
3
  import typing
4
- from typing import Dict, List, Optional, Union
5
- import webbrowser
4
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
6
5
 
7
6
  import click
8
7
 
@@ -10,34 +9,47 @@ from sky import sky_logging
10
9
  from sky.adaptors import common as adaptors_common
11
10
  from sky.client import common as client_common
12
11
  from sky.client import sdk
12
+ from sky.schemas.api import responses
13
+ from sky.serve.client import impl
13
14
  from sky.server import common as server_common
15
+ from sky.server import rest
16
+ from sky.server import versions
14
17
  from sky.server.requests import payloads
18
+ from sky.server.requests import request_names
15
19
  from sky.skylet import constants
16
20
  from sky.usage import usage_lib
21
+ from sky.utils import admin_policy_utils
17
22
  from sky.utils import common_utils
23
+ from sky.utils import context
18
24
  from sky.utils import dag_utils
19
25
 
20
26
  if typing.TYPE_CHECKING:
21
27
  import io
22
-
23
- import requests
28
+ import webbrowser
24
29
 
25
30
  import sky
31
+ from sky import backends
32
+ from sky.serve import serve_utils
26
33
  else:
27
- requests = adaptors_common.LazyImport('requests')
34
+ # only used in dashboard()
35
+ webbrowser = adaptors_common.LazyImport('webbrowser')
28
36
 
29
37
  logger = sky_logging.init_logger(__name__)
30
38
 
31
39
 
40
+ @context.contextual
32
41
  @usage_lib.entrypoint
33
42
  @server_common.check_server_healthy_or_start
34
43
  def launch(
35
44
  task: Union['sky.Task', 'sky.Dag'],
36
45
  name: Optional[str] = None,
46
+ pool: Optional[str] = None,
47
+ num_jobs: Optional[int] = None,
37
48
  # Internal only:
38
49
  # pylint: disable=invalid-name
39
50
  _need_confirmation: bool = False,
40
- ) -> server_common.RequestId:
51
+ ) -> server_common.RequestId[Tuple[Optional[int],
52
+ Optional['backends.ResourceHandle']]]:
41
53
  """Launches a managed job.
42
54
 
43
55
  Please refer to sky.cli.job_launch for documentation.
@@ -62,50 +74,166 @@ def launch(
62
74
  chain dag.
63
75
  sky.exceptions.NotSupportedError: the feature is not supported.
64
76
  """
77
+ remote_api_version = versions.get_remote_api_version()
78
+ if (pool is not None and
79
+ (remote_api_version is None or remote_api_version < 12)):
80
+ raise click.UsageError('Pools are not supported in your API server. '
81
+ 'Please upgrade to a newer API server to use '
82
+ 'pools.')
83
+ if pool is None and num_jobs is not None:
84
+ raise click.UsageError('Cannot specify num_jobs without pool.')
65
85
 
66
86
  dag = dag_utils.convert_entrypoint_to_dag(task)
67
- sdk.validate(dag)
68
- if _need_confirmation:
69
- request_id = sdk.optimize(dag)
70
- sdk.stream_and_get(request_id)
71
- prompt = f'Launching a managed job {dag.name!r}. Proceed?'
72
- if prompt is not None:
73
- click.confirm(prompt, default=True, abort=True, show_default=True)
74
-
75
- dag = client_common.upload_mounts_to_api_server(dag)
76
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
77
- body = payloads.JobsLaunchBody(
78
- task=dag_str,
79
- name=name,
87
+
88
+ with admin_policy_utils.apply_and_use_config_in_current_request(
89
+ dag,
90
+ request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH,
91
+ at_client_side=True) as dag:
92
+ sdk.validate(dag)
93
+ if _need_confirmation:
94
+ job_identity = 'a managed job'
95
+ if pool is None:
96
+ optimize_request_id = sdk.optimize(dag)
97
+ sdk.stream_and_get(optimize_request_id)
98
+ else:
99
+ pool_status_request_id = pool_status(pool)
100
+ pool_statuses = sdk.get(pool_status_request_id)
101
+ if not pool_statuses:
102
+ raise click.UsageError(f'Pool {pool!r} not found.')
103
+ resources = pool_statuses[0]['requested_resources_str']
104
+ click.secho(f'Use resources from pool {pool!r}: {resources}.',
105
+ fg='green')
106
+ if num_jobs is not None:
107
+ job_identity = f'{num_jobs} managed jobs'
108
+ prompt = f'Launching {job_identity} {dag.name!r}. Proceed?'
109
+ if prompt is not None:
110
+ click.confirm(prompt,
111
+ default=True,
112
+ abort=True,
113
+ show_default=True)
114
+
115
+ dag = client_common.upload_mounts_to_api_server(dag)
116
+ dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
117
+ body = payloads.JobsLaunchBody(
118
+ task=dag_str,
119
+ name=name,
120
+ pool=pool,
121
+ num_jobs=num_jobs,
122
+ )
123
+ response = server_common.make_authenticated_request(
124
+ 'POST',
125
+ '/jobs/launch',
126
+ json=json.loads(body.model_dump_json()),
127
+ timeout=(5, None))
128
+ return server_common.get_request_id(response)
129
+
130
+
131
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(18)
def queue_v2(
    refresh: bool,
    skip_finished: bool = False,
    all_users: bool = False,
    job_ids: Optional[List[int]] = None,
    limit: Optional[int] = None,
    fields: Optional[List[str]] = None,
) -> server_common.RequestId[Tuple[List[responses.ManagedJobRecord], int, Dict[
        str, int], int]]:
    """Requests the statuses of managed jobs via the v2 queue endpoint.

    Please refer to sky.cli.job_queue for documentation.

    Args:
        refresh: Whether to restart the jobs controller if it is stopped.
        skip_finished: Whether to skip finished jobs.
        all_users: Whether to show all users' jobs.
        job_ids: IDs of the managed jobs to show.
        limit: Number of jobs to show.
        fields: Fields to get for the managed jobs.

    Returns:
        The request ID of the queue request.

    Request Returns:
        job_records (List[responses.ManagedJobRecord]): A list of dicts,
          with each dict containing the information of a job.

          .. code-block:: python

            [
              {
                'job_id': (int) job id,
                'job_name': (str) job name,
                'resources': (str) resources of the job,
                'submitted_at': (float) timestamp of submission,
                'end_at': (float) timestamp of end,
                'job_duration': (float) duration in seconds,
                'recovery_count': (int) Number of retries,
                'status': (sky.jobs.ManagedJobStatus) of the job,
                'cluster_resources': (str) resources of the cluster,
                'region': (str) region of the cluster,
                'task_id': (int), set to 0 (except in pipelines, which
                  may have multiple tasks),
                'task_name': (str), same as job_name (except in
                  pipelines, which may have multiple tasks),
              }
            ]
        total (int): Total number of jobs after filter,
        status_counts (Dict[str, int]): Status counts after filter,
        total_no_filter (int): Total number of jobs before filter,

    Request Raises:
        sky.exceptions.ClusterNotUpError: the jobs controller is not up or
          does not exist.
        RuntimeError: if failed to get the managed jobs with ssh.
    """
    request_body = payloads.JobsQueueV2Body(
        refresh=refresh,
        skip_finished=skip_finished,
        all_users=all_users,
        job_ids=job_ids,
        limit=limit,
        fields=fields,
    )
    # Round-trip through the model's JSON dump so the payload handed to the
    # HTTP layer is made of plain JSON-compatible values.
    serialized_body = json.loads(request_body.model_dump_json())
    # NOTE(review): timeout presumably follows the `requests` convention of
    # (connect timeout, read timeout), i.e. no read timeout -- confirm
    # against server_common.make_authenticated_request.
    response = server_common.make_authenticated_request(
        'POST',
        '/jobs/queue/v2',
        json=serialized_body,
        timeout=(5, None))
    return server_common.get_request_id(response=response)
88
204
 
89
205
 
206
+ # Deprecated. Please use queue_v2 instead for better performance.
207
+ # In https://github.com/skypilot-org/skypilot/pull/7695, the `queue` function
208
+ # is updated to return new typed data for performance improvement if the API
209
+ # server supports it, which breaks the backward compatibility.
210
+ # In https://github.com/skypilot-org/skypilot/pull/8015, we revert the change
211
+ # and add a new function `queue_v2` to return the new typed data.
90
212
  @usage_lib.entrypoint
91
213
  @server_common.check_server_healthy_or_start
92
- def queue(refresh: bool,
93
- skip_finished: bool = False,
94
- all_users: bool = False) -> server_common.RequestId:
214
+ def queue(
215
+ refresh: bool,
216
+ skip_finished: bool = False,
217
+ all_users: bool = False,
218
+ job_ids: Optional[List[int]] = None
219
+ ) -> server_common.RequestId[List[responses.ManagedJobRecord]]:
95
220
  """Gets statuses of managed jobs.
96
221
 
222
+ Deprecated. Please use queue_v2 instead for better performance.
223
+
97
224
  Please refer to sky.cli.job_queue for documentation.
98
225
 
99
226
  Args:
100
227
  refresh: Whether to restart the jobs controller if it is stopped.
101
228
  skip_finished: Whether to skip finished jobs.
102
229
  all_users: Whether to show all users' jobs.
230
+ job_ids: IDs of the managed jobs to show.
103
231
 
104
232
  Returns:
105
233
  The request ID of the queue request.
106
234
 
107
235
  Request Returns:
108
- job_records (List[Dict[str, Any]]): A list of dicts, with each dict
236
+ job_records (List[responses.ManagedJobRecord]): A list of dicts, with each dict
109
237
  containing the information of a job.
110
238
 
111
239
  .. code-block:: python
@@ -117,11 +245,13 @@ def queue(refresh: bool,
117
245
  'resources': (str) resources of the job,
118
246
  'submitted_at': (float) timestamp of submission,
119
247
  'end_at': (float) timestamp of end,
120
- 'duration': (float) duration in seconds,
248
+ 'job_duration': (float) duration in seconds,
121
249
  'recovery_count': (int) Number of retries,
122
250
  'status': (sky.jobs.ManagedJobStatus) of the job,
123
251
  'cluster_resources': (str) resources of the cluster,
124
252
  'region': (str) region of the cluster,
253
+ 'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
254
+ 'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
125
255
  }
126
256
  ]
127
257
 
@@ -134,13 +264,13 @@ def queue(refresh: bool,
134
264
  refresh=refresh,
135
265
  skip_finished=skip_finished,
136
266
  all_users=all_users,
267
+ job_ids=job_ids,
137
268
  )
138
- response = requests.post(
139
- f'{server_common.get_server_url()}/jobs/queue',
269
+ response = server_common.make_authenticated_request(
270
+ 'POST',
271
+ '/jobs/queue',
140
272
  json=json.loads(body.model_dump_json()),
141
- timeout=(5, None),
142
- cookies=server_common.get_api_cookie_jar(),
143
- )
273
+ timeout=(5, None))
144
274
  return server_common.get_request_id(response=response)
145
275
 
146
276
 
@@ -148,10 +278,11 @@ def queue(refresh: bool,
148
278
  @server_common.check_server_healthy_or_start
149
279
  def cancel(
150
280
  name: Optional[str] = None,
151
- job_ids: Optional[List[int]] = None,
281
+ job_ids: Optional[Sequence[int]] = None,
152
282
  all: bool = False, # pylint: disable=redefined-builtin
153
283
  all_users: bool = False,
154
- ) -> server_common.RequestId:
284
+ pool: Optional[str] = None,
285
+ ) -> server_common.RequestId[None]:
155
286
  """Cancels managed jobs.
156
287
 
157
288
  Please refer to sky.cli.job_cancel for documentation.
@@ -161,6 +292,7 @@ def cancel(
161
292
  job_ids: IDs of the managed jobs to cancel.
162
293
  all: Whether to cancel all managed jobs.
163
294
  all_users: Whether to cancel all managed jobs from all users.
295
+ pool: Pool name to cancel.
164
296
 
165
297
  Returns:
166
298
  The request ID of the cancel request.
@@ -169,29 +301,37 @@ def cancel(
169
301
  sky.exceptions.ClusterNotUpError: the jobs controller is not up.
170
302
  RuntimeError: failed to cancel the job.
171
303
  """
304
+ remote_api_version = versions.get_remote_api_version()
305
+ if (pool is not None and
306
+ (remote_api_version is None or remote_api_version < 12)):
307
+ raise click.UsageError('Pools are not supported in your API server. '
308
+ 'Please upgrade to a newer API server to use '
309
+ 'pools.')
172
310
  body = payloads.JobsCancelBody(
173
311
  name=name,
174
312
  job_ids=job_ids,
175
313
  all=all,
176
314
  all_users=all_users,
315
+ pool=pool,
177
316
  )
178
- response = requests.post(
179
- f'{server_common.get_server_url()}/jobs/cancel',
317
+ response = server_common.make_authenticated_request(
318
+ 'POST',
319
+ '/jobs/cancel',
180
320
  json=json.loads(body.model_dump_json()),
181
- timeout=(5, None),
182
- cookies=server_common.get_api_cookie_jar(),
183
- )
321
+ timeout=(5, None))
184
322
  return server_common.get_request_id(response=response)
185
323
 
186
324
 
187
325
  @usage_lib.entrypoint
188
326
  @server_common.check_server_healthy_or_start
327
+ @rest.retry_transient_errors()
189
328
  def tail_logs(name: Optional[str] = None,
190
329
  job_id: Optional[int] = None,
191
330
  follow: bool = True,
192
331
  controller: bool = False,
193
332
  refresh: bool = False,
194
- output_stream: Optional['io.TextIOBase'] = None) -> int:
333
+ tail: Optional[int] = None,
334
+ output_stream: Optional['io.TextIOBase'] = None) -> Optional[int]:
195
335
  """Tails logs of managed jobs.
196
336
 
197
337
  You can provide either a job name or a job ID to tail logs. If both are not
@@ -203,6 +343,7 @@ def tail_logs(name: Optional[str] = None,
203
343
  follow: Whether to follow the logs.
204
344
  controller: Whether to tail logs from the jobs controller.
205
345
  refresh: Whether to restart the jobs controller if it is stopped.
346
+ tail: Number of lines to tail from the end of the log file.
206
347
  output_stream: The stream to write the logs to. If None, print to the
207
348
  console.
208
349
 
@@ -210,6 +351,8 @@ def tail_logs(name: Optional[str] = None,
210
351
  Exit code based on success or failure of the job. 0 if success,
211
352
  100 if the job failed. See exceptions.JobExitCode for possible exit
212
353
  codes.
354
+ Will return None if follow is False
355
+ (see note in sky/client/sdk.py::stream_response)
213
356
 
214
357
  Request Raises:
215
358
  ValueError: invalid arguments.
@@ -221,16 +364,23 @@ def tail_logs(name: Optional[str] = None,
221
364
  follow=follow,
222
365
  controller=controller,
223
366
  refresh=refresh,
367
+ tail=tail,
224
368
  )
225
- response = requests.post(
226
- f'{server_common.get_server_url()}/jobs/logs',
369
+ response = server_common.make_authenticated_request(
370
+ 'POST',
371
+ '/jobs/logs',
227
372
  json=json.loads(body.model_dump_json()),
228
373
  stream=True,
229
- timeout=(5, None),
230
- cookies=server_common.get_api_cookie_jar(),
231
- )
232
- request_id = server_common.get_request_id(response)
233
- return sdk.stream_response(request_id, response, output_stream)
374
+ timeout=(5, None))
375
+ request_id: server_common.RequestId[int] = server_common.get_request_id(
376
+ response)
377
+ # Log request is idempotent when tail is 0, thus can resume previous
378
+ # streaming point on retry.
379
+ return sdk.stream_response(request_id=request_id,
380
+ response=response,
381
+ output_stream=output_stream,
382
+ resumable=(tail == 0),
383
+ get_result=follow)
234
384
 
235
385
 
236
386
  @usage_lib.entrypoint
@@ -267,18 +417,18 @@ def download_logs(
267
417
  controller=controller,
268
418
  local_dir=local_dir,
269
419
  )
270
- response = requests.post(
271
- f'{server_common.get_server_url()}/jobs/download_logs',
420
+ response = server_common.make_authenticated_request(
421
+ 'POST',
422
+ '/jobs/download_logs',
272
423
  json=json.loads(body.model_dump_json()),
273
- timeout=(5, None),
274
- cookies=server_common.get_api_cookie_jar(),
275
- )
276
- job_id_remote_path_dict = sdk.stream_and_get(
277
- server_common.get_request_id(response))
424
+ timeout=(5, None))
425
+ request_id: server_common.RequestId[Dict[
426
+ str, str]] = server_common.get_request_id(response)
427
+ job_id_remote_path_dict = sdk.stream_and_get(request_id)
278
428
  remote2local_path_dict = client_common.download_logs_from_api_server(
279
429
  job_id_remote_path_dict.values())
280
430
  return {
281
- job_id: remote2local_path_dict[remote_path]
431
+ int(job_id): remote2local_path_dict[remote_path]
282
432
  for job_id, remote_path in job_id_remote_path_dict.items()
283
433
  }
284
434
 
@@ -314,3 +464,95 @@ def dashboard() -> None:
314
464
  url = f'{api_server_url}/jobs/dashboard?{params}'
315
465
  logger.info(f'Opening dashboard in browser: {url}')
316
466
  webbrowser.open(url)
467
+
468
+
469
@context.contextual
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(12)
def pool_apply(
    task: Optional[Union['sky.Task', 'sky.Dag']],
    pool_name: str,
    mode: 'serve_utils.UpdateMode',
    workers: Optional[int] = None,
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False
) -> server_common.RequestId[None]:
    """Applies a config to a pool.

    Args:
        task: The task or dag describing the pool config; forwarded to
          ``impl.apply`` unchanged.
        pool_name: Name of the pool to apply the config to.
        mode: Update mode forwarded to ``impl.apply``.
        workers: Optional number of workers. Passing a value requires a
          remote API server of version 19 or newer.
        _need_confirmation: Internal only; forwarded to ``impl.apply``.

    Returns:
        The request ID returned by ``impl.apply``.

    Raises:
        click.UsageError: if ``workers`` is given but the remote API server
          version is unknown or older than 19.
    """
    server_version = versions.get_remote_api_version()
    # An unknown (None) remote version is treated the same as one that is
    # too old to update the worker count.
    supports_worker_update = (server_version is not None and
                              server_version >= 19)
    if workers is not None and not supports_worker_update:
        raise click.UsageError('Updating the number of workers in a pool is '
                               'not supported in your API server. Please '
                               'upgrade to a newer API server to use this '
                               'feature.')
    request_id = impl.apply(task,
                            workers,
                            pool_name,
                            mode,
                            pool=True,
                            _need_confirmation=_need_confirmation)
    return request_id
496
+
497
+
498
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(12)
def pool_down(
    pool_names: Optional[Union[str, List[str]]],
    all: bool = False,  # pylint: disable=redefined-builtin
    purge: bool = False,
) -> server_common.RequestId[None]:
    """Deletes a pool.

    Thin wrapper that forwards to ``impl.down`` with ``pool=True``.

    Args:
        pool_names: A pool name, a list of pool names, or None.
        all: Forwarded to ``impl.down``; presumably selects every pool --
          confirm against ``impl.down``.
        purge: Forwarded to ``impl.down``; its exact semantics are defined
          there.

    Returns:
        The request ID returned by ``impl.down``.
    """
    request_id = impl.down(pool_names, all, purge, pool=True)
    return request_id
508
+
509
+
510
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(12)
def pool_status(
    pool_names: Optional[Union[str, List[str]]],
) -> server_common.RequestId[List[Dict[str, Any]]]:
    """Queries a pool.

    Thin wrapper that forwards to ``impl.status`` with ``pool=True``.

    Args:
        pool_names: A pool name, a list of pool names, or None.

    Returns:
        The request ID returned by ``impl.status``; per the annotation, the
        request resolves to a list of dicts.
    """
    request_id = impl.status(pool_names, pool=True)
    return request_id
518
+
519
+
520
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@rest.retry_transient_errors()
@versions.minimal_api_version(16)
def pool_tail_logs(pool_name: str,
                   target: Union[str, 'serve_utils.ServiceComponent'],
                   worker_id: Optional[int] = None,
                   follow: bool = True,
                   output_stream: Optional['io.TextIOBase'] = None,
                   tail: Optional[int] = None) -> None:
    """Tails logs of a pool.

    Thin wrapper that forwards to ``impl.tail_logs`` with ``pool=True``.

    Args:
        pool_name: Name of the pool whose logs are tailed.
        target: The component to tail, as a string or a
          ``serve_utils.ServiceComponent``.
        worker_id: Optional worker ID forwarded to ``impl.tail_logs``.
        follow: Forwarded to ``impl.tail_logs``; presumably whether to keep
          following the logs -- confirm against ``impl.tail_logs``.
        output_stream: Optional stream forwarded to ``impl.tail_logs``.
        tail: Optional line count forwarded to ``impl.tail_logs``.
    """
    # Arguments are forwarded positionally, in the order impl.tail_logs
    # receives them here; only `pool` is passed by keyword.
    outcome = impl.tail_logs(pool_name,
                             target,
                             worker_id,
                             follow,
                             output_stream,
                             tail,
                             pool=True)
    return outcome
538
+
539
+
540
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@rest.retry_transient_errors()
@versions.minimal_api_version(16)
def pool_sync_down_logs(pool_name: str,
                        local_dir: str,
                        *,
                        targets: Optional[Union[
                            str, 'serve_utils.ServiceComponent', Sequence[Union[
                                str, 'serve_utils.ServiceComponent']]]] = None,
                        worker_ids: Optional[List[int]] = None,
                        tail: Optional[int] = None) -> None:
    """Syncs down logs of a pool.

    Thin wrapper that forwards to ``impl.sync_down_logs`` with
    ``pool=True``.

    Args:
        pool_name: Name of the pool whose logs are synced down.
        local_dir: Local directory forwarded to ``impl.sync_down_logs``.
        targets: A single component (string or
          ``serve_utils.ServiceComponent``), a sequence of them, or None.
        worker_ids: Optional list of worker IDs; passed to
          ``impl.sync_down_logs`` as its ``replica_ids`` argument.
        tail: Optional line count forwarded to ``impl.sync_down_logs``.
    """
    outcome = impl.sync_down_logs(
        pool_name,
        local_dir,
        targets=targets,
        # The shared implementation names this parameter `replica_ids`.
        replica_ids=worker_ids,
        tail=tail,
        pool=True,
    )
    return outcome