skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/__init__.py CHANGED
@@ -5,6 +5,11 @@ from sky.jobs.client.sdk import cancel
5
5
  from sky.jobs.client.sdk import dashboard
6
6
  from sky.jobs.client.sdk import download_logs
7
7
  from sky.jobs.client.sdk import launch
8
+ from sky.jobs.client.sdk import pool_apply
9
+ from sky.jobs.client.sdk import pool_down
10
+ from sky.jobs.client.sdk import pool_status
11
+ from sky.jobs.client.sdk import pool_sync_down_logs
12
+ from sky.jobs.client.sdk import pool_tail_logs
8
13
  from sky.jobs.client.sdk import queue
9
14
  from sky.jobs.client.sdk import tail_logs
10
15
  from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
sky/jobs/client/sdk.py CHANGED
@@ -1,8 +1,7 @@
1
1
  """SDK functions for managed jobs."""
2
2
  import json
3
3
  import typing
4
- from typing import Dict, List, Optional, Union
5
- import webbrowser
4
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
6
5
 
7
6
  import click
8
7
 
@@ -10,34 +9,47 @@ from sky import sky_logging
10
9
  from sky.adaptors import common as adaptors_common
11
10
  from sky.client import common as client_common
12
11
  from sky.client import sdk
12
+ from sky.schemas.api import responses
13
+ from sky.serve.client import impl
13
14
  from sky.server import common as server_common
15
+ from sky.server import rest
16
+ from sky.server import versions
14
17
  from sky.server.requests import payloads
18
+ from sky.server.requests import request_names
15
19
  from sky.skylet import constants
16
20
  from sky.usage import usage_lib
21
+ from sky.utils import admin_policy_utils
17
22
  from sky.utils import common_utils
23
+ from sky.utils import context
18
24
  from sky.utils import dag_utils
19
25
 
20
26
  if typing.TYPE_CHECKING:
21
27
  import io
22
-
23
- import requests
28
+ import webbrowser
24
29
 
25
30
  import sky
31
+ from sky import backends
32
+ from sky.serve import serve_utils
26
33
  else:
27
- requests = adaptors_common.LazyImport('requests')
34
+ # only used in dashboard()
35
+ webbrowser = adaptors_common.LazyImport('webbrowser')
28
36
 
29
37
  logger = sky_logging.init_logger(__name__)
30
38
 
31
39
 
40
+ @context.contextual
32
41
  @usage_lib.entrypoint
33
42
  @server_common.check_server_healthy_or_start
34
43
  def launch(
35
44
  task: Union['sky.Task', 'sky.Dag'],
36
45
  name: Optional[str] = None,
46
+ pool: Optional[str] = None,
47
+ num_jobs: Optional[int] = None,
37
48
  # Internal only:
38
49
  # pylint: disable=invalid-name
39
50
  _need_confirmation: bool = False,
40
- ) -> server_common.RequestId:
51
+ ) -> server_common.RequestId[Tuple[Optional[int],
52
+ Optional['backends.ResourceHandle']]]:
41
53
  """Launches a managed job.
42
54
 
43
55
  Please refer to sky.cli.job_launch for documentation.
@@ -62,36 +74,70 @@ def launch(
62
74
  chain dag.
63
75
  sky.exceptions.NotSupportedError: the feature is not supported.
64
76
  """
77
+ remote_api_version = versions.get_remote_api_version()
78
+ if (pool is not None and
79
+ (remote_api_version is None or remote_api_version < 12)):
80
+ raise click.UsageError('Pools are not supported in your API server. '
81
+ 'Please upgrade to a newer API server to use '
82
+ 'pools.')
83
+ if pool is None and num_jobs is not None:
84
+ raise click.UsageError('Cannot specify num_jobs without pool.')
65
85
 
66
86
  dag = dag_utils.convert_entrypoint_to_dag(task)
67
- sdk.validate(dag)
68
- if _need_confirmation:
69
- request_id = sdk.optimize(dag)
70
- sdk.stream_and_get(request_id)
71
- prompt = f'Launching a managed job {dag.name!r}. Proceed?'
72
- if prompt is not None:
73
- click.confirm(prompt, default=True, abort=True, show_default=True)
74
-
75
- dag = client_common.upload_mounts_to_api_server(dag)
76
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
77
- body = payloads.JobsLaunchBody(
78
- task=dag_str,
79
- name=name,
80
- )
81
- response = requests.post(
82
- f'{server_common.get_server_url()}/jobs/launch',
83
- json=json.loads(body.model_dump_json()),
84
- timeout=(5, None),
85
- cookies=server_common.get_api_cookie_jar(),
86
- )
87
- return server_common.get_request_id(response)
87
+ with admin_policy_utils.apply_and_use_config_in_current_request(
88
+ dag,
89
+ request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH,
90
+ at_client_side=True) as dag:
91
+ sdk.validate(dag)
92
+ if _need_confirmation:
93
+ job_identity = 'a managed job'
94
+ if pool is None:
95
+ optimize_request_id = sdk.optimize(dag)
96
+ sdk.stream_and_get(optimize_request_id)
97
+ else:
98
+ pool_status_request_id = pool_status(pool)
99
+ pool_statuses = sdk.get(pool_status_request_id)
100
+ if not pool_statuses:
101
+ raise click.UsageError(f'Pool {pool!r} not found.')
102
+ resources = pool_statuses[0]['requested_resources_str']
103
+ click.secho(f'Use resources from pool {pool!r}: {resources}.',
104
+ fg='green')
105
+ if num_jobs is not None:
106
+ job_identity = f'{num_jobs} managed jobs'
107
+ prompt = f'Launching {job_identity} {dag.name!r}. Proceed?'
108
+ if prompt is not None:
109
+ click.confirm(prompt,
110
+ default=True,
111
+ abort=True,
112
+ show_default=True)
113
+
114
+ dag = client_common.upload_mounts_to_api_server(dag)
115
+ dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
116
+ body = payloads.JobsLaunchBody(
117
+ task=dag_str,
118
+ name=name,
119
+ pool=pool,
120
+ num_jobs=num_jobs,
121
+ )
122
+ response = server_common.make_authenticated_request(
123
+ 'POST',
124
+ '/jobs/launch',
125
+ json=json.loads(body.model_dump_json()),
126
+ timeout=(5, None))
127
+ return server_common.get_request_id(response)
88
128
 
89
129
 
90
130
  @usage_lib.entrypoint
91
131
  @server_common.check_server_healthy_or_start
92
- def queue(refresh: bool,
93
- skip_finished: bool = False,
94
- all_users: bool = False) -> server_common.RequestId:
132
+ def queue(
133
+ refresh: bool,
134
+ skip_finished: bool = False,
135
+ all_users: bool = False,
136
+ job_ids: Optional[List[int]] = None,
137
+ limit: Optional[int] = None,
138
+ fields: Optional[List[str]] = None,
139
+ ) -> server_common.RequestId[Union[List[responses.ManagedJobRecord], Tuple[
140
+ List[responses.ManagedJobRecord], int, Dict[str, int], int]]]:
95
141
  """Gets statuses of managed jobs.
96
142
 
97
143
  Please refer to sky.cli.job_queue for documentation.
@@ -100,12 +146,15 @@ def queue(refresh: bool,
100
146
  refresh: Whether to restart the jobs controller if it is stopped.
101
147
  skip_finished: Whether to skip finished jobs.
102
148
  all_users: Whether to show all users' jobs.
149
+ job_ids: IDs of the managed jobs to show.
150
+ limit: Number of jobs to show.
151
+ fields: Fields to get for the managed jobs.
103
152
 
104
153
  Returns:
105
154
  The request ID of the queue request.
106
155
 
107
156
  Request Returns:
108
- job_records (List[Dict[str, Any]]): A list of dicts, with each dict
157
+ job_records (List[responses.ManagedJobRecord]): A list of dicts, with each dict
109
158
  containing the information of a job.
110
159
 
111
160
  .. code-block:: python
@@ -117,11 +166,13 @@ def queue(refresh: bool,
117
166
  'resources': (str) resources of the job,
118
167
  'submitted_at': (float) timestamp of submission,
119
168
  'end_at': (float) timestamp of end,
120
- 'duration': (float) duration in seconds,
169
+ 'job_duration': (float) duration in seconds,
121
170
  'recovery_count': (int) Number of retries,
122
171
  'status': (sky.jobs.ManagedJobStatus) of the job,
123
172
  'cluster_resources': (str) resources of the cluster,
124
173
  'region': (str) region of the cluster,
174
+ 'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
175
+ 'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
125
176
  }
126
177
  ]
127
178
 
@@ -130,17 +181,31 @@ def queue(refresh: bool,
130
181
  does not exist.
131
182
  RuntimeError: if failed to get the managed jobs with ssh.
132
183
  """
133
- body = payloads.JobsQueueBody(
134
- refresh=refresh,
135
- skip_finished=skip_finished,
136
- all_users=all_users,
137
- )
138
- response = requests.post(
139
- f'{server_common.get_server_url()}/jobs/queue',
184
+ remote_api_version = versions.get_remote_api_version()
185
+ if remote_api_version and remote_api_version >= 18:
186
+ body = payloads.JobsQueueV2Body(
187
+ refresh=refresh,
188
+ skip_finished=skip_finished,
189
+ all_users=all_users,
190
+ job_ids=job_ids,
191
+ limit=limit,
192
+ fields=fields,
193
+ )
194
+ path = '/jobs/queue/v2'
195
+ else:
196
+ body = payloads.JobsQueueBody(
197
+ refresh=refresh,
198
+ skip_finished=skip_finished,
199
+ all_users=all_users,
200
+ job_ids=job_ids,
201
+ )
202
+ path = '/jobs/queue'
203
+
204
+ response = server_common.make_authenticated_request(
205
+ 'POST',
206
+ path,
140
207
  json=json.loads(body.model_dump_json()),
141
- timeout=(5, None),
142
- cookies=server_common.get_api_cookie_jar(),
143
- )
208
+ timeout=(5, None))
144
209
  return server_common.get_request_id(response=response)
145
210
 
146
211
 
@@ -148,10 +213,11 @@ def queue(refresh: bool,
148
213
  @server_common.check_server_healthy_or_start
149
214
  def cancel(
150
215
  name: Optional[str] = None,
151
- job_ids: Optional[List[int]] = None,
216
+ job_ids: Optional[Sequence[int]] = None,
152
217
  all: bool = False, # pylint: disable=redefined-builtin
153
218
  all_users: bool = False,
154
- ) -> server_common.RequestId:
219
+ pool: Optional[str] = None,
220
+ ) -> server_common.RequestId[None]:
155
221
  """Cancels managed jobs.
156
222
 
157
223
  Please refer to sky.cli.job_cancel for documentation.
@@ -161,6 +227,7 @@ def cancel(
161
227
  job_ids: IDs of the managed jobs to cancel.
162
228
  all: Whether to cancel all managed jobs.
163
229
  all_users: Whether to cancel all managed jobs from all users.
230
+ pool: Pool name to cancel.
164
231
 
165
232
  Returns:
166
233
  The request ID of the cancel request.
@@ -169,29 +236,37 @@ def cancel(
169
236
  sky.exceptions.ClusterNotUpError: the jobs controller is not up.
170
237
  RuntimeError: failed to cancel the job.
171
238
  """
239
+ remote_api_version = versions.get_remote_api_version()
240
+ if (pool is not None and
241
+ (remote_api_version is None or remote_api_version < 12)):
242
+ raise click.UsageError('Pools are not supported in your API server. '
243
+ 'Please upgrade to a newer API server to use '
244
+ 'pools.')
172
245
  body = payloads.JobsCancelBody(
173
246
  name=name,
174
247
  job_ids=job_ids,
175
248
  all=all,
176
249
  all_users=all_users,
250
+ pool=pool,
177
251
  )
178
- response = requests.post(
179
- f'{server_common.get_server_url()}/jobs/cancel',
252
+ response = server_common.make_authenticated_request(
253
+ 'POST',
254
+ '/jobs/cancel',
180
255
  json=json.loads(body.model_dump_json()),
181
- timeout=(5, None),
182
- cookies=server_common.get_api_cookie_jar(),
183
- )
256
+ timeout=(5, None))
184
257
  return server_common.get_request_id(response=response)
185
258
 
186
259
 
187
260
  @usage_lib.entrypoint
188
261
  @server_common.check_server_healthy_or_start
262
+ @rest.retry_transient_errors()
189
263
  def tail_logs(name: Optional[str] = None,
190
264
  job_id: Optional[int] = None,
191
265
  follow: bool = True,
192
266
  controller: bool = False,
193
267
  refresh: bool = False,
194
- output_stream: Optional['io.TextIOBase'] = None) -> int:
268
+ tail: Optional[int] = None,
269
+ output_stream: Optional['io.TextIOBase'] = None) -> Optional[int]:
195
270
  """Tails logs of managed jobs.
196
271
 
197
272
  You can provide either a job name or a job ID to tail logs. If both are not
@@ -203,6 +278,7 @@ def tail_logs(name: Optional[str] = None,
203
278
  follow: Whether to follow the logs.
204
279
  controller: Whether to tail logs from the jobs controller.
205
280
  refresh: Whether to restart the jobs controller if it is stopped.
281
+ tail: Number of lines to tail from the end of the log file.
206
282
  output_stream: The stream to write the logs to. If None, print to the
207
283
  console.
208
284
 
@@ -210,6 +286,8 @@ def tail_logs(name: Optional[str] = None,
210
286
  Exit code based on success or failure of the job. 0 if success,
211
287
  100 if the job failed. See exceptions.JobExitCode for possible exit
212
288
  codes.
289
+ Will return None if follow is False
290
+ (see note in sky/client/sdk.py::stream_response)
213
291
 
214
292
  Request Raises:
215
293
  ValueError: invalid arguments.
@@ -221,16 +299,23 @@ def tail_logs(name: Optional[str] = None,
221
299
  follow=follow,
222
300
  controller=controller,
223
301
  refresh=refresh,
302
+ tail=tail,
224
303
  )
225
- response = requests.post(
226
- f'{server_common.get_server_url()}/jobs/logs',
304
+ response = server_common.make_authenticated_request(
305
+ 'POST',
306
+ '/jobs/logs',
227
307
  json=json.loads(body.model_dump_json()),
228
308
  stream=True,
229
- timeout=(5, None),
230
- cookies=server_common.get_api_cookie_jar(),
231
- )
232
- request_id = server_common.get_request_id(response)
233
- return sdk.stream_response(request_id, response, output_stream)
309
+ timeout=(5, None))
310
+ request_id: server_common.RequestId[int] = server_common.get_request_id(
311
+ response)
312
+ # Log request is idempotent when tail is 0, thus can resume previous
313
+ # streaming point on retry.
314
+ return sdk.stream_response(request_id=request_id,
315
+ response=response,
316
+ output_stream=output_stream,
317
+ resumable=(tail == 0),
318
+ get_result=follow)
234
319
 
235
320
 
236
321
  @usage_lib.entrypoint
@@ -267,18 +352,18 @@ def download_logs(
267
352
  controller=controller,
268
353
  local_dir=local_dir,
269
354
  )
270
- response = requests.post(
271
- f'{server_common.get_server_url()}/jobs/download_logs',
355
+ response = server_common.make_authenticated_request(
356
+ 'POST',
357
+ '/jobs/download_logs',
272
358
  json=json.loads(body.model_dump_json()),
273
- timeout=(5, None),
274
- cookies=server_common.get_api_cookie_jar(),
275
- )
276
- job_id_remote_path_dict = sdk.stream_and_get(
277
- server_common.get_request_id(response))
359
+ timeout=(5, None))
360
+ request_id: server_common.RequestId[Dict[
361
+ str, str]] = server_common.get_request_id(response)
362
+ job_id_remote_path_dict = sdk.stream_and_get(request_id)
278
363
  remote2local_path_dict = client_common.download_logs_from_api_server(
279
364
  job_id_remote_path_dict.values())
280
365
  return {
281
- job_id: remote2local_path_dict[remote_path]
366
+ int(job_id): remote2local_path_dict[remote_path]
282
367
  for job_id, remote_path in job_id_remote_path_dict.items()
283
368
  }
284
369
 
@@ -314,3 +399,95 @@ def dashboard() -> None:
314
399
  url = f'{api_server_url}/jobs/dashboard?{params}'
315
400
  logger.info(f'Opening dashboard in browser: {url}')
316
401
  webbrowser.open(url)
402
+
403
+
404
@context.contextual
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(12)
def pool_apply(
    task: Optional[Union['sky.Task', 'sky.Dag']],
    pool_name: str,
    mode: 'serve_utils.UpdateMode',
    workers: Optional[int] = None,
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False
) -> server_common.RequestId[None]:
    """Applies a config to a pool.

    Args:
        task: Task or DAG forwarded to ``impl.apply``; may be None.
        pool_name: Name of the pool to apply the config to.
        mode: Update mode forwarded to ``impl.apply``.
        workers: Worker count forwarded to ``impl.apply``. Passing a value
            requires a remote API version of at least 19.
        _need_confirmation: Internal flag forwarded to ``impl.apply``.

    Returns:
        The request ID produced by ``impl.apply``.

    Raises:
        click.UsageError: ``workers`` was given but the remote API version is
            unknown or older than 19.
    """
    server_version = versions.get_remote_api_version()
    if workers is not None:
        # Only the worker-count update is gated here; the decorator above
        # already enforces the minimal version for the call as a whole.
        if server_version is None or server_version < 19:
            unsupported_msg = ('Updating the number of workers in a pool is '
                               'not supported in your API server. Please '
                               'upgrade to a newer API server to use this '
                               'feature.')
            raise click.UsageError(unsupported_msg)
    request_id = impl.apply(task,
                            workers,
                            pool_name,
                            mode,
                            pool=True,
                            _need_confirmation=_need_confirmation)
    return request_id
431
+
432
+
433
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(12)
def pool_down(
    pool_names: Optional[Union[str, List[str]]],
    all: bool = False,  # pylint: disable=redefined-builtin
    purge: bool = False,
) -> server_common.RequestId[None]:
    """Deletes one or more pools.

    Args:
        pool_names: A pool name, a list of pool names, or None; forwarded to
            ``impl.down``.
        all: Forwarded to ``impl.down`` as its second positional argument.
        purge: Forwarded to ``impl.down`` as its third positional argument.

    Returns:
        The request ID produced by ``impl.down``.
    """
    # ``pool=True`` is always passed so the shared implementation runs in
    # pool mode.
    request_id = impl.down(pool_names, all, purge, pool=True)
    return request_id
443
+
444
+
445
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(12)
def pool_status(
    pool_names: Optional[Union[str, List[str]]],
) -> server_common.RequestId[List[Dict[str, Any]]]:
    """Queries the status of one or more pools.

    Args:
        pool_names: A pool name, a list of pool names, or None; forwarded to
            ``impl.status``.

    Returns:
        The request ID produced by ``impl.status`` (called with
        ``pool=True``).
    """
    request_id = impl.status(pool_names, pool=True)
    return request_id
453
+
454
+
455
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@rest.retry_transient_errors()
@versions.minimal_api_version(16)
def pool_tail_logs(pool_name: str,
                   target: Union[str, 'serve_utils.ServiceComponent'],
                   worker_id: Optional[int] = None,
                   follow: bool = True,
                   output_stream: Optional['io.TextIOBase'] = None,
                   tail: Optional[int] = None) -> None:
    """Tails logs of a pool.

    Thin wrapper over ``impl.tail_logs`` that always passes ``pool=True``.

    Args:
        pool_name: Name of the pool.
        target: Component whose logs are tailed, as a string or a
            ``serve_utils.ServiceComponent``.
        worker_id: Optional worker ID, forwarded unchanged.
        follow: Forwarded unchanged to ``impl.tail_logs``.
        output_stream: Optional stream, forwarded unchanged.
        tail: Optional line count, forwarded unchanged.
    """
    # Arguments are forwarded positionally, in the order impl.tail_logs
    # expects them.
    forwarded = (pool_name, target, worker_id, follow, output_stream, tail)
    return impl.tail_logs(*forwarded, pool=True)
473
+
474
+
475
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@rest.retry_transient_errors()
@versions.minimal_api_version(16)
def pool_sync_down_logs(pool_name: str,
                        local_dir: str,
                        *,
                        targets: Optional[Union[
                            str, 'serve_utils.ServiceComponent', Sequence[Union[
                                str, 'serve_utils.ServiceComponent']]]] = None,
                        worker_ids: Optional[List[int]] = None,
                        tail: Optional[int] = None) -> None:
    """Syncs down logs of a pool.

    Thin wrapper over ``impl.sync_down_logs`` that always passes
    ``pool=True``.

    Args:
        pool_name: Name of the pool.
        local_dir: Local directory, forwarded unchanged.
        targets: A single component (string or
            ``serve_utils.ServiceComponent``), a sequence of them, or None.
        worker_ids: Optional worker IDs; passed to the implementation under
            its ``replica_ids`` keyword.
        tail: Optional line count, forwarded unchanged.
    """
    sync_options = {
        'targets': targets,
        # The shared implementation names this parameter ``replica_ids``.
        'replica_ids': worker_ids,
        'tail': tail,
        'pool': True,
    }
    return impl.sync_down_logs(pool_name, local_dir, **sync_options)
@@ -0,0 +1,143 @@
1
+ """Async SDK functions for managed jobs."""
2
+ import typing
3
+ from typing import Dict, List, Optional, Tuple, Union
4
+
5
+ from sky import backends
6
+ from sky import sky_logging
7
+ from sky.adaptors import common as adaptors_common
8
+ from sky.client import sdk_async
9
+ from sky.jobs.client import sdk
10
+ from sky.schemas.api import responses
11
+ from sky.skylet import constants
12
+ from sky.usage import usage_lib
13
+ from sky.utils import common_utils
14
+ from sky.utils import context_utils
15
+
16
+ if typing.TYPE_CHECKING:
17
+ import io
18
+
19
+ import requests
20
+
21
+ import sky
22
+ else:
23
+ requests = adaptors_common.LazyImport('requests')
24
+
25
+ logger = sky_logging.init_logger(__name__)
26
+
27
+
28
@usage_lib.entrypoint
async def launch(
    task: Union['sky.Task', 'sky.Dag'],
    name: Optional[str] = None,
    pool: Optional[str] = None,
    num_jobs: Optional[int] = None,
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False,
    stream_logs: Optional[
        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG,
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
    """Async version of launch() that launches a managed job.

    Submits the request by calling ``sdk.launch`` through
    ``context_utils.to_thread``, then awaits the request's result.

    Args:
        task: Task or DAG to launch.
        name: Optional job name, forwarded to ``sdk.launch``.
        pool: Optional pool name, forwarded to ``sdk.launch``.
        num_jobs: Optional job count, forwarded to ``sdk.launch``.
        _need_confirmation: Internal flag, forwarded to ``sdk.launch``.
        stream_logs: Stream configuration. When None the result is fetched
            with ``sdk_async.get``; otherwise it is obtained through
            ``sdk_async._stream_and_get`` with this configuration.

    Returns:
        The awaited result of the launch request.
    """
    request_id = await context_utils.to_thread(sdk.launch, task, name, pool,
                                               num_jobs, _need_confirmation)
    if stream_logs is None:
        return await sdk_async.get(request_id)
    return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
47
+
48
+
49
@usage_lib.entrypoint
async def queue(
    refresh: bool,
    skip_finished: bool = False,
    all_users: bool = False,
    job_ids: Optional[List[int]] = None,
    limit: Optional[int] = None,
    fields: Optional[List[str]] = None,
    stream_logs: Optional[
        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
) -> Union[List[responses.ManagedJobRecord], Tuple[
        List[responses.ManagedJobRecord], int, Dict[str, int], int]]:
    """Async version of queue() that gets statuses of managed jobs.

    Submits the request by calling ``sdk.queue`` through
    ``context_utils.to_thread``, then awaits the request's result.

    Args:
        refresh: Forwarded to ``sdk.queue``.
        skip_finished: Forwarded to ``sdk.queue``.
        all_users: Forwarded to ``sdk.queue``.
        job_ids: Optional job IDs, forwarded to ``sdk.queue``.
        limit: Optional limit, forwarded to ``sdk.queue``.
        fields: Optional field names, forwarded to ``sdk.queue``.
        stream_logs: Stream configuration. When None the result is fetched
            with ``sdk_async.get``; otherwise it is obtained through
            ``sdk_async._stream_and_get`` with this configuration.

    Returns:
        The awaited result of the queue request.
    """
    queue_args = (refresh, skip_finished, all_users, job_ids, limit, fields)
    request_id = await context_utils.to_thread(sdk.queue, *queue_args)
    if stream_logs is None:
        return await sdk_async.get(request_id)
    return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
69
+
70
+
71
@usage_lib.entrypoint
async def cancel(
    name: Optional[str] = None,
    job_ids: Optional[List[int]] = None,
    all: bool = False,  # pylint: disable=redefined-builtin
    all_users: bool = False,
    stream_logs: Optional[
        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG,
    pool: Optional[str] = None,
) -> None:
    """Async version of cancel() that cancels managed jobs.

    Submits the request by calling ``sdk.cancel`` through
    ``context_utils.to_thread``, then awaits the request's result.

    Args:
        name: Name of the managed job to cancel.
        job_ids: IDs of the managed jobs to cancel.
        all: Whether to cancel all managed jobs.
        all_users: Whether to cancel all managed jobs from all users.
        stream_logs: Stream configuration. When None the result is fetched
            with ``sdk_async.get``; otherwise it is obtained through
            ``sdk_async._stream_and_get`` with this configuration.
        pool: Pool name to cancel. Placed after ``stream_logs`` so existing
            positional callers are unaffected; forwarded to ``sdk.cancel``,
            which accepts it as its fifth parameter.
    """
    # Positional order mirrors sdk.cancel(name, job_ids, all, all_users, pool).
    request_id = await context_utils.to_thread(sdk.cancel, name, job_ids, all,
                                               all_users, pool)
    if stream_logs is not None:
        return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
    else:
        return await sdk_async.get(request_id)
87
+
88
+
89
@usage_lib.entrypoint
async def tail_logs(cluster_name: str,
                    job_id: Optional[int],
                    follow: bool,
                    tail: int = 0,
                    output_stream: Optional['io.TextIOBase'] = None
                   ) -> Optional[int]:
    """Async version of tail_logs() that tails the logs of a managed job.

    The synchronous ``sdk.tail_logs`` takes its parameters in the order
    ``(name, job_id, follow, controller, refresh, tail, output_stream)``.
    This wrapper does not expose ``controller`` or ``refresh``, so their
    defaults (``False``) are passed explicitly to keep ``tail`` and
    ``output_stream`` aligned with the right positions.

    Args:
        cluster_name: Passed as the ``name`` argument of ``sdk.tail_logs``.
            NOTE(review): despite the parameter name, the sync function
            treats this as the managed job name - confirm with callers.
        job_id: ID of the managed job, or None.
        follow: Whether to follow the logs.
        tail: Number of lines to tail from the end of the log file.
        output_stream: Stream to write the logs to, or None.

    Returns:
        Whatever ``sdk.tail_logs`` returns: an exit code, or None when
        ``follow`` is False.
    """
    return await context_utils.to_thread(
        sdk.tail_logs,
        cluster_name,
        job_id,
        follow,
        False,  # controller: not exposed here; use the sync default.
        False,  # refresh: not exposed here; use the sync default.
        tail,
        output_stream,
    )
104
+
105
+
106
@usage_lib.entrypoint
async def download_logs(
        name: Optional[str],
        job_id: Optional[int],
        refresh: bool,
        controller: bool,
        local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[int, str]:
    """Async version of download_logs() that syncs down logs of managed jobs.

    Calls ``sdk.download_logs`` through ``context_utils.to_thread`` with the
    arguments in the same order they are declared here.

    Args:
        name: Name of the managed job, or None.
        job_id: ID of the managed job, or None.
        refresh: Forwarded to ``sdk.download_logs``.
        controller: Forwarded to ``sdk.download_logs``.
        local_dir: Local directory for the logs; defaults to
            ``constants.SKY_LOGS_DIRECTORY``.

    Returns:
        The mapping returned by ``sdk.download_logs``.
    """
    download_args = (name, job_id, refresh, controller, local_dir)
    job_id_to_path = await context_utils.to_thread(sdk.download_logs,
                                                   *download_args)
    return job_id_to_path
116
+
117
+
118
@usage_lib.entrypoint
async def dashboard() -> None:
    """Async version of dashboard() that starts a dashboard for managed jobs.

    Calls the synchronous ``sdk.dashboard`` through
    ``context_utils.to_thread`` and returns its result.
    """
    outcome = await context_utils.to_thread(sdk.dashboard)
    return outcome
122
+
123
+
124
# Deprecated functions
# Each ``spot_*`` name wraps the corresponding coroutine in this module with
# common_utils.deprecated_function.
# NOTE(review): removing_version is '0.8.0' while this package is versioned
# 1.0.0.dev - confirm whether these aliases are still meant to ship.
spot_launch = common_utils.deprecated_function(
    launch,
    name='sky.jobs.launch',
    deprecated_name='spot_launch',
    removing_version='0.8.0',
    override_argument={'use_spot': True},
)
spot_queue = common_utils.deprecated_function(
    queue,
    name='sky.jobs.queue',
    deprecated_name='spot_queue',
    removing_version='0.8.0',
)
spot_cancel = common_utils.deprecated_function(
    cancel,
    name='sky.jobs.cancel',
    deprecated_name='spot_cancel',
    removing_version='0.8.0',
)
spot_tail_logs = common_utils.deprecated_function(
    tail_logs,
    name='sky.jobs.tail_logs',
    deprecated_name='spot_tail_logs',
    removing_version='0.8.0',
)
sky/jobs/constants.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Constants used for Managed Jobs."""
2
+ import os
2
3
  from typing import Any, Dict, Union
3
4
 
4
5
  from sky.skylet import constants as skylet_constants
@@ -9,17 +10,15 @@ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
9
10
 
10
11
  JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
11
12
 
13
+ JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
14
+
15
+ CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
16
+ SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
12
17
  # Resources as a dict for the jobs controller.
13
- # Use smaller CPU instance type for jobs controller, but with more memory, i.e.
14
- # r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
15
- # and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
16
- # Concurrently limits are set based on profiling. 4x num vCPUs is the launch
17
- # parallelism limit, and memory / 350MB is the limit to concurrently running
18
- # jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
19
18
  # We use 50 GB disk size to reduce the cost.
20
19
  CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
21
20
  'cpus': '4+',
22
- 'memory': '8x',
21
+ 'memory': '4x',
23
22
  'disk_size': 50
24
23
  }
25
24
 
@@ -47,7 +46,9 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
47
46
  # The version of the lib files that jobs/utils use. Whenever there is an API
48
47
  # change for the jobs/utils, we need to bump this version and update
49
48
  # job.utils.ManagedJobCodeGen to handle the version update.
50
- MANAGED_JOBS_VERSION = 3
49
+ # WARNING: If you update this due to a codegen change, make sure to make the
50
+ # corresponding change in the ManagedJobsService AND bump the SKYLET_VERSION.
51
+ MANAGED_JOBS_VERSION = 12
51
52
 
52
53
  # The command for setting up the jobs dashboard on the controller. It firstly
53
54
  # checks if the systemd services are available, and if not (e.g., Kubernetes