skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/__init__.py CHANGED
@@ -11,6 +11,7 @@ from sky.jobs.client.sdk import pool_status
11
11
  from sky.jobs.client.sdk import pool_sync_down_logs
12
12
  from sky.jobs.client.sdk import pool_tail_logs
13
13
  from sky.jobs.client.sdk import queue
14
+ from sky.jobs.client.sdk import queue_v2
14
15
  from sky.jobs.client.sdk import tail_logs
15
16
  from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
16
17
  from sky.jobs.constants import JOBS_CONTROLLER_LOGS_DIR
@@ -38,6 +39,7 @@ __all__ = [
38
39
  'cancel',
39
40
  'launch',
40
41
  'queue',
42
+ 'queue_v2',
41
43
  'tail_logs',
42
44
  'dashboard',
43
45
  'download_logs',
sky/jobs/client/sdk.py CHANGED
@@ -9,11 +9,13 @@ from sky import sky_logging
9
9
  from sky.adaptors import common as adaptors_common
10
10
  from sky.client import common as client_common
11
11
  from sky.client import sdk
12
+ from sky.schemas.api import responses
12
13
  from sky.serve.client import impl
13
14
  from sky.server import common as server_common
14
15
  from sky.server import rest
15
16
  from sky.server import versions
16
17
  from sky.server.requests import payloads
18
+ from sky.server.requests import request_names
17
19
  from sky.skylet import constants
18
20
  from sky.usage import usage_lib
19
21
  from sky.utils import admin_policy_utils
@@ -82,8 +84,11 @@ def launch(
82
84
  raise click.UsageError('Cannot specify num_jobs without pool.')
83
85
 
84
86
  dag = dag_utils.convert_entrypoint_to_dag(task)
87
+
85
88
  with admin_policy_utils.apply_and_use_config_in_current_request(
86
- dag, at_client_side=True) as dag:
89
+ dag,
90
+ request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH,
91
+ at_client_side=True) as dag:
87
92
  sdk.validate(dag)
88
93
  if _need_confirmation:
89
94
  job_identity = 'a managed job'
@@ -123,6 +128,87 @@ def launch(
123
128
  return server_common.get_request_id(response)
124
129
 
125
130
 
131
+ @usage_lib.entrypoint
132
+ @server_common.check_server_healthy_or_start
133
+ @versions.minimal_api_version(18)
134
+ def queue_v2(
135
+ refresh: bool,
136
+ skip_finished: bool = False,
137
+ all_users: bool = False,
138
+ job_ids: Optional[List[int]] = None,
139
+ limit: Optional[int] = None,
140
+ fields: Optional[List[str]] = None,
141
+ ) -> server_common.RequestId[Tuple[List[responses.ManagedJobRecord], int, Dict[
142
+ str, int], int]]:
143
+ """Gets statuses of managed jobs.
144
+
145
+ Please refer to sky.cli.job_queue for documentation.
146
+
147
+ Args:
148
+ refresh: Whether to restart the jobs controller if it is stopped.
149
+ skip_finished: Whether to skip finished jobs.
150
+ all_users: Whether to show all users' jobs.
151
+ job_ids: IDs of the managed jobs to show.
152
+ limit: Number of jobs to show.
153
+ fields: Fields to get for the managed jobs.
154
+
155
+ Returns:
156
+ The request ID of the queue request.
157
+
158
+ Request Returns:
159
+ job_records (List[responses.ManagedJobRecord]): A list of dicts, with each dict
160
+ containing the information of a job.
161
+
162
+ .. code-block:: python
163
+
164
+ [
165
+ {
166
+ 'job_id': (int) job id,
167
+ 'job_name': (str) job name,
168
+ 'resources': (str) resources of the job,
169
+ 'submitted_at': (float) timestamp of submission,
170
+ 'end_at': (float) timestamp of end,
171
+ 'job_duration': (float) duration in seconds,
172
+ 'recovery_count': (int) Number of retries,
173
+ 'status': (sky.jobs.ManagedJobStatus) of the job,
174
+ 'cluster_resources': (str) resources of the cluster,
175
+ 'region': (str) region of the cluster,
176
+ 'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
177
+ 'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
178
+ }
179
+ ]
180
+ total (int): Total number of jobs after filter,
181
+ status_counts (Dict[str, int]): Status counts after filter,
182
+ total_no_filter (int): Total number of jobs before filter,
183
+
184
+ Request Raises:
185
+ sky.exceptions.ClusterNotUpError: the jobs controller is not up or
186
+ does not exist.
187
+ RuntimeError: if failed to get the managed jobs with ssh.
188
+ """
189
+ body = payloads.JobsQueueV2Body(
190
+ refresh=refresh,
191
+ skip_finished=skip_finished,
192
+ all_users=all_users,
193
+ job_ids=job_ids,
194
+ limit=limit,
195
+ fields=fields,
196
+ )
197
+ path = '/jobs/queue/v2'
198
+ response = server_common.make_authenticated_request(
199
+ 'POST',
200
+ path,
201
+ json=json.loads(body.model_dump_json()),
202
+ timeout=(5, None))
203
+ return server_common.get_request_id(response=response)
204
+
205
+
206
+ # Deprecated. Please use queue_v2 instead for better performance.
207
+ # In https://github.com/skypilot-org/skypilot/pull/7695, the `queue` function
208
+ # is updated to return new typed data for performance improvement if the API
209
+ # server supports it, which breaks the backward compatibility.
210
+ # In https://github.com/skypilot-org/skypilot/pull/8015, we revert the change
211
+ # and add a new function `queue_v2` to return the new typed data.
126
212
  @usage_lib.entrypoint
127
213
  @server_common.check_server_healthy_or_start
128
214
  def queue(
@@ -130,9 +216,11 @@ def queue(
130
216
  skip_finished: bool = False,
131
217
  all_users: bool = False,
132
218
  job_ids: Optional[List[int]] = None
133
- ) -> server_common.RequestId[List[Dict[str, Any]]]:
219
+ ) -> server_common.RequestId[List[responses.ManagedJobRecord]]:
134
220
  """Gets statuses of managed jobs.
135
221
 
222
+ Deprecated. Please use queue_v2 instead for better performance.
223
+
136
224
  Please refer to sky.cli.job_queue for documentation.
137
225
 
138
226
  Args:
@@ -145,7 +233,7 @@ def queue(
145
233
  The request ID of the queue request.
146
234
 
147
235
  Request Returns:
148
- job_records (List[Dict[str, Any]]): A list of dicts, with each dict
236
+ job_records (List[responses.ManagedJobRecord]): A list of dicts, with each dict
149
237
  containing the information of a job.
150
238
 
151
239
  .. code-block:: python
@@ -383,15 +471,24 @@ def dashboard() -> None:
383
471
  @server_common.check_server_healthy_or_start
384
472
  @versions.minimal_api_version(12)
385
473
  def pool_apply(
386
- task: Union['sky.Task', 'sky.Dag'],
474
+ task: Optional[Union['sky.Task', 'sky.Dag']],
387
475
  pool_name: str,
388
476
  mode: 'serve_utils.UpdateMode',
477
+ workers: Optional[int] = None,
389
478
  # Internal only:
390
479
  # pylint: disable=invalid-name
391
480
  _need_confirmation: bool = False
392
481
  ) -> server_common.RequestId[None]:
393
482
  """Apply a config to a pool."""
483
+ remote_api_version = versions.get_remote_api_version()
484
+ if (workers is not None and
485
+ (remote_api_version is None or remote_api_version < 19)):
486
+ raise click.UsageError('Updating the number of workers in a pool is '
487
+ 'not supported in your API server. Please '
488
+ 'upgrade to a newer API server to use this '
489
+ 'feature.')
394
490
  return impl.apply(task,
491
+ workers,
395
492
  pool_name,
396
493
  mode,
397
494
  pool=True,
@@ -1,12 +1,13 @@
1
1
  """Async SDK functions for managed jobs."""
2
2
  import typing
3
- from typing import Any, Dict, List, Optional, Tuple, Union
3
+ from typing import Dict, List, Optional, Tuple, Union
4
4
 
5
5
  from sky import backends
6
6
  from sky import sky_logging
7
7
  from sky.adaptors import common as adaptors_common
8
8
  from sky.client import sdk_async
9
9
  from sky.jobs.client import sdk
10
+ from sky.schemas.api import responses
10
11
  from sky.skylet import constants
11
12
  from sky.usage import usage_lib
12
13
  from sky.utils import common_utils
@@ -28,6 +29,8 @@ logger = sky_logging.init_logger(__name__)
28
29
  async def launch(
29
30
  task: Union['sky.Task', 'sky.Dag'],
30
31
  name: Optional[str] = None,
32
+ pool: Optional[str] = None,
33
+ num_jobs: Optional[int] = None,
31
34
  # Internal only:
32
35
  # pylint: disable=invalid-name
33
36
  _need_confirmation: bool = False,
@@ -35,8 +38,29 @@ async def launch(
35
38
  sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG,
36
39
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
37
40
  """Async version of launch() that launches a managed job."""
38
- request_id = await context_utils.to_thread(sdk.launch, task, name,
39
- _need_confirmation)
41
+ request_id = await context_utils.to_thread(sdk.launch, task, name, pool,
42
+ num_jobs, _need_confirmation)
43
+ if stream_logs is not None:
44
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
45
+ else:
46
+ return await sdk_async.get(request_id)
47
+
48
+
49
+ @usage_lib.entrypoint
50
+ async def queue_v2(
51
+ refresh: bool,
52
+ skip_finished: bool = False,
53
+ all_users: bool = False,
54
+ job_ids: Optional[List[int]] = None,
55
+ limit: Optional[int] = None,
56
+ fields: Optional[List[str]] = None,
57
+ stream_logs: Optional[
58
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
59
+ ) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
60
+ """Async version of queue_v2() that gets statuses of managed jobs."""
61
+ request_id = await context_utils.to_thread(sdk.queue_v2, refresh,
62
+ skip_finished, all_users,
63
+ job_ids, limit, fields)
40
64
  if stream_logs is not None:
41
65
  return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
42
66
  else:
@@ -48,12 +72,14 @@ async def queue(
48
72
  refresh: bool,
49
73
  skip_finished: bool = False,
50
74
  all_users: bool = False,
75
+ job_ids: Optional[List[int]] = None,
51
76
  stream_logs: Optional[
52
77
  sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
53
- ) -> List[Dict[str, Any]]:
78
+ ) -> List[responses.ManagedJobRecord]:
54
79
  """Async version of queue() that gets statuses of managed jobs."""
55
80
  request_id = await context_utils.to_thread(sdk.queue, refresh,
56
- skip_finished, all_users)
81
+ skip_finished, all_users,
82
+ job_ids)
57
83
  if stream_logs is not None:
58
84
  return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
59
85
  else:
sky/jobs/constants.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Constants used for Managed Jobs."""
2
+ import os
2
3
  from typing import Any, Dict, Union
3
4
 
4
5
  from sky.skylet import constants as skylet_constants
@@ -9,17 +10,21 @@ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
9
10
 
10
11
  JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
11
12
 
13
+ JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
14
+
15
+ CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
16
+ SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
17
+
18
+ # The consolidation mode lock ensures that if multiple API servers are running
19
+ # at the same time (e.g. during a rolling update), recovery can only happen once
20
+ # the previous API server has exited.
21
+ CONSOLIDATION_MODE_LOCK_ID = '~/.sky/consolidation_mode_lock'
22
+
12
23
  # Resources as a dict for the jobs controller.
13
- # Use smaller CPU instance type for jobs controller, but with more memory, i.e.
14
- # r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
15
- # and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
16
- # Concurrently limits are set based on profiling. 4x num vCPUs is the launch
17
- # parallelism limit, and memory / 350MB is the limit to concurrently running
18
- # jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
19
24
  # We use 50 GB disk size to reduce the cost.
20
25
  CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
21
26
  'cpus': '4+',
22
- 'memory': '8x',
27
+ 'memory': '4x',
23
28
  'disk_size': 50
24
29
  }
25
30
 
@@ -47,7 +52,9 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
47
52
  # The version of the lib files that jobs/utils use. Whenever there is an API
48
53
  # change for the jobs/utils, we need to bump this version and update
49
54
  # job.utils.ManagedJobCodeGen to handle the version update.
50
- MANAGED_JOBS_VERSION = 10
55
+ # WARNING: If you update this due to a codegen change, make sure to make the
56
+ # corresponding change in the ManagedJobsService AND bump the SKYLET_VERSION.
57
+ MANAGED_JOBS_VERSION = 12
51
58
 
52
59
  # The command for setting up the jobs dashboard on the controller. It firstly
53
60
  # checks if the systemd services are available, and if not (e.g., Kubernetes