skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/jobs/server/server.py CHANGED
@@ -5,11 +5,13 @@ import pathlib
5
5
  import fastapi
6
6
 
7
7
  from sky import sky_logging
8
+ from sky.jobs import utils as managed_jobs_utils
8
9
  from sky.jobs.server import core
9
10
  from sky.server import common as server_common
10
11
  from sky.server import stream_utils
11
12
  from sky.server.requests import executor
12
13
  from sky.server.requests import payloads
14
+ from sky.server.requests import request_names
13
15
  from sky.server.requests import requests as api_requests
14
16
  from sky.skylet import constants
15
17
  from sky.utils import common
@@ -22,22 +24,36 @@ router = fastapi.APIRouter()
22
24
  @router.post('/launch')
23
25
  async def launch(request: fastapi.Request,
24
26
  jobs_launch_body: payloads.JobsLaunchBody) -> None:
25
- executor.schedule_request(
27
+ # In consolidation mode, the jobs controller will use sky.launch on the same
28
+ # API server to launch the underlying job cluster. If you start running many
29
+ # jobs.launch requests, some may be blocked for a long time by sky.launch
30
+ # requests triggered by earlier jobs, which leads to confusing behavior as
31
+ # the jobs.launch requests trickle through. Also, since we don't have to
32
+ # actually launch a jobs controller sky cluster, the jobs.launch request is
33
+ # much quicker in consolidation mode. So we avoid the issue by just using
34
+ # the short executor instead - then jobs.launch will not be blocked by
35
+ # sky.launch.
36
+ consolidation_mode = managed_jobs_utils.is_consolidation_mode()
37
+ schedule_type = (api_requests.ScheduleType.SHORT
38
+ if consolidation_mode else api_requests.ScheduleType.LONG)
39
+ await executor.schedule_request_async(
26
40
  request_id=request.state.request_id,
27
- request_name='jobs.launch',
41
+ request_name=request_names.RequestName.JOBS_LAUNCH,
28
42
  request_body=jobs_launch_body,
29
43
  func=core.launch,
30
- schedule_type=api_requests.ScheduleType.LONG,
44
+ schedule_type=schedule_type,
31
45
  request_cluster_name=common.JOB_CONTROLLER_NAME,
32
46
  )
33
47
 
34
48
 
49
+ # For backwards compatibility
50
+ # TODO(hailong): Remove before 0.12.0.
35
51
  @router.post('/queue')
36
52
  async def queue(request: fastapi.Request,
37
53
  jobs_queue_body: payloads.JobsQueueBody) -> None:
38
- executor.schedule_request(
54
+ await executor.schedule_request_async(
39
55
  request_id=request.state.request_id,
40
- request_name='jobs.queue',
56
+ request_name=request_names.RequestName.JOBS_QUEUE,
41
57
  request_body=jobs_queue_body,
42
58
  func=core.queue,
43
59
  schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
@@ -46,12 +62,27 @@ async def queue(request: fastapi.Request,
46
62
  )
47
63
 
48
64
 
65
+ @router.post('/queue/v2')
66
+ async def queue_v2(request: fastapi.Request,
67
+ jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
68
+ await executor.schedule_request_async(
69
+ request_id=request.state.request_id,
70
+ request_name=request_names.RequestName.JOBS_QUEUE_V2,
71
+ request_body=jobs_queue_body_v2,
72
+ func=core.queue_v2_api,
73
+ schedule_type=(api_requests.ScheduleType.LONG
74
+ if jobs_queue_body_v2.refresh else
75
+ api_requests.ScheduleType.SHORT),
76
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
77
+ )
78
+
79
+
49
80
  @router.post('/cancel')
50
81
  async def cancel(request: fastapi.Request,
51
82
  jobs_cancel_body: payloads.JobsCancelBody) -> None:
52
- executor.schedule_request(
83
+ await executor.schedule_request_async(
53
84
  request_id=request.state.request_id,
54
- request_name='jobs.cancel',
85
+ request_name=request_names.RequestName.JOBS_CANCEL,
55
86
  request_body=jobs_cancel_body,
56
87
  func=core.cancel,
57
88
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -64,28 +95,39 @@ async def logs(
64
95
  request: fastapi.Request, jobs_logs_body: payloads.JobsLogsBody,
65
96
  background_tasks: fastapi.BackgroundTasks
66
97
  ) -> fastapi.responses.StreamingResponse:
67
- executor.schedule_request(
98
+ schedule_type = api_requests.ScheduleType.SHORT
99
+ if jobs_logs_body.refresh:
100
+ # When refresh is specified, the job controller might be restarted,
101
+ # which takes a longer time to finish. We schedule it to the long executor.
102
+ schedule_type = api_requests.ScheduleType.LONG
103
+ if schedule_type == api_requests.ScheduleType.SHORT:
104
+ executor.check_request_thread_executor_available()
105
+ request_task = await executor.prepare_request_async(
68
106
  request_id=request.state.request_id,
69
- request_name='jobs.logs',
107
+ request_name=request_names.RequestName.JOBS_LOGS,
70
108
  request_body=jobs_logs_body,
71
109
  func=core.tail_logs,
72
- # TODO(aylei): We have tail logs scheduled as SHORT request, because it
73
- # should be responsive. However, it can be long running if the user's
74
- # job keeps running, and we should avoid it taking the SHORT worker
75
- # indefinitely.
76
- # When refresh is True we schedule it as LONG because a controller
77
- # restart might be needed.
78
- schedule_type=api_requests.ScheduleType.LONG
79
- if jobs_logs_body.refresh else api_requests.ScheduleType.SHORT,
110
+ schedule_type=schedule_type,
80
111
  request_cluster_name=common.JOB_CONTROLLER_NAME,
81
112
  )
82
- request_task = await api_requests.get_request_async(request.state.request_id
83
- )
84
-
85
- return stream_utils.stream_response(
113
+ kill_request_on_disconnect = False
114
+ if schedule_type == api_requests.ScheduleType.SHORT:
115
+ # For short request, run in the coroutine to avoid blocking
116
+ # short workers.
117
+ task = executor.execute_request_in_coroutine(request_task)
118
+ # Cancel the coroutine after the request is done or client disconnects
119
+ background_tasks.add_task(task.cancel)
120
+ else:
121
+ executor.schedule_prepared_request(request_task)
122
+ # When running in the long executor process, we should kill the request on
123
+ # disconnect to cancel the running routine.
124
+ kill_request_on_disconnect = True
125
+
126
+ return stream_utils.stream_response_for_long_request(
86
127
  request_id=request_task.request_id,
87
128
  logs_path=request_task.log_path,
88
129
  background_tasks=background_tasks,
130
+ kill_request_on_disconnect=kill_request_on_disconnect,
89
131
  )
90
132
 
91
133
 
@@ -100,9 +142,9 @@ async def download_logs(
100
142
  # We should reuse the original request body, so that the env vars, such as
101
143
  # user hash, are kept the same.
102
144
  jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
103
- executor.schedule_request(
145
+ await executor.schedule_request_async(
104
146
  request_id=request.state.request_id,
105
- request_name='jobs.download_logs',
147
+ request_name=request_names.RequestName.JOBS_DOWNLOAD_LOGS,
106
148
  request_body=jobs_download_logs_body,
107
149
  func=core.download_logs,
108
150
  schedule_type=api_requests.ScheduleType.LONG
@@ -114,9 +156,9 @@ async def download_logs(
114
156
  @router.post('/pool_apply')
115
157
  async def pool_apply(request: fastapi.Request,
116
158
  jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
117
- executor.schedule_request(
159
+ await executor.schedule_request_async(
118
160
  request_id=request.state.request_id,
119
- request_name='jobs.pool_apply',
161
+ request_name=request_names.RequestName.JOBS_POOL_APPLY,
120
162
  request_body=jobs_pool_apply_body,
121
163
  func=core.pool_apply,
122
164
  schedule_type=api_requests.ScheduleType.LONG,
@@ -127,9 +169,9 @@ async def pool_apply(request: fastapi.Request,
127
169
  @router.post('/pool_down')
128
170
  async def pool_down(request: fastapi.Request,
129
171
  jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
130
- executor.schedule_request(
172
+ await executor.schedule_request_async(
131
173
  request_id=request.state.request_id,
132
- request_name='jobs.pool_down',
174
+ request_name=request_names.RequestName.JOBS_POOL_DOWN,
133
175
  request_body=jobs_pool_down_body,
134
176
  func=core.pool_down,
135
177
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -141,9 +183,9 @@ async def pool_down(request: fastapi.Request,
141
183
  async def pool_status(
142
184
  request: fastapi.Request,
143
185
  jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
144
- executor.schedule_request(
186
+ await executor.schedule_request_async(
145
187
  request_id=request.state.request_id,
146
- request_name='jobs.pool_status',
188
+ request_name=request_names.RequestName.JOBS_POOL_STATUS,
147
189
  request_body=jobs_pool_status_body,
148
190
  func=core.pool_status,
149
191
  schedule_type=api_requests.ScheduleType.SHORT,
@@ -156,21 +198,25 @@ async def pool_tail_logs(
156
198
  request: fastapi.Request, log_body: payloads.JobsPoolLogsBody,
157
199
  background_tasks: fastapi.BackgroundTasks
158
200
  ) -> fastapi.responses.StreamingResponse:
159
- executor.schedule_request(
201
+ await executor.schedule_request_async(
160
202
  request_id=request.state.request_id,
161
- request_name='jobs.pool_logs',
203
+ request_name=request_names.RequestName.JOBS_POOL_LOGS,
162
204
  request_body=log_body,
163
205
  func=core.pool_tail_logs,
164
206
  schedule_type=api_requests.ScheduleType.SHORT,
165
207
  request_cluster_name=common.JOB_CONTROLLER_NAME,
166
208
  )
167
209
 
168
- request_task = api_requests.get_request(request.state.request_id)
210
+ request_task = await api_requests.get_request_async(
211
+ request.state.request_id, fields=['request_id'])
169
212
 
170
- return stream_utils.stream_response(
213
+ return stream_utils.stream_response_for_long_request(
171
214
  request_id=request_task.request_id,
215
+ # req.log_path is derived from request_id,
216
+ # so it's ok to just grab the request_id in the above query.
172
217
  logs_path=request_task.log_path,
173
218
  background_tasks=background_tasks,
219
+ kill_request_on_disconnect=True,
174
220
  )
175
221
 
176
222
 
@@ -188,9 +234,9 @@ async def pool_download_logs(
188
234
  # We should reuse the original request body, so that the env vars, such as
189
235
  # user hash, are kept the same.
190
236
  download_logs_body.local_dir = str(logs_dir_on_api_server)
191
- executor.schedule_request(
237
+ await executor.schedule_request_async(
192
238
  request_id=request.state.request_id,
193
- request_name='jobs.pool_sync_down_logs',
239
+ request_name=request_names.RequestName.JOBS_POOL_SYNC_DOWN_LOGS,
194
240
  request_body=download_logs_body,
195
241
  func=core.pool_sync_down_logs,
196
242
  schedule_type=api_requests.ScheduleType.SHORT,
sky/jobs/server/utils.py CHANGED
@@ -1,17 +1,32 @@
1
1
  """Utility functions for managed jobs."""
2
+ import typing
3
+
2
4
  from sky import backends
5
+ from sky import exceptions
3
6
  from sky import sky_logging
7
+ from sky.adaptors import common as adaptors_common
4
8
  from sky.backends import backend_utils
9
+ from sky.backends import cloud_vm_ray_backend
5
10
  from sky.jobs import utils as managed_job_utils
6
11
  from sky.skylet import constants as skylet_constants
7
12
  from sky.utils import controller_utils
8
13
 
9
14
  logger = sky_logging.init_logger(__name__)
10
15
 
16
+ if typing.TYPE_CHECKING:
17
+ from sky.schemas.generated import managed_jobsv1_pb2
18
+ else:
19
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
20
+ 'sky.schemas.generated.managed_jobsv1_pb2')
21
+
22
+ _MANAGED_JOB_FIELDS_TO_GET = [
23
+ 'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
24
+ 'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
25
+ ]
26
+
11
27
 
12
28
  def check_version_mismatch_and_non_terminal_jobs() -> None:
13
29
  """Check if controller has version mismatch and non-terminal jobs exist.
14
-
15
30
  Raises:
16
31
  ValueError: If there's a version mismatch and non-terminal jobs exist.
17
32
  sky.exceptions.ClusterNotUpError: If the controller is not accessible.
@@ -29,48 +44,87 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
29
44
  backend = backend_utils.get_backend_from_handle(handle)
30
45
  assert isinstance(backend, backends.CloudVmRayBackend)
31
46
 
32
- # Get controller version and raw job table
33
- code = managed_job_utils.ManagedJobCodeGen.get_version_and_job_table()
34
-
35
- returncode, output, stderr = backend.run_on_head(handle,
36
- code,
37
- require_outputs=True,
38
- stream_logs=False,
39
- separate_stderr=True)
40
-
41
- if returncode != 0:
42
- logger.error(output + stderr)
43
- raise ValueError('Failed to check controller version and jobs with '
44
- f'returncode: {returncode}.\n{output + stderr}')
45
-
46
- # Parse the output to extract controller version (split only on first
47
- # newline)
48
- output_parts = output.strip().split('\n', 1)
49
-
50
- # Extract controller version from first line
51
- if len(output_parts) < 2 or not output_parts[0].startswith(
52
- 'controller_version:'):
53
- raise ValueError(
54
- f'Expected controller version in first line, got: {output}')
55
-
56
- controller_version = output_parts[0].split(':', 1)[1]
57
-
58
- # Rest is job table payload (preserving any newlines within it)
59
- job_table_payload = output_parts[1]
47
+ use_legacy = not handle.is_grpc_enabled_with_flag
48
+
49
+ if not use_legacy:
50
+ try:
51
+ version_request = managed_jobsv1_pb2.GetVersionRequest()
52
+ version_response = backend_utils.invoke_skylet_with_retries(
53
+ lambda: cloud_vm_ray_backend.SkyletClient(
54
+ handle.get_grpc_channel(
55
+ )).get_managed_job_controller_version(version_request))
56
+ controller_version = version_response.controller_version
57
+
58
+ job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
59
+ skip_finished=True,
60
+ fields=managed_jobsv1_pb2.Fields(
61
+ fields=_MANAGED_JOB_FIELDS_TO_GET),
62
+ )
63
+ job_table_response = backend_utils.invoke_skylet_with_retries(
64
+ lambda: cloud_vm_ray_backend.SkyletClient(
65
+ handle.get_grpc_channel()).get_managed_job_table(
66
+ job_table_request))
67
+ jobs = managed_job_utils.decode_managed_job_protos(
68
+ job_table_response.jobs)
69
+ except exceptions.SkyletMethodNotImplementedError:
70
+ use_legacy = True
71
+
72
+ if use_legacy:
73
+ # Get controller version and raw job table
74
+ code = managed_job_utils.ManagedJobCodeGen.get_version()
75
+
76
+ returncode, output, stderr = backend.run_on_head(handle,
77
+ code,
78
+ require_outputs=True,
79
+ stream_logs=False,
80
+ separate_stderr=True)
81
+
82
+ if returncode != 0:
83
+ logger.error(output + stderr)
84
+ raise ValueError('Failed to check controller version with '
85
+ f'returncode: {returncode}.\n{output + stderr}')
86
+
87
+ # Parse the output to extract controller version (split only on first
88
+ # newline)
89
+ output_parts = output.strip().split('\n', 1)
90
+
91
+ # Extract controller version from first line
92
+ if not output_parts[0].startswith('controller_version:'):
93
+ raise ValueError(
94
+ f'Expected controller version in first line, got: {output}')
95
+
96
+ controller_version = output_parts[0].split(':', 1)[1]
97
+
98
+ code = managed_job_utils.ManagedJobCodeGen.get_job_table(
99
+ skip_finished=True, fields=_MANAGED_JOB_FIELDS_TO_GET)
100
+ returncode, job_table_payload, stderr = backend.run_on_head(
101
+ handle,
102
+ code,
103
+ require_outputs=True,
104
+ stream_logs=False,
105
+ separate_stderr=True)
106
+
107
+ if returncode != 0:
108
+ logger.error(job_table_payload + stderr)
109
+ raise ValueError('Failed to fetch managed jobs with returncode: '
110
+ f'{returncode}.\n{job_table_payload + stderr}')
111
+
112
+ jobs, _, _, _, _ = (
113
+ managed_job_utils.load_managed_job_queue(job_table_payload))
60
114
 
61
115
  # Process locally: check version match and filter non-terminal jobs
62
- version_matches = controller_version == local_version
63
-
64
- # Load and filter jobs locally using existing method
65
- jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
66
- job_table_payload)
116
+ version_matches = (controller_version == local_version or
117
+ int(controller_version) > 17)
67
118
  non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
68
119
  has_non_terminal_jobs = len(non_terminal_jobs) > 0
69
120
 
70
121
  if not version_matches and has_non_terminal_jobs:
71
122
  # Format job table locally using the same method as queue()
72
123
  formatted_job_table = managed_job_utils.format_job_table(
73
- non_terminal_jobs, show_all=False, show_user=False)
124
+ non_terminal_jobs,
125
+ pool_status=None,
126
+ show_all=False,
127
+ show_user=False)
74
128
 
75
129
  error_msg = (
76
130
  f'Controller SKYLET_VERSION ({controller_version}) does not match '