skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/jobs/server/core.py CHANGED
@@ -1,9 +1,13 @@
 """SDK functions for managed jobs."""
+import concurrent.futures
+import copy
+import ipaddress
 import os
 import pathlib
 import tempfile
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib import parse as urlparse
 import uuid
 
 import colorama
@@ -17,16 +21,21 @@ from sky import provision as provision_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.metrics import utils as metrics_lib
 from sky.provision import common as provision_common
+from sky.schemas.api import responses
 from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.serve.server import impl
+from sky.server.requests import request_names
 from sky.skylet import constants as skylet_constants
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -42,11 +51,47 @@ from sky.utils import ux_utils
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
+    from google.protobuf import json_format
+
     import sky
-    from sky.backends import cloud_vm_ray_backend
+    from sky.schemas.generated import managed_jobsv1_pb2
+else:
+    json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
+_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
+    'job_id',
+    'task_id',
+    'workspace',
+    'job_name',
+    'task_name',
+    'resources',
+    'submitted_at',
+    'end_at',
+    'job_duration',
+    'recovery_count',
+    'status',
+    'pool',
+    'current_cluster_name',
+    'job_id_on_pool_cluster',
+    'start_at',
+    'infra',
+    'cloud',
+    'region',
+    'zone',
+    'cluster_resources',
+    'schedule_state',
+    'details',
+    'failure_reason',
+    'metadata',
+    'user_name',
+    'user_hash',
+]
+
 
 def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """Upload files to the controller.
@@ -129,7 +174,8 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
                 force_user_workspace=True),
             entrypoint=common_utils.get_current_command(),
             pool=pool,
-            pool_hash=pool_hash))
+            pool_hash=pool_hash,
+            user_hash=common_utils.get_user_hash()))
     for task_id, task in enumerate(dag.tasks):
         resources_str = backend_utils.get_task_resources_str(
             task, is_managed_job=True)
@@ -188,10 +234,12 @@ def launch(
 
     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
-    dag, mutated_user_config = admin_policy_utils.apply(dag)
+    dag, mutated_user_config = admin_policy_utils.apply(
+        dag, request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH)
     dag.resolve_and_validate_volumes()
     if not dag.is_chain():
         with ux_utils.print_exception_no_traceback():
@@ -202,6 +250,21 @@
     # pre-mount operations when submitting jobs.
     dag.pre_mount_volumes()
 
+    # If there is a local postgres db, when the api server tries launching on
+    # the remote jobs controller it will fail. therefore, we should remove this
+    # before sending the config to the jobs controller.
+    # TODO(luca) there are a lot of potential problems with postgres being sent
+    # to the jobs controller. for example if the postgres is whitelisted to
+    # only the API server, this will then break. the simple solution to that is
+    # telling the user to add the jobs controller to the postgres whitelist.
+    if not managed_job_utils.is_consolidation_mode():
+        db_path = mutated_user_config.get('db', None)
+        if db_path is not None:
+            parsed = urlparse.urlparse(db_path)
+            if ((parsed.hostname == 'localhost' or
+                 ipaddress.ip_address(parsed.hostname).is_loopback)):
+                mutated_user_config.pop('db', None)
+
     user_dag_str_user_specified = dag_utils.dump_chain_dag_to_yaml_str(
         dag, use_user_specified_yaml=True)
 
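The loopback detection above combines `urllib.parse` with the standard `ipaddress` module. A standalone sketch of that logic on a few example URIs follows; note that `ipaddress.ip_address` raises `ValueError` for hostnames that are not IP literals, so a general-purpose version needs the extra guard shown here (the function name `is_local_db_uri` is illustrative, not from the package).

    import ipaddress
    from urllib import parse as urlparse

    def is_local_db_uri(uri: str) -> bool:
        """Returns True if the connection URI points at localhost/loopback."""
        hostname = urlparse.urlparse(uri).hostname
        if hostname is None:
            return False
        if hostname == 'localhost':
            return True
        try:
            return ipaddress.ip_address(hostname).is_loopback
        except ValueError:
            # Hostname is a DNS name, not an IP literal.
            return False

    assert is_local_db_uri('postgresql://user@localhost:5432/sky')
    assert is_local_db_uri('postgresql://user@127.0.0.1/sky')
    assert not is_local_db_uri('postgresql://user@db.internal:5432/sky')
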
@@ -263,15 +326,13 @@
     # Check whether cached jobs controller cluster is accessible
     cluster_name = (
         controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
-    record = global_user_state.get_cluster_from_name(cluster_name)
-    if record is not None:
+    if global_user_state.cluster_with_name_exists(cluster_name):
         # there is a cached jobs controller cluster
         try:
             # TODO: do something with returned status?
             _, _ = backend_utils.refresh_cluster_status_handle(
                 cluster_name=cluster_name,
-                force_refresh_statuses=set(status_lib.ClusterStatus),
-                acquire_per_cluster_status_lock=False)
+                force_refresh_statuses=set(status_lib.ClusterStatus))
         except (exceptions.ClusterOwnerIdentityMismatchError,
                 exceptions.CloudUserIdentityError,
                 exceptions.ClusterStatusFetchingError) as e:
@@ -309,6 +370,7 @@ def launch(
     def _submit_one(
         consolidation_mode_job_id: Optional[int] = None,
         job_rank: Optional[int] = None,
+        num_jobs: Optional[int] = None,
    ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
         rank_suffix = '' if job_rank is None else f'-{job_rank}'
         remote_original_user_yaml_path = (
@@ -328,11 +390,16 @@
         ) as original_user_yaml_path:
             original_user_yaml_path.write(user_dag_str_user_specified)
             original_user_yaml_path.flush()
-            for task_ in dag.tasks:
+            # Copy tasks to avoid race conditions when multiple threads modify
+            # the same dag object concurrently. Each thread needs its own copy.
+            dag_copy = copy.deepcopy(dag)
+            for task_ in dag_copy.tasks:
                 if job_rank is not None:
                     task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
+                if num_jobs is not None:
+                    task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
 
-            dag_utils.dump_chain_dag_to_yaml(dag, f.name)
+            dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)
 
             vars_to_fill = {
                 'remote_original_user_yaml_path':
@@ -351,6 +418,8 @@
                 'priority': priority,
                 'consolidation_mode_job_id': consolidation_mode_job_id,
                 'pool': pool,
+                'job_controller_indicator_file':
+                    managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE,
                 **controller_utils.shared_controller_vars_to_fill(
                     controller,
                     remote_user_config_path=remote_user_config_path,
@@ -363,7 +432,8 @@
 
             yaml_path = os.path.join(
                 managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
-                f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml')
+                f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
+            )
             common_utils.fill_template(
                 managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
                 vars_to_fill,
@@ -371,16 +441,19 @@
             controller_task = task_lib.Task.from_yaml(yaml_path)
             controller_task.set_resources(controller_resources)
 
-            controller_task.managed_job_dag = dag
+            controller_task.managed_job_dag = dag_copy
             # pylint: disable=protected-access
             controller_task._metadata = metadata
 
             job_identity = ''
             if job_rank is not None:
                 job_identity = f' (rank: {job_rank})'
-            logger.info(f'{colorama.Fore.YELLOW}'
-                        f'Launching managed job {dag.name!r}{job_identity} '
-                        f'from jobs controller...{colorama.Style.RESET_ALL}')
+            job_controller_postfix = (' from jobs controller' if
+                                      consolidation_mode_job_id is None else '')
+            logger.info(
+                f'{colorama.Fore.YELLOW}'
+                f'Launching managed job {dag.name!r}{job_identity}'
+                f'{job_controller_postfix}...{colorama.Style.RESET_ALL}')
 
             # Launch with the api server's user hash, so that sky status does
             # not show the owner of the controller as whatever user launched
@@ -395,19 +468,24 @@
             # intermediate bucket and newly created bucket should be in
             # workspace A.
             if consolidation_mode_job_id is None:
-                return execution.launch(task=controller_task,
-                                        cluster_name=controller_name,
-                                        stream_logs=stream_logs,
-                                        retry_until_up=True,
-                                        fast=True,
-                                        _disable_controller_check=True)
+                return execution.launch(
+                    task=controller_task,
+                    cluster_name=controller_name,
+                    stream_logs=stream_logs,
+                    retry_until_up=True,
+                    fast=True,
+                    _request_name=request_names.AdminPolicyRequestName.
+                    JOBS_LAUNCH_CONTROLLER,
+                    _disable_controller_check=True)
             # Manually launch the scheduler in consolidation mode.
             local_handle = backend_utils.is_controller_accessible(
                 controller=controller, stopped_message='')
             backend = backend_utils.get_backend_from_handle(
                 local_handle)
             assert isinstance(backend, backends.CloudVmRayBackend)
-            with sky_logging.silent():
+            # Suppress file mount logs when submitting multiple jobs.
+            should_silence = num_jobs is not None and num_jobs > 1
+            with sky_logging.silent(should_silence):
                 backend.sync_file_mounts(
                     handle=local_handle,
                     all_file_mounts=controller_task.file_mounts,
@@ -423,12 +501,16 @@
                     for k, v in controller_task.envs.items()
                 ]
                 run_script = '\n'.join(env_cmds + [run_script])
-                # Dump script for high availability recovery.
-                if controller_utils.high_availability_specified(
-                        controller_name):
-                    managed_job_state.set_ha_recovery_script(
-                        consolidation_mode_job_id, run_script)
-                backend.run_on_head(local_handle, run_script)
+                log_dir = os.path.join(skylet_constants.SKY_LOGS_DIRECTORY,
+                                       'managed_jobs')
+                os.makedirs(log_dir, exist_ok=True)
+                log_path = os.path.join(
+                    log_dir, f'submit-job-{consolidation_mode_job_id}.log')
+                backend.run_on_head(local_handle,
+                                    run_script,
+                                    log_path=log_path)
+                ux_utils.starting_message(
+                    f'Job submitted, ID: {consolidation_mode_job_id}')
             return consolidation_mode_job_id, local_handle
 
     if pool is None:
@@ -437,15 +519,49 @@
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
 
-    ids = []
-    all_handle = None
-    for job_rank in range(num_jobs):
-        job_id = (consolidation_mode_job_ids[job_rank]
+    ids: List[int] = []
+    all_handle: Optional[backends.ResourceHandle] = None
+
+    if num_jobs == 1:
+        job_id = (consolidation_mode_job_ids[0]
                   if consolidation_mode_job_ids is not None else None)
-        jid, handle = _submit_one(job_id, job_rank)
+        jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
         all_handle = handle
+    else:
+        # Submit jobs in parallel using ThreadPoolExecutor
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=min(num_jobs,
+                                os.cpu_count() or 1)) as executor:
+            # Submit jobs concurrently
+            future_to_rank = {}
+            for job_rank in range(num_jobs):
+                job_id = (consolidation_mode_job_ids[job_rank]
+                          if consolidation_mode_job_ids is not None else None)
+                future = executor.submit(_submit_one, job_id, job_rank,
+                                         num_jobs)
+                future_to_rank[future] = job_rank
+
+            # Collect results in order of job_rank to maintain consistent order.
+            results: List[Optional[Tuple[
+                int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
+            for future in concurrent.futures.as_completed(future_to_rank):
+                job_rank = future_to_rank[future]
+                try:
+                    jid, handle = future.result()
+                    assert jid is not None, (job_id, handle)
+                    results[job_rank] = (jid, handle)
+                    all_handle = handle  # Keep the last handle.
+                except Exception as e:
+                    logger.error(f'Error launching job {job_rank}: {e}')
+                    raise e
+
+        # Extract job IDs in order
+        for res in results:
+            if res is not None:
+                ids.append(res[0])
+
     return ids, all_handle
 
 
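The parallel-submission branch above follows a common ThreadPoolExecutor pattern: submit one future per rank, iterate with `as_completed`, and write each result into a rank-indexed slot so the output order is deterministic even though futures finish in arbitrary order. A minimal sketch of that pattern, with a stand-in `submit_one` in place of the real per-job submission call:

    import concurrent.futures
    import os
    import random
    import time
    from typing import List, Optional, Tuple

    def submit_one(job_id: int, rank: int) -> Tuple[int, str]:
        """Stand-in for a per-job submission call."""
        time.sleep(random.uniform(0, 0.05))  # Jobs finish out of order.
        return job_id, f'handle-{rank}'

    num_jobs = 4
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=min(num_jobs, os.cpu_count() or 1)) as executor:
        future_to_rank = {
            executor.submit(submit_one, 100 + rank, rank): rank
            for rank in range(num_jobs)
        }
        # Slots indexed by rank keep results ordered regardless of
        # which future completes first.
        results: List[Optional[Tuple[int, str]]] = [None] * num_jobs
        for future in concurrent.futures.as_completed(future_to_rank):
            results[future_to_rank[future]] = future.result()

    ids = [res[0] for res in results if res is not None]
    print(ids)  # [100, 101, 102, 103], independent of completion order.
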
@@ -498,7 +614,8 @@ def queue_from_kubernetes_pod(
         'kubernetes', cluster_info)[0]
 
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
-        skip_finished=skip_finished)
+        skip_finished=skip_finished,
+        fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -575,8 +692,49 @@ def _maybe_restart_controller(
     return handle
 
 
+# For backwards compatibility
+# TODO(hailong): Remove before 0.12.0.
+@usage_lib.entrypoint
+def queue(refresh: bool,
+          skip_finished: bool = False,
+          all_users: bool = False,
+          job_ids: Optional[List[int]] = None) -> List[Dict[str, Any]]:
+    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
+    """Gets statuses of managed jobs.
+
+    Please refer to sky.cli.job_queue for documentation.
+
+    Returns:
+        [
+            {
+                'job_id': int,
+                'job_name': str,
+                'resources': str,
+                'submitted_at': (float) timestamp of submission,
+                'end_at': (float) timestamp of end,
+                'job_duration': (float) duration in seconds,
+                'recovery_count': (int) Number of retries,
+                'status': (sky.jobs.ManagedJobStatus) of the job,
+                'cluster_resources': (str) resources of the cluster,
+                'region': (str) region of the cluster,
+                'user_name': (Optional[str]) job creator's user name,
+                'user_hash': (str) job creator's user hash,
+                'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks),  # pylint: disable=line-too-long
+                'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks),  # pylint: disable=line-too-long
+            }
+        ]
+    Raises:
+        sky.exceptions.ClusterNotUpError: the jobs controller is not up or
+            does not exist.
+        RuntimeError: if failed to get the managed jobs with ssh.
+    """
+    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
+
+    return jobs
+
+
 @usage_lib.entrypoint
-def queue(
+def queue_v2_api(
     refresh: bool,
     skip_finished: bool = False,
     all_users: bool = False,
@@ -588,9 +746,34 @@
     page: Optional[int] = None,
     limit: Optional[int] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
+) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
+    """Gets statuses of managed jobs and parse the
+    jobs to responses.ManagedJobRecord."""
+    jobs, total, status_counts, total_no_filter = queue_v2(
+        refresh, skip_finished, all_users, job_ids, user_match, workspace_match,
+        name_match, pool_match, page, limit, statuses, fields)
+    return [responses.ManagedJobRecord(**job) for job in jobs
+           ], total, status_counts, total_no_filter
+
+
+@metrics_lib.time_me
+def queue_v2(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Gets statuses of managed jobs.
+    """Gets statuses of managed jobs with filtering.
 
     Please refer to sky.cli.job_queue for documentation.
 
@@ -633,20 +816,23 @@
     if page is not None:
         raise ValueError('Limit must be specified when page is specified')
 
-    handle = _maybe_restart_controller(refresh,
-                                       stopped_message='No in-progress '
-                                       'managed jobs.',
-                                       spinner_message='Checking '
-                                       'managed jobs')
+    with metrics_lib.time_it('jobs.queue.restart_controller', group='jobs'):
+        handle = _maybe_restart_controller(refresh,
+                                           stopped_message='No in-progress '
+                                           'managed jobs.',
+                                           spinner_message='Checking '
+                                           'managed jobs')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
 
     user_hashes: Optional[List[Optional[str]]] = None
+    show_jobs_without_user_hash = False
     if not all_users:
         user_hashes = [common_utils.get_user_hash()]
         # For backwards compatibility, we show jobs that do not have a
         # user_hash. TODO(cooperc): Remove before 0.12.0.
         user_hashes.append(None)
+        show_jobs_without_user_hash = True
     elif user_match is not None:
         users = global_user_state.get_user_by_name_match(user_match)
         if not users:
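
The timing wrappers above (`metrics_lib.time_it(..., group=...)`) instrument each stage of the queue path. The real implementation lives in sky/metrics/utils.py; as a rough sketch of the general named-timer pattern only (this stand-in just prints durations, it is not SkyPilot's implementation):

    import contextlib
    import time

    @contextlib.contextmanager
    def time_it(name: str, group: str = 'default'):
        """Times the enclosed block and reports the duration."""
        start = time.perf_counter()
        try:
            yield
        finally:
            elapsed = time.perf_counter() - start
            print(f'[{group}] {name}: {elapsed:.3f}s')

    with time_it('jobs.queue.restart_controller', group='jobs'):
        time.sleep(0.1)  # Placeholder for the real controller check.
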
@@ -654,70 +840,109 @@
         user_hashes = [user.id for user in users]
 
     accessible_workspaces = list(workspaces_core.get_workspaces().keys())
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
-        skip_finished, accessible_workspaces, job_ids, workspace_match,
-        name_match, pool_match, page, limit, user_hashes, statuses)
-    returncode, job_table_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
+
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            request = managed_jobsv1_pb2.GetJobTableRequest(
+                skip_finished=skip_finished,
+                accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
+                    workspaces=accessible_workspaces)),
+                job_ids=managed_jobsv1_pb2.JobIds(
+                    ids=job_ids) if job_ids is not None else None,
+                workspace_match=workspace_match,
+                name_match=name_match,
+                pool_match=pool_match,
+                page=page,
+                limit=limit,
+                # Remove None from user_hashes, as the gRPC server uses the
+                # show_jobs_without_user_hash flag instead.
+                user_hashes=managed_jobsv1_pb2.UserHashes(hashes=[
+                    user_hash for user_hash in user_hashes
+                    if user_hash is not None
+                ]) if user_hashes is not None else None,
+                statuses=managed_jobsv1_pb2.Statuses(
+                    statuses=statuses) if statuses is not None else None,
+                fields=managed_jobsv1_pb2.Fields(
+                    fields=fields) if fields is not None else None,
+                show_jobs_without_user_hash=show_jobs_without_user_hash,
+            )
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).get_managed_job_table(request))
+            jobs = managed_job_utils.decode_managed_job_protos(response.jobs)
+            return jobs, response.total, dict(
+                response.status_counts), response.total_no_filter
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
+    with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished, accessible_workspaces, job_ids, workspace_match,
+            name_match, pool_match, page, limit, user_hashes, statuses, fields)
+    with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)
 
     if returncode != 0:
         logger.error(job_table_payload + stderr)
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')
 
-    (jobs, total, result_type, total_no_filter, status_counts
-    ) = managed_job_utils.load_managed_job_queue(job_table_payload)
+    with metrics_lib.time_it('jobs.queue.load_job_queue', group='jobs'):
+        (jobs, total, result_type, total_no_filter, status_counts
+        ) = managed_job_utils.load_managed_job_queue(job_table_payload)
 
     if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
         return jobs, total, status_counts, total_no_filter
 
     # Backward compatibility for old jobs controller without filtering
     # TODO(hailong): remove this after 0.12.0
-    if not all_users:
+    with metrics_lib.time_it('jobs.queue.filter_and_process', group='jobs'):
+        if not all_users:
 
-        def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
-            user_hash = job.get('user_hash', None)
-            if user_hash is None:
-                # For backwards compatibility, we show jobs that do not have a
-                # user_hash. TODO(cooperc): Remove before 0.12.0.
-                return True
-            return user_hash == common_utils.get_user_hash()
+            def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
+                user_hash = job.get('user_hash', None)
+                if user_hash is None:
+                    # For backwards compatibility, we show jobs that do not have
+                    # a user_hash. TODO(cooperc): Remove before 0.12.0.
+                    return True
+                return user_hash == common_utils.get_user_hash()
 
-        jobs = list(filter(user_hash_matches_or_missing, jobs))
+            jobs = list(filter(user_hash_matches_or_missing, jobs))
 
-    jobs = list(
-        filter(
-            lambda job: job.get('workspace', skylet_constants.
-                                SKYPILOT_DEFAULT_WORKSPACE) in
-            accessible_workspaces, jobs))
-
-    if skip_finished:
-        # Filter out the finished jobs. If a multi-task job is partially
-        # finished, we will include all its tasks.
-        non_finished_tasks = list(
-            filter(lambda job: not job['status'].is_terminal(), jobs))
-        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
         jobs = list(
-            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
-
-    if job_ids:
-        jobs = [job for job in jobs if job['job_id'] in job_ids]
-
-    filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
-        jobs,
-        workspace_match,
-        name_match,
-        pool_match,
-        page=page,
-        limit=limit,
-        user_match=user_match,
-        enable_user_match=True,
-        statuses=statuses,
-    )
+            filter(
+                lambda job: job.get('workspace', skylet_constants.
+                                    SKYPILOT_DEFAULT_WORKSPACE) in
+                accessible_workspaces, jobs))
+
+        if skip_finished:
+            # Filter out the finished jobs. If a multi-task job is partially
+            # finished, we will include all its tasks.
+            non_finished_tasks = list(
+                filter(lambda job: not job['status'].is_terminal(), jobs))
+            non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+            jobs = list(
+                filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+
+        if job_ids:
+            jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+        filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+            jobs,
+            workspace_match,
+            name_match,
+            pool_match,
+            page=page,
+            limit=limit,
+            user_match=user_match,
+            enable_user_match=True,
+            statuses=statuses,
+        )
     return filtered_jobs, total, status_counts, total_no_filter
 
 
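Both `queue_v2` above and `cancel` in the next hunk use the same compatibility dispatch: try the typed gRPC path when the handle advertises support, and fall back to the legacy code-generation path if the remote skylet predates the new RPC and raises `SkyletMethodNotImplementedError`. A schematic sketch of that pattern, with hypothetical `grpc_call`/`legacy_call` stand-ins for the real transport calls:

    class MethodNotImplementedError(Exception):
        """Stand-in for exceptions.SkyletMethodNotImplementedError."""

    def grpc_call() -> str:
        raise MethodNotImplementedError  # e.g. an old skylet without the RPC.

    def legacy_call() -> str:
        return 'result from generated-code path'

    def fetch(grpc_enabled: bool) -> str:
        if grpc_enabled:
            try:
                return grpc_call()  # Preferred: typed request/response.
            except MethodNotImplementedError:
                pass  # Remote side is older; fall through to legacy path.
        return legacy_call()

    print(fetch(grpc_enabled=True))   # Falls back to the legacy path.
    print(fetch(grpc_enabled=False))  # Skips gRPC entirely.

The design keeps old controllers working during a rolling upgrade: the client always has a working path, and the gRPC branch simply becomes the common case once controllers are updated.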
@@ -760,33 +985,60 @@ def cancel(name: Optional[str] = None,
             'Can only specify one of JOB_IDS, name, pool, or all/'
             f'all_users. Provided {" ".join(arguments)!r}.')
 
+    job_ids = None if (all_users or all) else job_ids
+
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
-    if all_users:
-        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
-            None, all_users=True)
-    elif all:
-        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(None)
-    elif job_ids:
-        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
-            job_ids)
-    elif name is not None:
-        code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name)
-    else:
-        assert pool is not None, (job_ids, name, pool, all)
-        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(pool)
-    # The stderr is redirected to stdout
-    returncode, stdout, stderr = backend.run_on_head(handle,
-                                                     code,
-                                                     require_outputs=True,
-                                                     stream_logs=False)
-    try:
-        subprocess_utils.handle_returncode(returncode, code,
-                                           'Failed to cancel managed job',
-                                           stdout + stderr)
-    except exceptions.CommandError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(e.error_msg) from e
+
+    use_legacy = not handle.is_grpc_enabled_with_flag
+
+    if not use_legacy:
+        current_workspace = skypilot_config.get_active_workspace()
+        try:
+            request = managed_jobsv1_pb2.CancelJobsRequest(
+                current_workspace=current_workspace)
+
+            if all_users or all or job_ids:
+                request.all_users = all_users
+                if all:
+                    request.user_hash = common_utils.get_user_hash()
+                if job_ids is not None:
+                    request.job_ids.CopyFrom(
+                        managed_jobsv1_pb2.JobIds(ids=job_ids))
+            elif name is not None:
+                request.job_name = name
+            else:
+                assert pool is not None, (job_ids, name, pool, all)
+                request.pool_name = pool
+
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).cancel_managed_jobs(request))
+            stdout = response.message
+        except exceptions.SkyletMethodNotImplementedError:
+            use_legacy = True
+
+    if use_legacy:
+        if all_users or all or job_ids:
+            code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
+                job_ids, all_users=all_users)
+        elif name is not None:
+            code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(
+                name)
+        else:
+            assert pool is not None, (job_ids, name, pool, all)
+            code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(
+                pool)
+        # The stderr is redirected to stdout
+        returncode, stdout, stderr = backend.run_on_head(
+            handle, code, require_outputs=True, stream_logs=False)
+        try:
+            subprocess_utils.handle_returncode(
+                returncode, code, 'Failed to cancel managed job',
+                stdout + stderr)
+        except exceptions.CommandError as e:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(e.error_msg) from e
 
     logger.info(stdout)
     if 'Multiple jobs found with name' in stdout:
@@ -901,9 +1153,10 @@ def pool_apply(
     task: 'sky.Task',
     pool_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
+    workers: Optional[int] = None,
 ) -> None:
     """Apply a config to a pool."""
-    return impl.apply(task, pool_name, mode, pool=True)
+    return impl.apply(task, workers, pool_name, mode, pool=True)
 
 
 @usage_lib.entrypoint