skypilot-nightly 1.0.0.dev20250905-py3-none-any.whl → 1.0.0.dev20251210-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/jobs/scheduler.py CHANGED
@@ -42,144 +42,220 @@ Nomenclature:
 """
 
 from argparse import ArgumentParser
+import asyncio
 import contextlib
 import os
+import pathlib
+import shutil
 import sys
-import time
-from typing import Optional
+import typing
+from typing import List, Optional, Set
+import uuid
 
 import filelock
 
-from sky import exceptions
 from sky import sky_logging
+from sky import skypilot_config
+from sky.adaptors import common as adaptors_common
+from sky.client import sdk
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
-from sky.serve import serve_utils
+from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants
-from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
+if typing.TYPE_CHECKING:
+    import logging
+
+    import psutil
+else:
+    psutil = adaptors_common.LazyImport('psutil')
+
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-_ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
+# Job controller lock. This is used to synchronize writing/reading the
+# controller pid file.
+JOB_CONTROLLER_PID_LOCK = os.path.expanduser(
+    '~/.sky/locks/job_controller_pid.lock')
+
+JOB_CONTROLLER_PID_PATH = os.path.expanduser('~/.sky/job_controller_pid')
+JOB_CONTROLLER_ENV_PATH = os.path.expanduser('~/.sky/job_controller_env')
+
+CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
+
+
+def _parse_controller_pid_entry(
+        entry: str) -> Optional[state.ControllerPidRecord]:
+    entry = entry.strip()
+    if not entry:
+        return None
+    # The entry should be like <pid>,<started_at>
+    # pid is an integer, started_at is a float
+    # For backwards compatibility, we also support just <pid>
+    entry_parts = entry.split(',')
+    if len(entry_parts) == 2:
+        [raw_pid, raw_started_at] = entry_parts
+    elif len(entry_parts) == 1:
+        # Backwards compatibility, pre-#7847
+        # TODO(cooperc): Remove for 0.13.0
+        raw_pid = entry_parts[0]
+        raw_started_at = None
+    else:
+        # Unknown format
+        return None
+
+    try:
+        pid = int(raw_pid)
+    except ValueError:
+        return None
+
+    started_at: Optional[float] = None
+    if raw_started_at:
+        try:
+            started_at = float(raw_started_at)
+        except ValueError:
+            started_at = None
+    return state.ControllerPidRecord(pid=pid, started_at=started_at)
+
+
+def get_controller_process_records(
+) -> Optional[List[state.ControllerPidRecord]]:
+    """Return recorded controller processes if the file can be read."""
+    if not os.path.exists(JOB_CONTROLLER_PID_PATH):
+        # If the file doesn't exist, it means the controller server is not
+        # running, so we return an empty list
+        return []
+    try:
+        with open(JOB_CONTROLLER_PID_PATH, 'r', encoding='utf-8') as f:
+            lines = f.read().splitlines()
+    except (FileNotFoundError, OSError):
+        return None
+
+    records: List[state.ControllerPidRecord] = []
+    for line in lines:
+        record = _parse_controller_pid_entry(line)
+        if record is not None:
+            records.append(record)
+    return records
 
 
-def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
-                      pool: Optional[str]) -> None:
-    activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
-    source_environment_cmd = (f'source {env_file_path};'
-                              if env_file_path else '')
-    maybe_pool_arg = (f'--pool {pool}' if pool is not None else '')
-    run_controller_cmd = (
-        f'{sys.executable} -u -m sky.jobs.controller '
-        f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')
-
-    # If the command line here is changed, please also update
-    # utils._controller_process_alive. The substring `--job-id X`
-    # should be in the command.
-    run_cmd = (f'{activate_python_env_cmd}'
-               f'{source_environment_cmd}'
-               f'{run_controller_cmd}')
+def _append_controller_pid_record(pid: int,
+                                  started_at: Optional[float]) -> None:
+    # Note: started_at is a float, but converting to a string will not lose any
+    # precision. See https://docs.python.org/3/tutorial/floatingpoint.html and
+    # https://github.com/python/cpython/issues/53583
+    entry = str(pid) if started_at is None else f'{pid},{started_at}'
+    with open(JOB_CONTROLLER_PID_PATH, 'a', encoding='utf-8') as f:
+        f.write(entry + '\n')
 
+
+def start_controller() -> None:
+    """Start the job controller process.
+
+    This requires that the env file is already set up.
+    """
+    os.environ[constants.OVERRIDE_CONSOLIDATION_MODE] = 'true'
     logs_dir = os.path.expanduser(
         managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
     os.makedirs(logs_dir, exist_ok=True)
-    log_path = os.path.join(logs_dir, f'{job_id}.log')
+    controller_uuid = str(uuid.uuid4())
+    log_path = os.path.join(logs_dir, f'controller_{controller_uuid}.log')
+
+    activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
+    run_controller_cmd = (f'{sys.executable} -u -m'
+                          f'sky.jobs.controller {controller_uuid}')
+
+    run_cmd = (f'{activate_python_env_cmd}'
+               f'{run_controller_cmd}')
+
+    logger.info(f'Running controller with command: {run_cmd}')
 
     pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
-    state.set_job_controller_pid(job_id, pid)
-
-    logger.debug(f'Job {job_id} started with pid {pid}')
-
-
-def maybe_schedule_next_jobs() -> None:
-    """Determine if any managed jobs can be scheduled, and if so, schedule them.
-
-    Here, "schedule" means to select job that is waiting, and allow it to
-    proceed. It does NOT mean to submit a job to the scheduler.
-
-    For newly submitted jobs, scheduling means updating the state of the jobs,
-    and starting the job controller process. For jobs that are already alive but
-    are waiting to launch a new task or recover, just update the state of the
-    job to indicate that the launch can proceed.
-
-    This function transitions jobs into LAUNCHING on a best-effort basis. That
-    is, if we can start any jobs, we will, but if not, we will exit (almost)
-    immediately. It's expected that if some WAITING or ALIVE_WAITING jobs cannot
-    be started now (either because the lock is held, or because there are not
-    enough resources), another call to this function will be made whenever that
-    situation is resolved. (If the lock is held, the lock holder should start
-    the jobs. If there aren't enough resources, the next controller to exit and
-    free up resources should start the jobs.)
-
-    If this function obtains the lock, it will launch as many jobs as possible
-    before releasing the lock. This is what allows other calls to exit
-    immediately if the lock is held, while ensuring that all jobs are started as
-    soon as possible.
-
-    This uses subprocess_utils.launch_new_process_tree() to start the controller
-    processes, which should be safe to call from pretty much any code running on
-    the jobs controller instance. New job controller processes will be detached
-    from the current process and there will not be a parent/child relationship.
-    See launch_new_process_tree for more.
-
-    After adding the pool support, this function will be called in a per-pool
-    basis. We employ resources limitation for each pool given the number of
-    ready workers in the pool. Each pool will have its own scheduler queue,
-    indicating by the argument `pool`. Finished job in pool 1 will only trigger
-    another jobs in pool 1, but the job in pool 2 will still be waiting. When
-    the `pool` argument is None, it schedules a job regardless of the pool.
+    pid_started_at = psutil.Process(pid).create_time()
+    _append_controller_pid_record(pid, pid_started_at)
+
+
+def get_alive_controllers() -> Optional[int]:
+    records = get_controller_process_records()
+    if records is None:
+        # If we cannot read the file reliably, avoid starting extra controllers.
+        return None
+    if not records:
+        return 0
+
+    alive = 0
+    for record in records:
+        if managed_job_utils.controller_process_alive(record, quiet=False):
+            alive += 1
+    return alive
+
+
+def maybe_start_controllers(from_scheduler: bool = False) -> None:
+    """Start the job controller process.
+
+    If the process is already running, it will not start a new one.
+    Will also add the job_id, dag_yaml_path, and env_file_path to the
+    controllers list of processes.
     """
+    # In consolidation mode, during rolling update, two API servers may be
+    # running. If we are on the new API server, and we haven't finished the
+    # recovery process, we should avoid starting new controllers. The old API
+    # server/consolidated jobs controller could run update_managed_jobs_statuses
+    # and if there are jobs running on the new API server, the old one will not
+    # see the corresponding processes and may mark them as FAILED_CONTROLLER.
+    if from_scheduler and managed_job_utils.is_consolidation_mode(
+    ) and os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        # This could happen during an API server rolling update, or during
+        # normal running while managed-job-status-refresh-daemon is running. In
+        # either case, the controllers should be already started or will be
+        # started by the recovery process.
+        logger.info('Recovery is still in progress, skipping controller start.')
+        return
     try:
-        # We must use a global lock rather than a per-job lock to ensure correct
-        # parallelism control. If we cannot obtain the lock, exit immediately.
-        # The current lock holder is expected to launch any jobs it can before
-        # releasing the lock.
-        with filelock.FileLock(controller_utils.get_resources_lock_path(),
-                               blocking=False):
-            while True:
-                maybe_next_job = state.get_waiting_job()
-                if maybe_next_job is None:
-                    # Nothing left to start, break from scheduling loop
-                    break
-                actual_pool = maybe_next_job['pool']
-
-                current_state = maybe_next_job['schedule_state']
-
-                assert current_state in (
-                    state.ManagedJobScheduleState.ALIVE_WAITING,
-                    state.ManagedJobScheduleState.WAITING), maybe_next_job
-
-                # Note: we expect to get ALIVE_WAITING jobs before WAITING jobs,
-                # since they will have been submitted and therefore started
-                # first. The requirements to launch in an alive job are more
-                # lenient, so there is no way that we wouldn't be able to launch
-                # an ALIVE_WAITING job, but we would be able to launch a WAITING
-                # job.
-                if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                    if not controller_utils.can_provision():
-                        # Can't schedule anything, break from scheduling loop.
-                        break
-                elif current_state == state.ManagedJobScheduleState.WAITING:
-                    if not _can_start_new_job(actual_pool):
-                        # Can't schedule anything, break from scheduling loop.
-                        break
-
-                logger.debug(f'Scheduling job {maybe_next_job["job_id"]}')
-                state.scheduler_set_launching(maybe_next_job['job_id'],
-                                              current_state)
-
-                if current_state == state.ManagedJobScheduleState.WAITING:
-                    # The job controller has not been started yet. We must start
-                    # it.
-
-                    job_id = maybe_next_job['job_id']
-                    dag_yaml_path = maybe_next_job['dag_yaml_path']
-                    env_file_path = maybe_next_job['env_file_path']
-
-                    _start_controller(job_id, dag_yaml_path, env_file_path,
-                                      actual_pool)
+        with filelock.FileLock(JOB_CONTROLLER_PID_LOCK, blocking=False):
+            if from_scheduler and not managed_job_utils.is_consolidation_mode():
+                cur = pathlib.Path(CURRENT_HASH)
+                old = pathlib.Path(f'{CURRENT_HASH}.old')
+
+                if old.exists() and cur.exists():
+                    if (old.read_text(encoding='utf-8') !=
+                            cur.read_text(encoding='utf-8')):
+                        # TODO(luca): there is a 1/2^160 chance that there will
+                        # be a collision. using a geometric distribution and
+                        # assuming one update a day, we expect a bug slightly
+                        # before the heat death of the universe. should get
+                        # this fixed before then.
+                        try:
+                            # this will stop all the controllers and the api
+                            # server.
+                            sdk.api_stop()
+                            # All controllers should be dead. Remove the PIDs so
+                            # that update_managed_jobs_statuses won't think they
+                            # have failed.
+                            state.reset_jobs_for_recovery()
+                        except Exception as e: # pylint: disable=broad-except
+                            logger.error(f'Failed to stop the api server: {e}')
+                            pass
+                    else:
+                        shutil.copyfile(cur, old)
+                if not old.exists():
+                    shutil.copyfile(cur, old)
+
+            alive = get_alive_controllers()
+            if alive is None:
+                return
+            wanted = controller_utils.get_number_of_jobs_controllers()
+            started = 0
+
+            while alive + started < wanted:
+                start_controller()
+                started += 1
+
+            if started > 0:
+                logger.info(f'Started {started} controllers')
 
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
@@ -188,30 +264,63 @@ def maybe_schedule_next_jobs() -> None:
 
 
 def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
-               env_file_path: str, priority: int, pool: Optional[str]) -> None:
+               env_file_path: str, priority: int) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
     PENDING. It will tell the scheduler to try and start the job controller, if
-    there are resources available. It may block to acquire the lock, so it
-    should not be on the critical path for `sky jobs launch -d`.
+    there are resources available.
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
-    with filelock.FileLock(controller_utils.get_resources_lock_path()):
-        is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
-                                                original_user_yaml_path,
-                                                env_file_path,
-                                                common_utils.get_user_hash(),
-                                                priority)
-        if is_resume:
-            _start_controller(job_id, dag_yaml_path, env_file_path, pool)
-        else:
-            maybe_schedule_next_jobs()
-
-
-@contextlib.contextmanager
-def scheduled_launch(job_id: int):
+    controller_process = state.get_job_controller_process(job_id)
+    if controller_process is not None:
+        # why? TODO(cooperc): figure out why this is needed, fix it, and remove
+        if managed_job_utils.controller_process_alive(controller_process,
+                                                      job_id):
+            # This can happen when HA recovery runs for some reason but the job
+            # controller is still alive.
+            logger.warning(f'Job {job_id} is still alive with controller '
+                           f'{controller_process}, skipping submission')
+            maybe_start_controllers(from_scheduler=True)
+            return
+
+    with open(dag_yaml_path, 'r', encoding='utf-8') as dag_file:
+        dag_yaml_content = dag_file.read()
+    with open(original_user_yaml_path, 'r',
+              encoding='utf-8') as original_user_yaml_file:
+        original_user_yaml_content = original_user_yaml_file.read()
+    with open(env_file_path, 'r', encoding='utf-8') as env_file:
+        env_file_content = env_file.read()
+
+    # Read config file if SKYPILOT_CONFIG env var is set
+    config_file_content: Optional[str] = None
+    config_file_path = os.environ.get(skypilot_config.ENV_VAR_SKYPILOT_CONFIG)
+    if config_file_path:
+        config_file_path = os.path.expanduser(config_file_path)
+        if os.path.exists(config_file_path):
+            with open(config_file_path, 'r', encoding='utf-8') as config_file:
+                config_file_content = config_file.read()
+
+    config_bytes = (len(config_file_content) if config_file_content else 0)
+    logger.debug(f'Storing job {job_id} file contents in database '
+                 f'(DAG bytes={len(dag_yaml_content)}, '
+                 f'original user yaml bytes={len(original_user_yaml_content)}, '
+                 f'env bytes={len(env_file_content)}, '
+                 f'config bytes={config_bytes}).')
+    state.scheduler_set_waiting(job_id, dag_yaml_content,
+                                original_user_yaml_content, env_file_content,
+                                config_file_content, priority)
+    maybe_start_controllers(from_scheduler=True)
+
+
+@contextlib.asynccontextmanager
+async def scheduled_launch(
+    job_id: int,
+    starting: Set[int],
+    starting_lock: asyncio.Lock,
+    starting_signal: asyncio.Condition,
+):
     """Launch as part of an ongoing job.
 
     A newly started job will already be LAUNCHING, and this will immediately
@@ -240,30 +349,34 @@ def scheduled_launch(job_id: int):
         yield
         return
 
-    # If we're already in LAUNCHING schedule_state, we don't need to wait.
-    # This may be the case for the first launch of a job.
-    if (state.get_job_schedule_state(job_id) !=
-            state.ManagedJobScheduleState.LAUNCHING):
-        # Since we aren't LAUNCHING, we need to wait to be scheduled.
-        _set_alive_waiting(job_id)
+    assert starting_lock == starting_signal._lock, ( # type: ignore #pylint: disable=protected-access
+        'starting_lock and starting_signal must use the same lock')
+
+    while True:
+        async with starting_lock:
+            starting_count = len(starting)
+            if starting_count < controller_utils.LAUNCHES_PER_WORKER:
+                break
+            logger.info('Too many jobs starting, waiting for a slot')
+            await starting_signal.wait()
 
-    while (state.get_job_schedule_state(job_id) !=
-           state.ManagedJobScheduleState.LAUNCHING):
-        time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
+    logger.info(f'Starting job {job_id}')
+
+    async with starting_lock:
+        starting.add(job_id)
+
+    await state.scheduler_set_launching_async(job_id)
 
     try:
        yield
-    except exceptions.NoClusterLaunchedError:
-        # NoClusterLaunchedError is indicates that the job is in retry backoff.
-        # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(controller_utils.get_resources_lock_path()):
-            state.scheduler_set_alive_backoff(job_id)
-        raise
+    except Exception as e:
+        raise e
     else:
-        with filelock.FileLock(controller_utils.get_resources_lock_path()):
-            state.scheduler_set_alive(job_id)
+        await state.scheduler_set_alive_async(job_id)
     finally:
-        maybe_schedule_next_jobs()
+        async with starting_lock:
+            starting.remove(job_id)
+            starting_signal.notify()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -274,38 +387,23 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
 
     The job could be in any terminal ManagedJobStatus. However, once DONE, it
     should never transition back to another state.
+
+    This is only called by utils.update_managed_jobs_statuses which is sync.
     """
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
 
-    with filelock.FileLock(controller_utils.get_resources_lock_path()):
-        state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs()
-
-
-def _set_alive_waiting(job_id: int) -> None:
-    """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(controller_utils.get_resources_lock_path()):
-        state.scheduler_set_alive_waiting(job_id)
-        maybe_schedule_next_jobs()
+    state.scheduler_set_done(job_id, idempotent)
 
 
-def _can_start_new_job(pool: Optional[str]) -> bool:
-    # Check basic resource limits
-    # Pool jobs don't need to provision resources, so we skip the check.
-    if not ((controller_utils.can_provision() or pool is not None) and
-            controller_utils.can_start_new_process()):
-        return False
-
-    # Check if there are available workers in the pool
-    if pool is not None:
-        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
-            logger.debug(f'No READY workers available in pool {pool}')
-            return False
+async def job_done_async(job_id: int, idempotent: bool = False):
+    """Async version of job_done."""
+    if idempotent and (await state.get_job_schedule_state_async(job_id)
+                       == state.ManagedJobScheduleState.DONE):
+        return
 
-    return True
+    await state.scheduler_set_done_async(job_id, idempotent)
 
 
 if __name__ == '__main__':
@@ -337,4 +435,4 @@ if __name__ == '__main__':
                         f' Default: {constants.DEFAULT_PRIORITY}.')
    args = parser.parse_args()
    submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
-               args.priority, args.pool)
+               args.priority)
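
Note on the reworked scheduled_launch above: the old implementation polled the job's schedule_state under a global file lock, while the new code bounds concurrent launches with a shared `starting` set guarded by an asyncio lock/condition pair, waiting on the condition until fewer than controller_utils.LAUNCHES_PER_WORKER launches are in flight. A minimal, self-contained sketch of that gating pattern follows. It is illustrative only: MAX_CONCURRENT_LAUNCHES and launch_with_slot are hypothetical names, the sleep is a placeholder for real launch work, and a single asyncio.Condition (which owns its lock) stands in for the separate lock/condition pair that the real code asserts must share a lock.

    import asyncio
    from typing import Set

    # Hypothetical stand-in for controller_utils.LAUNCHES_PER_WORKER.
    MAX_CONCURRENT_LAUNCHES = 2


    async def launch_with_slot(job_id: int, starting: Set[int],
                               slots: asyncio.Condition) -> None:
        """Wait for a free launch slot, mark the job as starting, then release."""
        async with slots:
            # Block until fewer than MAX_CONCURRENT_LAUNCHES jobs are starting.
            await slots.wait_for(lambda: len(starting) < MAX_CONCURRENT_LAUNCHES)
            starting.add(job_id)
        try:
            await asyncio.sleep(0.1)  # Placeholder for the actual launch work.
        finally:
            async with slots:
                starting.discard(job_id)
                slots.notify()  # Wake one waiter so it can re-check the count.


    async def main() -> None:
        starting: Set[int] = set()
        slots = asyncio.Condition()
        await asyncio.gather(
            *(launch_with_slot(job_id, starting, slots) for job_id in range(5)))


    if __name__ == '__main__':
        asyncio.run(main())

The notify() in the finally block wakes one waiter each time a slot frees up, which mirrors how the diffed scheduler releases a launch slot (removing the job from `starting` and notifying `starting_signal`) once a job leaves the launching phase.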