skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their public registries. It is provided for informational purposes only.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py CHANGED
@@ -1,24 +1,31 @@
1
- """Controller: handles the life cycle of a managed job.
2
-
3
- TODO(cooperc): Document lifecycle, and multiprocess layout.
1
+ """Controller: handles scheduling and the life cycle of a managed job.
4
2
  """
5
- import argparse
6
- import multiprocessing
3
+ import asyncio
4
+ import io
7
5
  import os
8
6
  import pathlib
7
+ import resource
9
8
  import shutil
9
+ import sys
10
+ import threading
10
11
  import time
11
12
  import traceback
12
13
  import typing
13
- from typing import Optional, Tuple
14
+ from typing import Dict, Optional, Set
14
15
 
15
- import filelock
16
+ import dotenv
16
17
 
18
+ import sky
19
+ from sky import core
17
20
  from sky import exceptions
18
21
  from sky import sky_logging
22
+ from sky import skypilot_config
19
23
  from sky.backends import backend_utils
20
24
  from sky.backends import cloud_vm_ray_backend
21
25
  from sky.data import data_utils
26
+ from sky.jobs import constants as jobs_constants
27
+ from sky.jobs import file_content_utils
28
+ from sky.jobs import log_gc
22
29
  from sky.jobs import recovery_strategy
23
30
  from sky.jobs import scheduler
24
31
  from sky.jobs import state as managed_job_state
@@ -26,39 +33,125 @@ from sky.jobs import utils as managed_job_utils
26
33
  from sky.skylet import constants
27
34
  from sky.skylet import job_lib
28
35
  from sky.usage import usage_lib
36
+ from sky.utils import annotations
29
37
  from sky.utils import common
30
38
  from sky.utils import common_utils
39
+ from sky.utils import context
40
+ from sky.utils import context_utils
31
41
  from sky.utils import controller_utils
32
42
  from sky.utils import dag_utils
33
43
  from sky.utils import status_lib
34
- from sky.utils import subprocess_utils
35
44
  from sky.utils import ux_utils
36
45
 
37
- if typing.TYPE_CHECKING:
38
- import sky
39
-
40
- # Use the explicit logger name so that the logger is under the
41
- # `sky.jobs.controller` namespace when executed directly, so as
42
- # to inherit the setup from the `sky` logger.
43
46
  logger = sky_logging.init_logger('sky.jobs.controller')
44
47
 
48
+ _background_tasks: Set[asyncio.Task] = set()
49
+ _background_tasks_lock: asyncio.Lock = asyncio.Lock()
50
+
51
+
52
+ async def create_background_task(coro: typing.Coroutine) -> None:
53
+ """Create a background task and add it to the set of background tasks.
54
+
55
+ Main reason we do this is since tasks are only held as a weak reference in
56
+ the executor, we need to keep a strong reference to the task to avoid it
57
+ being garbage collected.
58
+
59
+ Args:
60
+ coro: The coroutine to create a task for.
61
+ """
62
+ async with _background_tasks_lock:
63
+ task = asyncio.create_task(coro)
64
+ _background_tasks.add(task)
65
+ # TODO(cooperc): Discard needs a lock?
66
+ task.add_done_callback(_background_tasks.discard)
67
+
68
+
69
+ # Make sure to limit the size as we don't want to cache too many DAGs in memory.
70
+ @annotations.lru_cache(scope='global', maxsize=50)
71
+ def _get_dag(job_id: int) -> 'sky.Dag':
72
+ dag_content = file_content_utils.get_job_dag_content(job_id)
73
+ if dag_content is None:
74
+ raise RuntimeError('Managed job DAG YAML content is unavailable for '
75
+ f'job {job_id}. This can happen if the job was '
76
+ 'submitted before file migration completed or if '
77
+ 'the submission failed to persist the DAG. Please '
78
+ 're-submit the job.')
79
+
80
+ dag = dag_utils.load_chain_dag_from_yaml_str(dag_content)
81
+ assert dag.name is not None, dag
82
+ return dag
45
83
 
46
- def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
47
- dag = dag_utils.load_chain_dag_from_yaml(dag_yaml)
48
- dag_name = dag.name
49
- assert dag_name is not None, dag
50
- return dag, dag_name
51
84
 
85
+ class JobController:
86
+ """Controls the lifecycle of a single managed job.
52
87
 
53
- class JobsController:
54
- """Each jobs controller manages the life cycle of one managed job."""
88
+ This controller executes the chain DAG recorded for the job by:
89
+ - Loading the DAG and preparing per-task environment variables so each task
90
+ has a stable global job identifier across recoveries.
91
+ - Launching the task on the configured backend (``CloudVmRayBackend``),
92
+ optionally via a cluster pool.
93
+ - Persisting state transitions to the managed jobs state store
94
+ (e.g., STARTING → RUNNING → SUCCEEDED/FAILED/CANCELLED).
95
+ - Monitoring execution, downloading/streaming logs, detecting failures or
96
+ preemptions, and invoking recovery through
97
+ ``recovery_strategy.StrategyExecutor``.
98
+ - Cleaning up clusters and ephemeral resources when tasks finish.
99
+
100
+ Concurrency and coordination:
101
+ - Runs inside an ``asyncio`` event loop.
102
+ - Shares a ``starting`` set, guarded by ``starting_lock`` and signaled via
103
+ ``starting_signal``, to throttle concurrent launches across jobs that the
104
+ top-level ``Controller`` manages.
105
+
106
+ Key attributes:
107
+ - ``_job_id``: Integer identifier of this managed job.
108
+ - ``_dag`` / ``_dag_name``: The job definition and metadata loaded from the
109
+ database-backed job YAML.
110
+ - ``_backend``: Backend used to launch and manage clusters.
111
+ - ``_pool``: Optional pool name if using a cluster pool.
112
+ - ``starting`` / ``starting_lock`` / ``starting_signal``: Shared scheduler
113
+ coordination primitives. ``starting_lock`` must be used for accessing
114
+ ``starting_signal`` and ``starting``
115
+ - ``_strategy_executor``: Recovery/launch strategy executor (created per
116
+ task).
117
+ """
118
+
119
+ def __init__(
120
+ self,
121
+ job_id: int,
122
+ starting: Set[int],
123
+ starting_lock: asyncio.Lock,
124
+ starting_signal: asyncio.Condition,
125
+ pool: Optional[str] = None,
126
+ ) -> None:
127
+ """Initialize a ``JobsController``.
128
+
129
+ Args:
130
+ job_id: Integer ID of the managed job.
131
+ starting: Shared set of job IDs currently in the STARTING phase,
132
+ used to limit concurrent launches.
133
+ starting_lock: ``asyncio.Lock`` guarding access to the shared
134
+ scheduler state (e.g., the ``starting`` set).
135
+ starting_signal: ``asyncio.Condition`` used to notify when a job
136
+ exits STARTING so more jobs can be admitted.
137
+ pool: Optional cluster pool name. When provided, the job is
138
+ submitted to the pool rather than launching a dedicated
139
+ cluster.
140
+ """
141
+
142
+ self.starting = starting
143
+ self.starting_lock = starting_lock
144
+ self.starting_signal = starting_signal
145
+
146
+ logger.info('Initializing JobsController for job_id=%s', job_id)
55
147
 
56
- def __init__(self, job_id: int, dag_yaml: str) -> None:
57
148
  self._job_id = job_id
58
- self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
59
- logger.info(self._dag)
60
- # TODO(zhwu): this assumes the specific backend.
149
+ self._dag = _get_dag(job_id)
150
+ self._dag_name = self._dag.name
151
+ logger.info(f'Loaded DAG: {self._dag}')
152
+
61
153
  self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
154
+ self._pool = pool
62
155
 
63
156
  # pylint: disable=line-too-long
64
157
  # Add a unique identifier to the task environment variables, so that
@@ -76,6 +169,7 @@ class JobsController:
76
169
  # dag_utils.maybe_infer_and_fill_dag_and_task_names.
77
170
  assert task_name is not None, self._dag
78
171
  task_name = f'{self._dag_name}_{task_name}'
172
+
79
173
  job_id_env_var = common_utils.get_global_job_id(
80
174
  self._backend.run_timestamp,
81
175
  f'{task_name}',
@@ -92,8 +186,10 @@ class JobsController:
92
186
  task.update_envs(task_envs)
93
187
 
94
188
  def _download_log_and_stream(
95
- self, task_id: Optional[int],
96
- handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
189
+ self,
190
+ task_id: Optional[int],
191
+ handle: Optional['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
192
+ job_id_on_pool_cluster: Optional[int],
97
193
  ) -> None:
98
194
  """Downloads and streams the logs of the current job with given task ID.
99
195
 
@@ -105,18 +201,36 @@ class JobsController:
105
201
  logger.info(f'Cluster for job {self._job_id} is not found. '
106
202
  'Skipping downloading and streaming the logs.')
107
203
  return
204
+
108
205
  managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
109
- 'managed_jobs')
110
- log_file = controller_utils.download_and_stream_latest_job_log(
111
- self._backend, handle, managed_job_logs_dir)
206
+ 'managed_jobs',
207
+ f'job-id-{self._job_id}')
208
+ log_file = controller_utils.download_and_stream_job_log(
209
+ self._backend,
210
+ handle,
211
+ managed_job_logs_dir,
212
+ job_ids=[str(job_id_on_pool_cluster)]
213
+ if job_id_on_pool_cluster is not None else None)
112
214
  if log_file is not None:
113
- # Set the path of the log file for the current task, so it can be
114
- # accessed even after the job is finished
215
+ # Set the path of the log file for the current task, so it can
216
+ # be accessed even after the job is finished
115
217
  managed_job_state.set_local_log_file(self._job_id, task_id,
116
218
  log_file)
219
+ else:
220
+ logger.warning(
221
+ f'No log file was downloaded for job {self._job_id}, '
222
+ f'task {task_id}')
223
+
117
224
  logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
118
225
 
119
- def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
226
+ async def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
227
+ if cluster_name is None:
228
+ return
229
+ if self._pool is None:
230
+ await context_utils.to_thread(managed_job_utils.terminate_cluster,
231
+ cluster_name)
232
+
233
+ async def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
120
234
  """Busy loop monitoring cluster status and handling recovery.
121
235
 
122
236
  When the task is successfully completed, this function returns True,
@@ -151,70 +265,185 @@ class JobsController:
151
265
  3. Any unexpected error happens during the `sky.launch`.
152
266
  Other exceptions may be raised depending on the backend.
153
267
  """
268
+ task_start_time = time.time()
269
+ logger.info(
270
+ f'Starting task {task_id} ({task.name}) for job {self._job_id}')
271
+
272
+ latest_task_id, last_task_prev_status = (
273
+ await
274
+ managed_job_state.get_latest_task_id_status_async(self._job_id))
275
+
276
+ is_resume = False
277
+ if (latest_task_id is not None and last_task_prev_status !=
278
+ managed_job_state.ManagedJobStatus.PENDING):
279
+ assert latest_task_id >= task_id, (latest_task_id, task_id)
280
+ if latest_task_id > task_id:
281
+ logger.info(f'Task {task_id} ({task.name}) has already '
282
+ 'been executed. Skipping...')
283
+ return True
284
+ if latest_task_id == task_id:
285
+ # Start recovery.
286
+ is_resume = True
287
+ logger.info(f'Resuming task {task_id} from previous execution')
154
288
 
155
289
  callback_func = managed_job_utils.event_callback_func(
156
290
  job_id=self._job_id, task_id=task_id, task=task)
291
+
157
292
  if task.run is None:
158
293
  logger.info(f'Skip running task {task_id} ({task.name}) due to its '
159
294
  'run commands being empty.')
160
295
  # Call set_started first to initialize columns in the state table,
161
296
  # including start_at and last_recovery_at to avoid issues for
162
297
  # uninitialized columns.
163
- managed_job_state.set_started(job_id=self._job_id,
164
- task_id=task_id,
165
- start_time=time.time(),
166
- callback_func=callback_func)
167
- managed_job_state.set_succeeded(job_id=self._job_id,
168
- task_id=task_id,
169
- end_time=time.time(),
170
- callback_func=callback_func)
298
+ await managed_job_state.set_started_async(
299
+ job_id=self._job_id,
300
+ task_id=task_id,
301
+ start_time=time.time(),
302
+ callback_func=callback_func)
303
+ await managed_job_state.set_succeeded_async(
304
+ job_id=self._job_id,
305
+ task_id=task_id,
306
+ end_time=time.time(),
307
+ callback_func=callback_func)
308
+ logger.info(f'Empty task {task_id} marked as succeeded immediately')
171
309
  return True
310
+
172
311
  usage_lib.messages.usage.update_task_id(task_id)
173
312
  task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
174
- submitted_at = time.time()
175
- if task_id == 0:
176
- submitted_at = backend_utils.get_timestamp_from_run_timestamp(
177
- self._backend.run_timestamp)
178
313
  assert task.name is not None, task
314
+ # Set the cluster name to None if the job is submitted
315
+ # to a pool. This will be updated when we later calls the `launch`
316
+ # or `recover` function from the strategy executor.
179
317
  cluster_name = managed_job_utils.generate_managed_job_cluster_name(
180
- task.name, self._job_id)
318
+ task.name, self._job_id) if self._pool is None else None
181
319
  self._strategy_executor = recovery_strategy.StrategyExecutor.make(
182
- cluster_name, self._backend, task, self._job_id)
183
- managed_job_state.set_submitted(
184
- self._job_id,
185
- task_id,
186
- self._backend.run_timestamp,
187
- submitted_at,
188
- resources_str=backend_utils.get_task_resources_str(
189
- task, is_managed_job=True),
190
- specs={
191
- 'max_restarts_on_errors':
192
- self._strategy_executor.max_restarts_on_errors
193
- },
194
- callback_func=callback_func)
195
- logger.info(
196
- f'Submitted managed job {self._job_id} (task: {task_id}, name: '
197
- f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
320
+ cluster_name, self._backend, task, self._job_id, task_id,
321
+ self._pool, self.starting, self.starting_lock, self.starting_signal)
322
+ if not is_resume:
323
+ submitted_at = time.time()
324
+ if task_id == 0:
325
+ submitted_at = backend_utils.get_timestamp_from_run_timestamp(
326
+ self._backend.run_timestamp)
327
+
328
+ resources_str = backend_utils.get_task_resources_str(
329
+ task, is_managed_job=True)
330
+
331
+ await managed_job_state.set_starting_async(
332
+ self._job_id,
333
+ task_id,
334
+ self._backend.run_timestamp,
335
+ submitted_at,
336
+ resources_str=resources_str,
337
+ specs={
338
+ 'max_restarts_on_errors':
339
+ self._strategy_executor.max_restarts_on_errors
340
+ },
341
+ callback_func=callback_func)
342
+ logger.info(f'Submitted managed job {self._job_id} '
343
+ f'(task: {task_id}, name: {task.name!r}); '
344
+ f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
198
345
 
199
346
  logger.info('Started monitoring.')
200
- managed_job_state.set_starting(job_id=self._job_id,
201
- task_id=task_id,
202
- callback_func=callback_func)
203
- remote_job_submitted_at = self._strategy_executor.launch()
204
- assert remote_job_submitted_at is not None, remote_job_submitted_at
205
347
 
206
- managed_job_state.set_started(job_id=self._job_id,
207
- task_id=task_id,
208
- start_time=remote_job_submitted_at,
209
- callback_func=callback_func)
348
+ # Only do the initial cluster launch if not resuming from a controller
349
+ # failure. Otherwise, we will transit to recovering immediately.
350
+ remote_job_submitted_at = time.time()
351
+ if not is_resume:
352
+ launch_start = time.time()
353
+
354
+ # Run the launch in a separate thread to avoid blocking the event
355
+ # loop. The scheduler functions used internally already have their
356
+ # own file locks.
357
+ remote_job_submitted_at = await self._strategy_executor.launch()
358
+
359
+ launch_time = time.time() - launch_start
360
+ logger.info(f'Cluster launch completed in {launch_time:.2f}s')
361
+ assert remote_job_submitted_at is not None, remote_job_submitted_at
362
+ if self._pool is None:
363
+ job_id_on_pool_cluster = None
364
+ else:
365
+ # Update the cluster name when using cluster pool.
366
+ cluster_name, job_id_on_pool_cluster = (
367
+ await
368
+ managed_job_state.get_pool_submit_info_async(self._job_id))
369
+ if cluster_name is None:
370
+ # Check if we have been cancelled here, in the case where a user
371
+ # quickly cancels the job we want to gracefully handle it here,
372
+ # otherwise we will end up in the FAILED_CONTROLLER state.
373
+ logger.info(f'Cluster name is None for job {self._job_id}, '
374
+ f'task {task_id}. Checking if we have been '
375
+ 'cancelled.')
376
+ status = await (managed_job_state.get_job_status_with_task_id_async(
377
+ job_id=self._job_id, task_id=task_id))
378
+ logger.debug(f'Status for job {self._job_id}, task {task_id}:'
379
+ f'{status}')
380
+ if status == managed_job_state.ManagedJobStatus.CANCELLED:
381
+ logger.info(f'Job {self._job_id}, task {task_id} has '
382
+ 'been quickly cancelled.')
383
+ raise asyncio.CancelledError()
384
+ assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)
385
+
386
+ if not is_resume:
387
+ await managed_job_state.set_started_async(
388
+ job_id=self._job_id,
389
+ task_id=task_id,
390
+ start_time=remote_job_submitted_at,
391
+ callback_func=callback_func)
392
+
393
+ monitoring_start_time = time.time()
394
+ status_check_count = 0
395
+
396
+ async with self.starting_lock:
397
+ try:
398
+ self.starting.remove(self._job_id)
399
+ # its fine if we notify again, better to wake someone up
400
+ # and have them go to sleep again, then have some stuck
401
+ # sleeping.
402
+ # ps. this shouldn't actually happen because if its been
403
+ # removed from the set then we would get a key error.
404
+ self.starting_signal.notify()
405
+ except KeyError:
406
+ pass
210
407
 
211
408
  while True:
212
- time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
409
+ status_check_count += 1
410
+
411
+ # NOTE: if we are resuming from a controller failure, we only keep
412
+ # monitoring if the job is in RUNNING state. For all other cases,
413
+ # we will directly transit to recovering since we have no idea what
414
+ # the cluster status is.
415
+ force_transit_to_recovering = False
416
+ if is_resume:
417
+ prev_status = await (
418
+ managed_job_state.get_job_status_with_task_id_async(
419
+ job_id=self._job_id, task_id=task_id))
420
+
421
+ if prev_status is not None:
422
+ if prev_status.is_terminal():
423
+ logger.info(
424
+ f'Task {task_id} already in terminal state: '
425
+ f'{prev_status}')
426
+ return (prev_status ==
427
+ managed_job_state.ManagedJobStatus.SUCCEEDED)
428
+ if (prev_status ==
429
+ managed_job_state.ManagedJobStatus.CANCELLING):
430
+ # If the controller is down when cancelling the job,
431
+ # we re-raise the error to run the `_cleanup` function
432
+ # again to clean up any remaining resources.
433
+ logger.info(f'Task {task_id} was being cancelled, '
434
+ 're-raising cancellation')
435
+ raise asyncio.CancelledError()
436
+ if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
437
+ force_transit_to_recovering = True
438
+ # This resume logic should only be triggered once.
439
+ is_resume = False
440
+
441
+ await asyncio.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
213
442
 
214
443
  # Check the network connection to avoid false alarm for job failure.
215
444
  # Network glitch was observed even in the VM.
216
445
  try:
217
- backend_utils.check_network_connection()
446
+ await backend_utils.async_check_network_connection()
218
447
  except exceptions.NetworkError:
219
448
  logger.info('Network is not available. Retrying again in '
220
449
  f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
@@ -223,31 +452,63 @@ class JobsController:
223
452
 
224
453
  # NOTE: we do not check cluster status first because race condition
225
454
  # can occur, i.e. cluster can be down during the job status check.
226
- job_status = managed_job_utils.get_job_status(
227
- self._backend, cluster_name)
455
+ # NOTE: If fetching the job status fails or we force to transit to
456
+ # recovering, we will set the job status to None, which will force
457
+ # enter the recovering logic.
458
+ job_status = None
459
+ if not force_transit_to_recovering:
460
+ try:
461
+ job_status = await managed_job_utils.get_job_status(
462
+ self._backend,
463
+ cluster_name,
464
+ job_id=job_id_on_pool_cluster,
465
+ )
466
+ except exceptions.FetchClusterInfoError as fetch_e:
467
+ logger.info(
468
+ 'Failed to fetch the job status. Start recovery.\n'
469
+ f'Exception: {common_utils.format_exception(fetch_e)}\n'
470
+ f'Traceback: {traceback.format_exc()}')
228
471
 
229
472
  if job_status == job_lib.JobStatus.SUCCEEDED:
230
- success_end_time = managed_job_utils.try_to_get_job_end_time(
231
- self._backend, cluster_name)
473
+ logger.info(f'Task {task_id} succeeded! '
474
+ 'Getting end time and cleaning up')
475
+ try:
476
+ success_end_time = await context_utils.to_thread(
477
+ managed_job_utils.try_to_get_job_end_time,
478
+ self._backend, cluster_name, job_id_on_pool_cluster)
479
+ except Exception as e: # pylint: disable=broad-except
480
+ logger.warning(
481
+ f'Failed to get job end time: '
482
+ f'{common_utils.format_exception(e)}',
483
+ exc_info=True)
484
+ success_end_time = 0
485
+
232
486
  # The job is done. Set the job to SUCCEEDED first before start
233
487
  # downloading and streaming the logs to make it more responsive.
234
- managed_job_state.set_succeeded(self._job_id,
235
- task_id,
236
- end_time=success_end_time,
237
- callback_func=callback_func)
488
+ await managed_job_state.set_succeeded_async(
489
+ self._job_id,
490
+ task_id,
491
+ end_time=success_end_time,
492
+ callback_func=callback_func)
238
493
  logger.info(
239
494
  f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
240
495
  f'Cleaning up the cluster {cluster_name}.')
241
496
  try:
242
- clusters = backend_utils.get_clusters(
497
+ logger.info(f'Downloading logs on cluster {cluster_name} '
498
+ f'and job id {job_id_on_pool_cluster}.')
499
+ clusters = await context_utils.to_thread(
500
+ backend_utils.get_clusters,
243
501
  cluster_names=[cluster_name],
244
502
  refresh=common.StatusRefreshMode.NONE,
245
- all_users=True)
503
+ all_users=True,
504
+ _include_is_managed=True)
246
505
  if clusters:
247
506
  assert len(clusters) == 1, (clusters, cluster_name)
248
507
  handle = clusters[0].get('handle')
249
508
  # Best effort to download and stream the logs.
250
- self._download_log_and_stream(task_id, handle)
509
+ await context_utils.to_thread(
510
+ self._download_log_and_stream, task_id, handle,
511
+ job_id_on_pool_cluster)
251
512
  except Exception as e: # pylint: disable=broad-except
252
513
  # We don't want to crash here, so just log and continue.
253
514
  logger.warning(
@@ -256,7 +517,14 @@ class JobsController:
256
517
  exc_info=True)
257
518
  # Only clean up the cluster, not the storages, because tasks may
258
519
  # share storages.
259
- managed_job_utils.terminate_cluster(cluster_name=cluster_name)
520
+ await self._cleanup_cluster(cluster_name)
521
+
522
+ task_total_time = time.time() - task_start_time
523
+ monitoring_time = time.time() - monitoring_start_time
524
+ logger.info(f'Task {task_id} completed successfully in '
525
+ f'{task_total_time:.2f}s '
526
+ f'(monitoring time: {monitoring_time:.2f}s, '
527
+ f'status checks: {status_check_count})')
260
528
  return True
261
529
 
262
530
  # For single-node jobs, non-terminated job_status indicates a
@@ -272,7 +540,7 @@ class JobsController:
272
540
  if job_status in job_lib.JobStatus.user_code_failure_states():
273
541
  # Add a grace period before the check of preemption to avoid
274
542
  # false alarm for job failure.
275
- time.sleep(5)
543
+ await asyncio.sleep(5)
276
544
 
277
545
  # Pull the actual cluster status from the cloud provider to
278
546
  # determine whether the cluster is preempted or failed.
@@ -303,14 +571,19 @@ class JobsController:
303
571
  in job_lib.JobStatus.user_code_failure_states() or
304
572
  job_status == job_lib.JobStatus.FAILED_DRIVER):
305
573
  # The user code has probably crashed, fail immediately.
306
- end_time = managed_job_utils.try_to_get_job_end_time(
307
- self._backend, cluster_name)
574
+ logger.info(
575
+ f'Task {task_id} failed with status: {job_status}')
576
+ end_time = await context_utils.to_thread(
577
+ managed_job_utils.try_to_get_job_end_time,
578
+ self._backend, cluster_name, job_id_on_pool_cluster)
308
579
  logger.info(
309
580
  f'The user job failed ({job_status}). Please check the '
310
581
  'logs below.\n'
311
582
  f'== Logs of the user job (ID: {self._job_id}) ==\n')
312
583
 
313
- self._download_log_and_stream(task_id, handle)
584
+ await context_utils.to_thread(self._download_log_and_stream,
585
+ task_id, handle,
586
+ job_id_on_pool_cluster)
314
587
 
315
588
  failure_reason = (
316
589
  'To see the details, run: '
@@ -346,7 +619,9 @@ class JobsController:
346
619
  f'[{self._strategy_executor.restart_cnt_on_failure}'
347
620
  f'/{max_restarts}]')
348
621
  else:
349
- managed_job_state.set_failed(
622
+ logger.info(
623
+ f'Task {task_id} failed and will not be retried')
624
+ await managed_job_state.set_failed_async(
350
625
  self._job_id,
351
626
  task_id,
352
627
  failure_type=managed_job_status,
@@ -361,7 +636,7 @@ class JobsController:
361
636
  failure_reason = (
362
637
  f'Unknown job status {job_status}. To see the details, '
363
638
  f'run: sky jobs logs --controller {self._job_id}')
364
- managed_job_state.set_failed(
639
+ await managed_job_state.set_failed_async(
365
640
  self._job_id,
366
641
  task_id,
367
642
  failure_type=managed_job_state.ManagedJobStatus.
@@ -381,84 +656,131 @@ class JobsController:
381
656
  if handle is not None:
382
657
  resources = handle.launched_resources
383
658
  assert resources is not None, handle
384
- if resources.need_cleanup_after_preemption_or_failure():
659
+ # If we are forcing to transit to recovering, we need to clean
660
+ # up the cluster as it is possible that we already submitted the
661
+ # job to the worker cluster, but state is not updated yet. In
662
+ # this case, it is possible that we will double-submit the job
663
+ # to the worker cluster. So we always clean up the cluster here.
664
+ # TODO(tian,cooperc): We can check if there is a running job on
665
+ # the worker cluster, and if so, we can skip the cleanup.
666
+ # Challenge: race condition when the worker cluster thought it
667
+ # does not have a running job yet but later the job is launched.
668
+ if (resources.need_cleanup_after_preemption_or_failure() or
669
+ force_transit_to_recovering):
385
670
  # Some spot resource (e.g., Spot TPU VM) may need to be
386
671
  # cleaned up after preemption, as running launch again on
387
672
  # those clusters again may fail.
388
673
  logger.info('Cleaning up the preempted or failed cluster'
389
674
  '...')
390
- managed_job_utils.terminate_cluster(cluster_name)
675
+ await self._cleanup_cluster(cluster_name)
391
676
 
392
677
  # Try to recover the managed jobs, when the cluster is preempted or
393
678
  # failed or the job status is failed to be fetched.
394
- managed_job_state.set_recovering(job_id=self._job_id,
395
- task_id=task_id,
396
- callback_func=callback_func)
397
- recovered_time = self._strategy_executor.recover()
398
- managed_job_state.set_recovered(self._job_id,
399
- task_id,
400
- recovered_time=recovered_time,
401
- callback_func=callback_func)
402
-
403
- def run(self):
679
+ logger.info(f'Starting recovery for task {task_id}, '
680
+ f'it is currently {job_status}')
681
+ await managed_job_state.set_recovering_async(
682
+ job_id=self._job_id,
683
+ task_id=task_id,
684
+ force_transit_to_recovering=force_transit_to_recovering,
685
+ callback_func=callback_func)
686
+
687
+ recovered_time = await self._strategy_executor.recover()
688
+
689
+ if self._pool is not None:
690
+ cluster_name, job_id_on_pool_cluster = (
691
+ await
692
+ managed_job_state.get_pool_submit_info_async(self._job_id))
693
+ assert cluster_name is not None
694
+ await managed_job_state.set_recovered_async(
695
+ self._job_id,
696
+ task_id,
697
+ recovered_time=recovered_time,
698
+ callback_func=callback_func)
699
+
700
+    async def run(self):
         """Run controller logic and handle exceptions."""
+        logger.info(f'Starting JobsController run for job {self._job_id}')
         task_id = 0
+        cancelled = False
+
         try:
             succeeded = True
             # We support chain DAGs only for now.
             for task_id, task in enumerate(self._dag.tasks):
-                succeeded = self._run_one_task(task_id, task)
+                logger.info(
+                    f'Processing task {task_id}/{len(self._dag.tasks)-1}: '
+                    f'{task.name}')
+                task_start = time.time()
+                succeeded = await self._run_one_task(task_id, task)
+                task_time = time.time() - task_start
+                logger.info(f'Task {task_id} completed in {task_time:.2f}s '
+                            f'with success={succeeded}')
+
                 if not succeeded:
+                    logger.info(f'Task {task_id} failed, stopping execution')
                     break
+
         except exceptions.ProvisionPrechecksError as e:
             # Please refer to the docstring of self._run for the cases when
             # this exception can occur.
+            logger.error(f'Provision prechecks failed for task {task_id}')
             failure_reason = ('; '.join(
                 common_utils.format_exception(reason, use_bracket=True)
                 for reason in e.reasons))
             logger.error(failure_reason)
-            self._update_failed_task_state(
+            await self._update_failed_task_state(
                 task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
                 failure_reason)
         except exceptions.ManagedJobReachedMaxRetriesError as e:
             # Please refer to the docstring of self._run for the cases when
             # this exception can occur.
+            logger.error(f'Managed job reached max retries for task {task_id}')
             failure_reason = common_utils.format_exception(e)
             logger.error(failure_reason)
             # The managed job should be marked as FAILED_NO_RESOURCE, as the
             # managed job may be able to launch next time.
-            self._update_failed_task_state(
+            await self._update_failed_task_state(
                 task_id, managed_job_state.ManagedJobStatus.FAILED_NO_RESOURCE,
                 failure_reason)
+        except asyncio.CancelledError:  # pylint: disable=try-except-raise
+            # Keep this here to avoid getting caught by the general except
+            # block below.
+            cancelled = True
+            raise
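
The `cancelled` flag plus re-raise above is a standard asyncio pattern: the coroutine must let CancelledError propagate so the event loop can finish the cancellation, while remembering locally that terminal-state bookkeeping has to be deferred until resources are cleaned up. A minimal sketch of the same pattern, with hypothetical names:

    import asyncio

    async def run_with_flag():
        cancelled = False
        try:
            await asyncio.sleep(3600)  # stand-in for the real task loop
        except asyncio.CancelledError:
            cancelled = True
            raise  # let the event loop complete the cancellation
        finally:
            if not cancelled:
                # Only safe to record terminal state on a normal exit; a
                # cancelled run still has resources to clean up first.
                print('finished normally')
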
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
+            logger.error(
+                f'Unexpected error in JobsController run for task {task_id}')
             with ux_utils.enable_traceback():
                 logger.error(traceback.format_exc())
             msg = ('Unexpected error occurred: ' +
                    common_utils.format_exception(e, use_bracket=True))
             logger.error(msg)
-            self._update_failed_task_state(
+            await self._update_failed_task_state(
                 task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
                 msg)
         finally:
-            # This will set all unfinished tasks to CANCELLING, and will not
-            # affect the jobs in terminal states.
-            # We need to call set_cancelling before set_cancelled to make sure
-            # the table entries are correctly set.
             callback_func = managed_job_utils.event_callback_func(
                 job_id=self._job_id,
                 task_id=task_id,
                 task=self._dag.tasks[task_id])
-            managed_job_state.set_cancelling(job_id=self._job_id,
-                                             callback_func=callback_func)
-            managed_job_state.set_cancelled(job_id=self._job_id,
-                                            callback_func=callback_func)
+            await managed_job_state.set_cancelling_async(
+                job_id=self._job_id, callback_func=callback_func)
+            if not cancelled:
+                # The remaining tasks have not been run yet, so we can set
+                # them to cancelled immediately (no resources to clean up).
+                # If we are running and get cancelled, we need to clean up
+                # the resources first, so that is done later.
+                await managed_job_state.set_cancelled_async(
+                    job_id=self._job_id, callback_func=callback_func)
 
-    def _update_failed_task_state(
+    async def _update_failed_task_state(
             self, task_id: int,
             failure_type: managed_job_state.ManagedJobStatus,
             failure_reason: str):
         """Update the state of the failed task."""
-        managed_job_state.set_failed(
+        logger.info(f'Updating failed task state: task_id={task_id}, '
+                    f'failure_type={failure_type}')
+        await managed_job_state.set_failed_async(
             self._job_id,
             task_id=task_id,
             failure_type=failure_type,
@@ -469,176 +791,416 @@ class JobsController:
             task=self._dag.tasks[task_id]))
 
 
-def _run_controller(job_id: int, dag_yaml: str):
-    """Runs the controller in a remote process for interruption."""
-    # The controller needs to be instantiated in the remote process, since
-    # the controller is not serializable.
-    jobs_controller = JobsController(job_id, dag_yaml)
-    jobs_controller.run()
-
-
-def _handle_signal(job_id):
-    """Handle the signal if the user sent it."""
-    signal_file = pathlib.Path(
-        managed_job_utils.SIGNAL_FILE_PREFIX.format(job_id))
-    user_signal = None
-    if signal_file.exists():
-        # Filelock is needed to prevent race condition with concurrent
-        # signal writing.
-        with filelock.FileLock(str(signal_file) + '.lock'):
-            with signal_file.open(mode='r', encoding='utf-8') as f:
-                user_signal = f.read().strip()
-            try:
-                user_signal = managed_job_utils.UserSignal(user_signal)
-            except ValueError:
-                logger.warning(
-                    f'Unknown signal received: {user_signal}. Ignoring.')
-                user_signal = None
-        # Remove the signal file, after reading the signal.
-        signal_file.unlink()
-    if user_signal is None:
-        # None or empty string.
-        return
-    assert user_signal == managed_job_utils.UserSignal.CANCEL, (
-        f'Only cancel signal is supported, but {user_signal} got.')
-    raise exceptions.ManagedJobUserCancelledError(
-        f'User sent {user_signal.value} signal.')
-
-
-def _cleanup(job_id: int, dag_yaml: str):
-    """Clean up the cluster(s) and storages.
-
-    (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
-        to be cleaned up after the whole job is finished, as the tasks
-        may share the same storage.
-    (2) Clean up the cluster(s) that are not cleaned up yet, which can happen
-        when the task failed or cancelled. At most one cluster should be left
-        when reaching here, as we currently only support chain DAGs, and only
-        task is executed at a time.
+class ControllerManager:
+    """Main loop for a job controller process.
+
+    Many jobs will be handled by this manager, each by a single JobController.
     """
-    dag, _ = _get_dag_and_name(dag_yaml)
-    for task in dag.tasks:
-        assert task.name is not None, task
-        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, job_id)
-        managed_job_utils.terminate_cluster(cluster_name)
-
-    # Clean up Storages with persistent=False.
-    # TODO(zhwu): this assumes the specific backend.
-    backend = cloud_vm_ray_backend.CloudVmRayBackend()
-    # Need to re-construct storage object in the controller process
-    # because when SkyPilot API server machine sends the yaml config to the
-    # controller machine, only storage metadata is sent, not the storage
-    # object itself.
-    for storage in task.storage_mounts.values():
-        storage.construct()
-    backend.teardown_ephemeral_storage(task)
-
-    # Clean up any files mounted from the local disk, such as two-hop file
-    # mounts.
-    for file_mount in (task.file_mounts or {}).values():
+
+    def __init__(self, controller_uuid: str) -> None:
+        self._controller_uuid = controller_uuid
+        # Global state for active jobs
+        self.job_tasks: Dict[int, asyncio.Task] = {}
+        self.starting: Set[int] = set()
+
+        # Lock for synchronizing access to the global state dictionary.
+        # Must always hold _job_tasks_lock when accessing the _starting_signal.
+        self._job_tasks_lock = asyncio.Lock()
+        # We signal whenever a job leaves the api server launching state.
+        # Feel free to signal as much as you want to be safe from leaks;
+        # if we do not signal enough, some jobs may be left waiting to
+        # launch forever.
+        self._starting_signal = asyncio.Condition(lock=self._job_tasks_lock)
+
+        self._pid = os.getpid()
+
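
The `_starting_signal` condition variable above gates launch slots. A minimal sketch of the wait/notify protocol it relies on (names here are illustrative, not the actual SkyPilot helpers); note that `wait_for` re-checks the predicate on every wake-up, so extra notifications are harmless while missed ones can strand a waiter, which is why the comment says to signal liberally:

    import asyncio
    from typing import Set

    async def wait_for_launch_slot(cond: asyncio.Condition,
                                   starting: Set[int], limit: int) -> None:
        # Block until fewer than `limit` jobs are in the starting state.
        async with cond:
            await cond.wait_for(lambda: len(starting) < limit)

    async def mark_launch_done(cond: asyncio.Condition,
                               starting: Set[int], job_id: int) -> None:
        # Always signal when a job leaves the starting state.
        async with cond:
            starting.discard(job_id)
            cond.notify()
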
+    async def _cleanup(self, job_id: int, pool: Optional[str] = None):
+        """Clean up the cluster(s) and storages.
+
+        (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
+            to be cleaned up after the whole job is finished, as the tasks
+            may share the same storage.
+        (2) Clean up the cluster(s) that are not cleaned up yet, which can
+            happen when the task failed or cancelled. At most one cluster
+            should be left when reaching here, as we currently only support
+            chain DAGs, and only one task is executed at a time.
+        """
+        # Clean up the HA recovery script first, as it is possible that some
+        # error was raised when we constructed the task object (e.g.,
+        # sky.exceptions.ResourcesUnavailableError).
+        await managed_job_state.remove_ha_recovery_script_async(job_id)
+
+        def task_cleanup(task: 'sky.Task', job_id: int):
+            assert task.name is not None, task
+            error = None
+
             try:
-                if not data_utils.is_cloud_store_url(file_mount):
-                    path = os.path.expanduser(file_mount)
-                    if os.path.isdir(path):
-                        shutil.rmtree(path)
-                    else:
-                        os.remove(path)
+                if pool is None:
+                    cluster_name = (
+                        managed_job_utils.generate_managed_job_cluster_name(
+                            task.name, job_id))
+                    managed_job_utils.terminate_cluster(cluster_name)
+                    status = core.status(cluster_names=[cluster_name],
+                                         all_users=True)
+                    assert (len(status) == 0 or
+                            status[0]['status'] == sky.ClusterStatus.STOPPED), (
+                                f'{cluster_name} is not down: {status}')
+                    logger.info(f'{cluster_name} is down')
+                else:
+                    cluster_name, job_id_on_pool_cluster = (
+                        managed_job_state.get_pool_submit_info(job_id))
+                    if cluster_name is not None:
+                        if job_id_on_pool_cluster is not None:
+                            core.cancel(cluster_name=cluster_name,
+                                        job_ids=[job_id_on_pool_cluster],
+                                        _try_cancel_if_cluster_is_init=True)
             except Exception as e:  # pylint: disable=broad-except
+                error = e
+                logger.warning(
+                    f'Failed to terminate cluster {cluster_name}: {e}')
+                # We continue to try cleaning up whatever else we can.
+            # Clean up Storages with persistent=False.
+            # TODO(zhwu): this assumes the specific backend.
+            backend = cloud_vm_ray_backend.CloudVmRayBackend()
+            # Need to re-construct storage object in the controller process
+            # because when SkyPilot API server machine sends the yaml config to
+            # the controller machine, only storage metadata is sent, not the
+            # storage object itself.
+            try:
+                for storage in task.storage_mounts.values():
+                    storage.construct()
+            except (exceptions.StorageSpecError, exceptions.StorageError) as e:
                 logger.warning(
-                    f'Failed to clean up file mount {file_mount}: {e}')
+                    f'Failed to construct storage object for teardown: {e}\n'
+                    'This may happen because storage construction already '
+                    'failed during launch, storage was deleted externally, '
+                    'credentials expired/changed, or network connectivity '
+                    'issues.')
+            try:
+                backend.teardown_ephemeral_storage(task)
+            except Exception as e:  # pylint: disable=broad-except
+                error = e
+                logger.warning(f'Failed to teardown ephemeral storage: {e}')
+                # We continue to try cleaning up whatever else we can.
+
+            # Clean up any files mounted from the local disk, such as two-hop
+            # file mounts.
+            for file_mount in (task.file_mounts or {}).values():
+                try:
+                    # For consolidation mode, there are no two-hop file mounts
+                    # and the file path here represents the real user data.
+                    # We skip the cleanup for consolidation mode.
+                    if (not data_utils.is_cloud_store_url(file_mount) and
+                            not managed_job_utils.is_consolidation_mode()):
+                        path = os.path.expanduser(file_mount)
+                        if os.path.isdir(path):
+                            shutil.rmtree(path)
+                        else:
+                            os.remove(path)
+                except Exception as e:  # pylint: disable=broad-except
+                    logger.warning(
+                        f'Failed to clean up file mount {file_mount}: {e}')
 
+            if error is not None:
+                raise error
 
-def start(job_id, dag_yaml):
-    """Start the controller."""
-    controller_process = None
-    cancelling = False
-    task_id = None
-    try:
-        _handle_signal(job_id)
-        # TODO(suquark): In theory, we should make controller process a
-        # daemon process so it will be killed after this process exits,
-        # however daemon process cannot launch subprocesses, explained here:
-        # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.Process.daemon  # pylint: disable=line-too-long
-        # So we can only enable daemon after we no longer need to
-        # start daemon processes like Ray.
-        controller_process = multiprocessing.Process(target=_run_controller,
-                                                     args=(job_id, dag_yaml))
-        controller_process.start()
-        while controller_process.is_alive():
-            _handle_signal(job_id)
-            time.sleep(1)
-    except exceptions.ManagedJobUserCancelledError:
-        dag, _ = _get_dag_and_name(dag_yaml)
-        task_id, _ = managed_job_state.get_latest_task_id_status(job_id)
-        assert task_id is not None, job_id
-        logger.info(
-            f'Cancelling managed job, job_id: {job_id}, task_id: {task_id}')
-        managed_job_state.set_cancelling(
-            job_id=job_id,
-            callback_func=managed_job_utils.event_callback_func(
-                job_id=job_id, task_id=task_id, task=dag.tasks[task_id]))
-        cancelling = True
-    finally:
-        if controller_process is not None:
-            logger.info(f'Killing controller process {controller_process.pid}.')
-            # NOTE: it is ok to kill or join a killed process.
-            # Kill the controller process first; if its child process is
-            # killed first, then the controller process will raise errors.
-            # Kill any possible remaining children processes recursively.
-            subprocess_utils.kill_children_processes(
-                parent_pids=[controller_process.pid], force=True)
-            controller_process.join()
-            logger.info(f'Controller process {controller_process.pid} killed.')
-
-        logger.info(f'Cleaning up any cluster for job {job_id}.')
-        # NOTE: Originally, we send an interruption signal to the controller
-        # process and the controller process handles cleanup. However, we
-        # figure out the behavior differs from cloud to cloud
-        # (e.g., GCP ignores 'SIGINT'). A possible explanation is
-        # https://unix.stackexchange.com/questions/356408/strange-problem-with-trap-and-sigint
-        # But anyway, a clean solution is killing the controller process
-        # directly, and then cleanup the cluster job_state.
-        _cleanup(job_id, dag_yaml=dag_yaml)
-        logger.info(f'Cluster of managed job {job_id} has been cleaned up.')
-
-        if cancelling:
-            assert task_id is not None, job_id  # Since it's set with cancelling
-            managed_job_state.set_cancelled(
+        dag = _get_dag(job_id)
+        error = None
+        for task in dag.tasks:
+            # Most things in this function are blocking.
+            try:
+                await context_utils.to_thread(task_cleanup, task, job_id)
+            except Exception as e:  # pylint: disable=broad-except
+                error = e
+
+        if error is not None:
+            # We only raise the last error that occurred; it's fine to lose
+            # some data here.
+            raise error
+
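
Because `task_cleanup` performs blocking I/O (cluster termination, storage teardown, file removal), it is pushed onto a worker thread so the controller's event loop stays responsive. A stdlib-only sketch of the same best-effort pattern, using `asyncio.to_thread` in place of the context-preserving helper above (paths and names are illustrative):

    import asyncio
    import shutil

    async def best_effort_cleanup(paths):
        last_error = None
        for path in paths:
            try:
                # Offload the blocking removal to a worker thread.
                await asyncio.to_thread(shutil.rmtree, path)
            except Exception as e:  # keep going, remember the failure
                last_error = e
        if last_error is not None:
            # Surface only the last failure, as the method above does.
            raise last_error
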
+    # Use context.contextual to enable per-job output redirection and env var
+    # isolation.
+    @context.contextual_async
+    async def run_job_loop(self,
+                           job_id: int,
+                           log_file: str,
+                           pool: Optional[str] = None):
+        """Background task that runs the job loop."""
+        ctx = context.get()
+        assert ctx is not None, 'Context is not initialized'
+        ctx.redirect_log(pathlib.Path(log_file))
+
+        logger.info('Starting job loop for %s', job_id)
+        logger.info(' log_file=%s', log_file)
+        logger.info(' pool=%s', pool)
+        logger.info(f'From controller {self._controller_uuid}')
+        logger.info(f' pid={self._pid}')
+
+        env_content = file_content_utils.get_job_env_content(job_id)
+        if env_content:
+            try:
+                env_vars = dotenv.dotenv_values(stream=io.StringIO(env_content))
+                logger.info('Loading %d environment variables for job %s',
+                            len(env_vars), job_id)
+                if ctx is not None:
+                    for key, value in env_vars.items():
+                        if value is not None:
+                            ctx.override_envs({key: value})
+                            logger.debug('Set environment variable: %s=%s',
+                                         key, value)
+                    skypilot_config.reload_config()
+                else:  # pragma: no cover - defensive
+                    logger.error('Context is None, cannot set environment '
+                                 'variables')
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(
+                    'Failed to load environment variables for job %s: '
+                    '%s', job_id, e)
+
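
The env-loading step parses dotenv-format content held in memory rather than a file on disk. A minimal sketch of that call (python-dotenv's `dotenv_values` accepts a `stream` argument; the content string is illustrative):

    import io
    from dotenv import dotenv_values

    env_content = 'FOO=bar\nBAZ=qux\n'  # illustrative job env content
    env_vars = dotenv_values(stream=io.StringIO(env_content))
    assert env_vars == {'FOO': 'bar', 'BAZ': 'qux'}
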
+        cancelling = False
+        try:
+            controller = JobController(job_id, self.starting,
+                                       self._job_tasks_lock,
+                                       self._starting_signal, pool)
+
+            async with self._job_tasks_lock:
+                if job_id in self.job_tasks:
+                    logger.error(f'Job {job_id} already exists in job_tasks')
+                    raise ValueError(f'Job {job_id} already exists')
+
+                # Create the task and store it.
+                # This function should return instantly and run the job loop
+                # in the background.
+                task = asyncio.create_task(controller.run())
+                self.job_tasks[job_id] = task
+            await task
+        except asyncio.CancelledError:
+            logger.info(f'Job {job_id} was cancelled')
+            dag = _get_dag(job_id)
+            task_id, _ = await (
+                managed_job_state.get_latest_task_id_status_async(job_id))
+            assert task_id is not None, job_id
+            logger.info(f'Cancelling managed job, job_id: {job_id}, '
+                        f'task_id: {task_id}')
+            await managed_job_state.set_cancelling_async(
                 job_id=job_id,
                 callback_func=managed_job_utils.event_callback_func(
                     job_id=job_id, task_id=task_id, task=dag.tasks[task_id]))
+            cancelling = True
+            raise
+        except Exception as e:
+            logger.error(f'Unexpected error in job loop for {job_id}: '
+                         f'{common_utils.format_exception(e)}')
+            raise
+        finally:
+            try:
+                await self._cleanup(job_id, pool=pool)
+                logger.info(
+                    f'Cluster of managed job {job_id} has been cleaned up.')
+            except Exception as e:  # pylint: disable=broad-except
+                failure_reason = ('Failed to clean up: '
+                                  f'{common_utils.format_exception(e)}')
+                await managed_job_state.set_failed_async(
+                    job_id,
+                    task_id=None,
+                    failure_type=managed_job_state.ManagedJobStatus.
+                    FAILED_CONTROLLER,
+                    failure_reason=failure_reason,
+                    override_terminal=True)
+
+            if cancelling:
+                # Since it's set with cancelling.
+                assert task_id is not None, job_id
+                await managed_job_state.set_cancelled_async(
+                    job_id=job_id,
+                    callback_func=managed_job_utils.event_callback_func(
+                        job_id=job_id, task_id=task_id,
+                        task=dag.tasks[task_id]))
+
+            # We should check job status after 'set_cancelled', otherwise
+            # the job status is not terminal.
+            job_status = await managed_job_state.get_status_async(job_id)
+            assert job_status is not None
+            # The job can be non-terminal if the controller exited abnormally,
+            # e.g. failed to launch cluster after reaching the MAX_RETRY.
+            if not job_status.is_terminal():
+                logger.info(f'Previous job status: {job_status.value}')
+                await managed_job_state.set_failed_async(
+                    job_id,
+                    task_id=None,
+                    failure_type=managed_job_state.ManagedJobStatus.
+                    FAILED_CONTROLLER,
+                    failure_reason=(
+                        'Unexpected error occurred. For details, '
+                        f'run: sky jobs logs --controller {job_id}'))
+
+            await scheduler.job_done_async(job_id)
+
+            async with self._job_tasks_lock:
+                try:
+                    # Just in case we were cancelled or some other error
+                    # occurred during launch.
+                    self.starting.remove(job_id)
+                    # It's fine if we notify again: better to wake someone up
+                    # and have them go back to sleep than to leave someone
+                    # stuck sleeping.
+                    self._starting_signal.notify()
+                except KeyError:
+                    pass
+
+            # Remove the job from the job_tasks dictionary.
+            async with self._job_tasks_lock:
+                if job_id in self.job_tasks:
+                    del self.job_tasks[job_id]
+
+    async def start_job(
+        self,
+        job_id: int,
+        pool: Optional[str] = None,
+    ):
+        """Start a new job.
+
+        Args:
+            job_id: The ID of the job to start.
+        """
+        # Create log file path for job output redirection
+        log_dir = os.path.expanduser(jobs_constants.JOBS_CONTROLLER_LOGS_DIR)
+        os.makedirs(log_dir, exist_ok=True)
+        log_file = os.path.join(log_dir, f'{job_id}.log')
+
+        logger.info(f'Starting job {job_id} with log_file={log_file}')
+
+        async with self._job_tasks_lock:
+            self.starting.add(job_id)
+        await create_background_task(self.run_job_loop(job_id, log_file, pool))
+
+        logger.info(f'Job {job_id} started successfully')
+
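
`create_background_task` is defined elsewhere in this module; the essential requirement is the usual asyncio one: keep a strong reference to the created task, or the event loop may garbage-collect it mid-flight. A hedged sketch of what such a helper typically looks like (the body is illustrative, not the actual implementation):

    import asyncio
    from typing import Coroutine, Set

    _background_tasks: Set[asyncio.Task] = set()

    async def create_background_task(coro: Coroutine) -> None:
        task = asyncio.create_task(coro)
        _background_tasks.add(task)  # strong reference until done
        task.add_done_callback(_background_tasks.discard)
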
+    async def cancel_job(self):
+        """Poll for cancel signals and cancel the corresponding jobs."""
+        while True:
+            cancels = os.listdir(jobs_constants.CONSOLIDATED_SIGNAL_PATH)
+            for cancel in cancels:
+                async with self._job_tasks_lock:
+                    job_id = int(cancel)
+                    if job_id in self.job_tasks:
+                        logger.info(f'Cancelling job {job_id}')
+
+                        task = self.job_tasks[job_id]
+
+                        # Run the cancellation in the background, so we can
+                        # return immediately.
+                        task.cancel()
+                        logger.info(f'Job {job_id} cancelled successfully')
+
+                        os.remove(f'{jobs_constants.CONSOLIDATED_SIGNAL_PATH}/'
+                                  f'{job_id}')
+            await asyncio.sleep(15)
+
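
The cancel path is a simple file-based protocol: a file named after the job id, dropped into the consolidated signal directory, is treated as a cancellation request and removed once handled. A hedged sketch of the producer side (the directory constant stands in for `jobs_constants.CONSOLIDATED_SIGNAL_PATH`; real requests go through the SkyPilot CLI/API, not this function):

    import pathlib

    SIGNAL_DIR = pathlib.Path('/path/to/consolidated_signals')  # illustrative

    def request_cancel(job_id: int) -> None:
        SIGNAL_DIR.mkdir(parents=True, exist_ok=True)
        # An empty file named after the job id; the controller's poll loop
        # (every 15 seconds above) picks it up and cancels the asyncio task.
        (SIGNAL_DIR / str(job_id)).touch()
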
+    async def monitor_loop(self):
+        """Monitor the job loop."""
+        logger.info(f'Starting monitor loop for pid {os.getpid()}...')
+
+        while True:
+            async with self._job_tasks_lock:
+                running_tasks = [
+                    task for task in self.job_tasks.values() if not task.done()
+                ]
+
+            async with self._job_tasks_lock:
+                starting_count = len(self.starting)
+
+            if starting_count >= scheduler.LAUNCHES_PER_WORKER:
+                # Launching a job takes around 1 minute, so let's wait half
+                # that time.
+                await asyncio.sleep(30)
+                continue
+
+            # Normally, 200 jobs can run on each controller. But if we have a
+            # ton of controllers, we need to limit the number of jobs that can
+            # run on each controller, to achieve a total of 2000 jobs across
+            # all controllers.
+            max_jobs = min(scheduler.MAX_JOBS_PER_WORKER,
+                           (scheduler.MAX_TOTAL_RUNNING_JOBS //
+                            scheduler.get_number_of_controllers()))
 
-        # We should check job status after 'set_cancelled', otherwise
-        # the job status is not terminal.
-        job_status = managed_job_state.get_status(job_id)
-        assert job_status is not None
-        # The job can be non-terminal if the controller exited abnormally,
-        # e.g. failed to launch cluster after reaching the MAX_RETRY.
-        if not job_status.is_terminal():
-            logger.info(f'Previous job status: {job_status.value}')
-            managed_job_state.set_failed(
-                job_id,
-                task_id=None,
-                failure_type=managed_job_state.ManagedJobStatus.
-                FAILED_CONTROLLER,
-                failure_reason=('Unexpected error occurred. For details, '
-                                f'run: sky jobs logs --controller {job_id}'))
-
-        scheduler.job_done(job_id)
+            if len(running_tasks) >= max_jobs:
+                logger.info('Too many jobs running, waiting for 60 seconds')
+                await asyncio.sleep(60)
+                continue
+
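
To make the capacity math above concrete, here is a worked example using the numbers from the comment (200 jobs per worker, 2000 total); the real constants live in the scheduler module:

    MAX_JOBS_PER_WORKER = 200       # per-controller cap (from the comment)
    MAX_TOTAL_RUNNING_JOBS = 2000   # global cap across all controllers

    def max_jobs(n_controllers: int) -> int:
        return min(MAX_JOBS_PER_WORKER,
                   MAX_TOTAL_RUNNING_JOBS // n_controllers)

    assert max_jobs(1) == 200   # few controllers: per-worker cap binds
    assert max_jobs(50) == 40   # many controllers: global cap dominates
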
+            # Check if there are any jobs that are waiting to launch.
+            try:
+                waiting_job = await managed_job_state.get_waiting_job_async(
+                    pid=-os.getpid())
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Failed to get waiting job: {e}')
+                await asyncio.sleep(5)
+                continue
+
+            if waiting_job is None:
+                logger.info('No waiting job, waiting for 10 seconds')
+                await asyncio.sleep(10)
+                continue
+
+            logger.info(f'Claiming job {waiting_job["job_id"]}')
+            job_id = waiting_job['job_id']
+            pool = waiting_job.get('pool', None)
+
+            cancels = os.listdir(jobs_constants.CONSOLIDATED_SIGNAL_PATH)
+            if str(job_id) in cancels:
+                status = await managed_job_state.get_status_async(job_id)
+                if status == managed_job_state.ManagedJobStatus.PENDING:
+                    logger.info(f'Job {job_id} cancelled')
+                    os.remove(f'{jobs_constants.CONSOLIDATED_SIGNAL_PATH}/'
+                              f'{job_id}')
+                    await managed_job_state.set_cancelling_async(
+                        job_id=job_id,
+                        callback_func=managed_job_utils.event_callback_func(
+                            job_id=job_id, task_id=None, task=None))
+                    await managed_job_state.set_cancelled_async(
+                        job_id=job_id,
+                        callback_func=managed_job_utils.event_callback_func(
+                            job_id=job_id, task_id=None, task=None))
+                    continue
+
+            await self.start_job(job_id, pool)
+
+
+async def main(controller_uuid: str):
+    logger.info(f'Starting controller {controller_uuid}')
+
+    context_utils.hijack_sys_attrs()
+
+    controller = ControllerManager(controller_uuid)
+
+    # This may happen multiple times; that is fine.
+    os.makedirs(jobs_constants.CONSOLIDATED_SIGNAL_PATH, exist_ok=True)
+
+    # Increase the number of files we can open.
+    soft = None
+    try:
+        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+        logger.info(f'Current rlimits for NOFILE: soft={soft}, hard={hard}')
+        logger.info(f'Increasing soft limit to {hard}')
+        resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
+    except OSError as e:
+        logger.warning(f'Failed to increase number of files we can open: {e}\n'
+                       f'Current soft limit: {soft}, hard limit: {hard}')
+
+    # These loop forever, so run them in the background.
+    cancel_job_task = asyncio.create_task(controller.cancel_job())
+    monitor_loop_task = asyncio.create_task(controller.monitor_loop())
+    # Run the garbage collector in a dedicated daemon thread to avoid affecting
+    # the main event loop.
+    gc_thread = threading.Thread(target=log_gc.elect_for_log_gc, daemon=True)
+    gc_thread.start()
+    try:
+        await asyncio.gather(cancel_job_task, monitor_loop_task)
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Controller server crashed: {e}')
+        sys.exit(1)
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--job-id',
-                        required=True,
-                        type=int,
-                        help='Job id for the controller job.')
-    parser.add_argument('dag_yaml',
-                        type=str,
-                        help='The path to the user job yaml file.')
-    args = parser.parse_args()
-    # We start process with 'spawn', because 'fork' could result in weird
-    # behaviors; 'spawn' is also cross-platform.
-    multiprocessing.set_start_method('spawn', force=True)
-    start(args.job_id, args.dag_yaml)
+    asyncio.run(main(sys.argv[1]))