skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/controller.py CHANGED
@@ -1,31 +1,32 @@
- """Controller: handles the life cycle of a managed job.
-
- TODO(cooperc): Document lifecycle, and multiprocess layout.
+ """Controller: handles scheduling and the life cycle of a managed job.
  """
- import argparse
- import multiprocessing
+ import asyncio
+ import io
  import os
  import pathlib
+ import resource
  import shutil
+ import sys
+ import threading
  import time
  import traceback
  import typing
- from typing import Optional, Tuple
+ from typing import Dict, Optional, Set
 
- import filelock
+ import dotenv
 
- # This import ensures backward compatibility. Controller processes may not have
- # imported this module initially, but will attempt to import it during job
- # termination on the fly. If a job was launched with an old SkyPilot runtime
- # and a new job is launched with a newer runtime, the old job's termination
- # will try to import code from a different SkyPilot runtime, causing exceptions.
- # pylint: disable=unused-import
+ import sky
  from sky import core
  from sky import exceptions
  from sky import sky_logging
+ from sky import skypilot_config
+ from sky.adaptors import common as adaptors_common
  from sky.backends import backend_utils
  from sky.backends import cloud_vm_ray_backend
  from sky.data import data_utils
+ from sky.jobs import constants as jobs_constants
+ from sky.jobs import file_content_utils
+ from sky.jobs import log_gc
  from sky.jobs import recovery_strategy
  from sky.jobs import scheduler
  from sky.jobs import state as managed_job_state
@@ -33,38 +34,128 @@ from sky.jobs import utils as managed_job_utils
  from sky.skylet import constants
  from sky.skylet import job_lib
  from sky.usage import usage_lib
+ from sky.utils import annotations
  from sky.utils import common
  from sky.utils import common_utils
+ from sky.utils import context
+ from sky.utils import context_utils
  from sky.utils import controller_utils
  from sky.utils import dag_utils
  from sky.utils import status_lib
- from sky.utils import subprocess_utils
  from sky.utils import ux_utils
 
  if typing.TYPE_CHECKING:
- import sky
+ import psutil
+ else:
+ psutil = adaptors_common.LazyImport('psutil')
 
- # Use the explicit logger name so that the logger is under the
- # `sky.jobs.controller` namespace when executed directly, so as
- # to inherit the setup from the `sky` logger.
  logger = sky_logging.init_logger('sky.jobs.controller')
 
+ _background_tasks: Set[asyncio.Task] = set()
+ _background_tasks_lock: asyncio.Lock = asyncio.Lock()
+
+
+ async def create_background_task(coro: typing.Coroutine) -> None:
+ """Create a background task and add it to the set of background tasks.
+
+ Main reason we do this is since tasks are only held as a weak reference in
+ the executor, we need to keep a strong reference to the task to avoid it
+ being garbage collected.
+
+ Args:
+ coro: The coroutine to create a task for.
+ """
+ async with _background_tasks_lock:
+ task = asyncio.create_task(coro)
+ _background_tasks.add(task)
+ # TODO(cooperc): Discard needs a lock?
+ task.add_done_callback(_background_tasks.discard)
+
+
+ # Make sure to limit the size as we don't want to cache too many DAGs in memory.
+ @annotations.lru_cache(scope='global', maxsize=50)
+ def _get_dag(job_id: int) -> 'sky.Dag':
+ dag_content = file_content_utils.get_job_dag_content(job_id)
+ if dag_content is None:
+ raise RuntimeError('Managed job DAG YAML content is unavailable for '
+ f'job {job_id}. This can happen if the job was '
+ 'submitted before file migration completed or if '
+ 'the submission failed to persist the DAG. Please '
+ 're-submit the job.')
+
+ dag = dag_utils.load_chain_dag_from_yaml_str(dag_content)
+ assert dag.name is not None, dag
+ return dag
+
 
- def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
- dag = dag_utils.load_chain_dag_from_yaml(dag_yaml)
- dag_name = dag.name
- assert dag_name is not None, dag
- return dag, dag_name
+ class JobController:
+ """Controls the lifecycle of a single managed job.
 
+ This controller executes the chain DAG recorded for the job by:
+ - Loading the DAG and preparing per-task environment variables so each task
+ has a stable global job identifier across recoveries.
+ - Launching the task on the configured backend (``CloudVmRayBackend``),
+ optionally via a pool.
+ - Persisting state transitions to the managed jobs state store
+ (e.g., STARTING → RUNNING → SUCCEEDED/FAILED/CANCELLED).
+ - Monitoring execution, downloading/streaming logs, detecting failures or
+ preemptions, and invoking recovery through
+ ``recovery_strategy.StrategyExecutor``.
+ - Cleaning up clusters and ephemeral resources when tasks finish.
 
- class JobsController:
- """Each jobs controller manages the life cycle of one managed job."""
+ Concurrency and coordination:
+ - Runs inside an ``asyncio`` event loop.
+ - Shares a ``starting`` set, guarded by ``starting_lock`` and signaled via
+ ``starting_signal``, to throttle concurrent launches across jobs that the
+ top-level ``Controller`` manages.
+
+ Key attributes:
+ - ``_job_id``: Integer identifier of this managed job.
+ - ``_dag`` / ``_dag_name``: The job definition and metadata loaded from the
+ database-backed job YAML.
+ - ``_backend``: Backend used to launch and manage clusters.
+ - ``_pool``: Optional pool name if using a pool.
+ - ``starting`` / ``starting_lock`` / ``starting_signal``: Shared scheduler
+ coordination primitives. ``starting_lock`` must be used for accessing
+ ``starting_signal`` and ``starting``
+ - ``_strategy_executor``: Recovery/launch strategy executor (created per
+ task).
+ """
+
+ def __init__(
+ self,
+ job_id: int,
+ starting: Set[int],
+ starting_lock: asyncio.Lock,
+ starting_signal: asyncio.Condition,
+ pool: Optional[str] = None,
+ ) -> None:
+ """Initialize a ``JobsController``.
+
+ Args:
+ job_id: Integer ID of the managed job.
+ starting: Shared set of job IDs currently in the STARTING phase,
+ used to limit concurrent launches.
+ starting_lock: ``asyncio.Lock`` guarding access to the shared
+ scheduler state (e.g., the ``starting`` set).
+ starting_signal: ``asyncio.Condition`` used to notify when a job
+ exits STARTING so more jobs can be admitted.
+ pool: Optional pool name. When provided, the job is
+ submitted to the pool rather than launching a dedicated
+ cluster.
+ """
+
+ self.starting = starting
+ self.starting_lock = starting_lock
+ self.starting_signal = starting_signal
+
+ logger.info('Initializing JobsController for job_id=%s', job_id)
 
- def __init__(self, job_id: int, dag_yaml: str, pool: Optional[str]) -> None:
  self._job_id = job_id
- self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
- logger.info(self._dag)
- # TODO(zhwu): this assumes the specific backend.
+ self._dag = _get_dag(job_id)
+ self._dag_name = self._dag.name
+ logger.info(f'Loaded DAG: {self._dag}')
+
  self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
  self._pool = pool
 
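Editor's note: the new create_background_task helper above exists because an asyncio event loop keeps only weak references to the tasks it creates, so a fire-and-forget task can be garbage collected before it finishes. A minimal, self-contained sketch of that same pattern using plain asyncio (the spawn name and the sleep durations are illustrative only, not part of SkyPilot):

import asyncio
from typing import Coroutine, Set

# The event loop holds only weak references to tasks, so keep strong
# references here until each task completes.
_tasks: Set[asyncio.Task] = set()


def spawn(coro: Coroutine) -> asyncio.Task:
    task = asyncio.create_task(coro)
    _tasks.add(task)                        # strong reference while running
    task.add_done_callback(_tasks.discard)  # released once the task is done
    return task


async def main() -> None:
    spawn(asyncio.sleep(0.1))   # stand-in for real background work
    await asyncio.sleep(0.2)    # give the background task time to finish


asyncio.run(main())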
@@ -84,6 +175,7 @@ class JobsController:
  # dag_utils.maybe_infer_and_fill_dag_and_task_names.
  assert task_name is not None, self._dag
  task_name = f'{self._dag_name}_{task_name}'
+
  job_id_env_var = common_utils.get_global_job_id(
  self._backend.run_timestamp,
  f'{task_name}',
@@ -102,7 +194,7 @@ class JobsController:
  def _download_log_and_stream(
  self,
  task_id: Optional[int],
- handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle],
+ handle: Optional['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
  job_id_on_pool_cluster: Optional[int],
  ) -> None:
  """Downloads and streams the logs of the current job with given task ID.
@@ -115,6 +207,7 @@ class JobsController:
  logger.info(f'Cluster for job {self._job_id} is not found. '
  'Skipping downloading and streaming the logs.')
  return
+
  managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
  'managed_jobs',
  f'job-id-{self._job_id}')
@@ -125,19 +218,25 @@ class JobsController:
  job_ids=[str(job_id_on_pool_cluster)]
  if job_id_on_pool_cluster is not None else None)
  if log_file is not None:
- # Set the path of the log file for the current task, so it can be
- # accessed even after the job is finished
+ # Set the path of the log file for the current task, so it can
+ # be accessed even after the job is finished
  managed_job_state.set_local_log_file(self._job_id, task_id,
  log_file)
+ else:
+ logger.warning(
+ f'No log file was downloaded for job {self._job_id}, '
+ f'task {task_id}')
+
  logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
 
- def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
+ async def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
  if cluster_name is None:
  return
  if self._pool is None:
- managed_job_utils.terminate_cluster(cluster_name)
+ await context_utils.to_thread(managed_job_utils.terminate_cluster,
+ cluster_name)
 
- def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
+ async def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
  """Busy loop monitoring cluster status and handling recovery.
 
  When the task is successfully completed, this function returns True,
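Editor's note: _cleanup_cluster above (and several calls later in the diff) becomes async and pushes the blocking work, such as managed_job_utils.terminate_cluster, onto a worker thread via SkyPilot's context_utils.to_thread, so a slow cloud API call does not stall the event loop monitoring other jobs. A rough standard-library-only equivalent of that offloading, assuming a made-up terminate_cluster stand-in:

import asyncio
import time


def terminate_cluster(cluster_name: str) -> None:
    # Stand-in for a slow, blocking cloud API call.
    time.sleep(1)
    print(f'{cluster_name} terminated')


async def cleanup_cluster(cluster_name: str) -> None:
    # Offload the blocking call to a thread so the event loop stays responsive.
    await asyncio.to_thread(terminate_cluster, cluster_name)


asyncio.run(cleanup_cluster('demo-cluster'))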
@@ -172,9 +271,14 @@ class JobsController:
  3. Any unexpected error happens during the `sky.launch`.
  Other exceptions may be raised depending on the backend.
  """
+ task_start_time = time.time()
+ logger.info(
+ f'Starting task {task_id} ({task.name}) for job {self._job_id}')
 
  latest_task_id, last_task_prev_status = (
- managed_job_state.get_latest_task_id_status(self._job_id))
+ await
+ managed_job_state.get_latest_task_id_status_async(self._job_id))
+
  is_resume = False
  if (latest_task_id is not None and last_task_prev_status !=
  managed_job_state.ManagedJobStatus.PENDING):
@@ -186,24 +290,30 @@ class JobsController:
  if latest_task_id == task_id:
  # Start recovery.
  is_resume = True
+ logger.info(f'Resuming task {task_id} from previous execution')
 
  callback_func = managed_job_utils.event_callback_func(
  job_id=self._job_id, task_id=task_id, task=task)
+
  if task.run is None:
  logger.info(f'Skip running task {task_id} ({task.name}) due to its '
  'run commands being empty.')
  # Call set_started first to initialize columns in the state table,
  # including start_at and last_recovery_at to avoid issues for
  # uninitialized columns.
- managed_job_state.set_started(job_id=self._job_id,
- task_id=task_id,
- start_time=time.time(),
- callback_func=callback_func)
- managed_job_state.set_succeeded(job_id=self._job_id,
- task_id=task_id,
- end_time=time.time(),
- callback_func=callback_func)
+ await managed_job_state.set_started_async(
+ job_id=self._job_id,
+ task_id=task_id,
+ start_time=time.time(),
+ callback_func=callback_func)
+ await managed_job_state.set_succeeded_async(
+ job_id=self._job_id,
+ task_id=task_id,
+ end_time=time.time(),
+ callback_func=callback_func)
+ logger.info(f'Empty task {task_id} marked as succeeded immediately')
  return True
+
  usage_lib.messages.usage.update_task_id(task_id)
  task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
  assert task.name is not None, task
@@ -214,19 +324,22 @@ class JobsController:
  task.name, self._job_id) if self._pool is None else None
  self._strategy_executor = recovery_strategy.StrategyExecutor.make(
  cluster_name, self._backend, task, self._job_id, task_id,
- self._pool)
+ self._pool, self.starting, self.starting_lock, self.starting_signal)
  if not is_resume:
  submitted_at = time.time()
  if task_id == 0:
  submitted_at = backend_utils.get_timestamp_from_run_timestamp(
  self._backend.run_timestamp)
- managed_job_state.set_starting(
+
+ resources_str = backend_utils.get_task_resources_str(
+ task, is_managed_job=True)
+
+ await managed_job_state.set_starting_async(
  self._job_id,
  task_id,
  self._backend.run_timestamp,
  submitted_at,
- resources_str=backend_utils.get_task_resources_str(
- task, is_managed_job=True),
+ resources_str=resources_str,
  specs={
  'max_restarts_on_errors':
  self._strategy_executor.max_restarts_on_errors
@@ -242,33 +355,80 @@ class JobsController:
  # failure. Otherwise, we will transit to recovering immediately.
  remote_job_submitted_at = time.time()
  if not is_resume:
- remote_job_submitted_at = self._strategy_executor.launch()
+ launch_start = time.time()
+
+ # Run the launch in a separate thread to avoid blocking the event
+ # loop. The scheduler functions used internally already have their
+ # own file locks.
+ remote_job_submitted_at = await self._strategy_executor.launch()
+
+ launch_time = time.time() - launch_start
+ logger.info(f'Cluster launch completed in {launch_time:.2f}s')
  assert remote_job_submitted_at is not None, remote_job_submitted_at
  if self._pool is None:
  job_id_on_pool_cluster = None
  else:
- # Update the cluster name when using cluster pool.
+ # Update the cluster name when using pool.
  cluster_name, job_id_on_pool_cluster = (
- managed_job_state.get_pool_submit_info(self._job_id))
+ await
+ managed_job_state.get_pool_submit_info_async(self._job_id))
+ if cluster_name is None:
+ # Check if we have been cancelled here, in the case where a user
+ # quickly cancels the job we want to gracefully handle it here,
+ # otherwise we will end up in the FAILED_CONTROLLER state.
+ logger.info(f'Cluster name is None for job {self._job_id}, '
+ f'task {task_id}. Checking if we have been '
+ 'cancelled.')
+ status = await (managed_job_state.get_job_status_with_task_id_async(
+ job_id=self._job_id, task_id=task_id))
+ logger.debug(f'Status for job {self._job_id}, task {task_id}:'
+ f'{status}')
+ if status == managed_job_state.ManagedJobStatus.CANCELLED:
+ logger.info(f'Job {self._job_id}, task {task_id} has '
+ 'been quickly cancelled.')
+ raise asyncio.CancelledError()
  assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)
 
  if not is_resume:
- managed_job_state.set_started(job_id=self._job_id,
- task_id=task_id,
- start_time=remote_job_submitted_at,
- callback_func=callback_func)
+ await managed_job_state.set_started_async(
+ job_id=self._job_id,
+ task_id=task_id,
+ start_time=remote_job_submitted_at,
+ callback_func=callback_func)
+
+ monitoring_start_time = time.time()
+ status_check_count = 0
+
+ async with self.starting_lock:
+ try:
+ self.starting.remove(self._job_id)
+ # its fine if we notify again, better to wake someone up
+ # and have them go to sleep again, then have some stuck
+ # sleeping.
+ # ps. this shouldn't actually happen because if its been
+ # removed from the set then we would get a key error.
+ self.starting_signal.notify()
+ except KeyError:
+ pass
 
  while True:
+ status_check_count += 1
+
  # NOTE: if we are resuming from a controller failure, we only keep
  # monitoring if the job is in RUNNING state. For all other cases,
  # we will directly transit to recovering since we have no idea what
  # the cluster status is.
  force_transit_to_recovering = False
  if is_resume:
- prev_status = managed_job_state.get_job_status_with_task_id(
- job_id=self._job_id, task_id=task_id)
+ prev_status = await (
+ managed_job_state.get_job_status_with_task_id_async(
+ job_id=self._job_id, task_id=task_id))
+
  if prev_status is not None:
  if prev_status.is_terminal():
+ logger.info(
+ f'Task {task_id} already in terminal state: '
+ f'{prev_status}')
  return (prev_status ==
  managed_job_state.ManagedJobStatus.SUCCEEDED)
  if (prev_status ==
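Editor's note: the starting set plus starting_lock/starting_signal seen in this hunk implement a simple admission control: a job sits in the shared set while it launches and, once it leaves the STARTING phase, removes itself and notifies a waiter so another launch can proceed. A self-contained sketch of that coordination with asyncio.Condition; the concurrency limit of 2 and the sleep are illustrative assumptions, not SkyPilot's actual values:

import asyncio
from typing import Set

MAX_STARTING = 2  # illustrative launch-concurrency limit

starting: Set[int] = set()
lock = asyncio.Lock()
signal = asyncio.Condition(lock=lock)


async def launch(job_id: int) -> None:
    async with signal:
        # Wait until there is room to enter the STARTING phase.
        await signal.wait_for(lambda: len(starting) < MAX_STARTING)
        starting.add(job_id)
    try:
        await asyncio.sleep(0.1)  # stand-in for the actual launch
    finally:
        async with signal:
            starting.discard(job_id)
            # Extra notifies are harmless; too few can leave jobs waiting forever.
            signal.notify()


async def main() -> None:
    await asyncio.gather(*(launch(i) for i in range(5)))


asyncio.run(main())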
@@ -276,19 +436,20 @@ class JobsController:
  # If the controller is down when cancelling the job,
  # we re-raise the error to run the `_cleanup` function
  # again to clean up any remaining resources.
- raise exceptions.ManagedJobUserCancelledError(
- 'Recovering cancel signal.')
+ logger.info(f'Task {task_id} was being cancelled, '
+ 're-raising cancellation')
+ raise asyncio.CancelledError()
  if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
  force_transit_to_recovering = True
  # This resume logic should only be triggered once.
  is_resume = False
 
- time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
+ await asyncio.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
 
  # Check the network connection to avoid false alarm for job failure.
  # Network glitch was observed even in the VM.
  try:
- backend_utils.check_network_connection()
+ await backend_utils.async_check_network_connection()
  except exceptions.NetworkError:
  logger.info('Network is not available. Retrying again in '
  f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
@@ -303,10 +464,11 @@ class JobsController:
  job_status = None
  if not force_transit_to_recovering:
  try:
- job_status = managed_job_utils.get_job_status(
+ job_status = await managed_job_utils.get_job_status(
  self._backend,
  cluster_name,
- job_id=job_id_on_pool_cluster)
+ job_id=job_id_on_pool_cluster,
+ )
  except exceptions.FetchClusterInfoError as fetch_e:
  logger.info(
  'Failed to fetch the job status. Start recovery.\n'
@@ -314,21 +476,34 @@ class JobsController:
  f'Traceback: {traceback.format_exc()}')
 
  if job_status == job_lib.JobStatus.SUCCEEDED:
- success_end_time = managed_job_utils.try_to_get_job_end_time(
- self._backend, cluster_name, job_id_on_pool_cluster)
+ logger.info(f'Task {task_id} succeeded! '
+ 'Getting end time and cleaning up')
+ try:
+ success_end_time = await context_utils.to_thread(
+ managed_job_utils.try_to_get_job_end_time,
+ self._backend, cluster_name, job_id_on_pool_cluster)
+ except Exception as e: # pylint: disable=broad-except
+ logger.warning(
+ f'Failed to get job end time: '
+ f'{common_utils.format_exception(e)}',
+ exc_info=True)
+ success_end_time = 0
+
  # The job is done. Set the job to SUCCEEDED first before start
  # downloading and streaming the logs to make it more responsive.
- managed_job_state.set_succeeded(self._job_id,
- task_id,
- end_time=success_end_time,
- callback_func=callback_func)
+ await managed_job_state.set_succeeded_async(
+ self._job_id,
+ task_id,
+ end_time=success_end_time,
+ callback_func=callback_func)
  logger.info(
  f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
  f'Cleaning up the cluster {cluster_name}.')
  try:
  logger.info(f'Downloading logs on cluster {cluster_name} '
  f'and job id {job_id_on_pool_cluster}.')
- clusters = backend_utils.get_clusters(
+ clusters = await context_utils.to_thread(
+ backend_utils.get_clusters,
  cluster_names=[cluster_name],
  refresh=common.StatusRefreshMode.NONE,
  all_users=True,
@@ -337,8 +512,9 @@ class JobsController:
  assert len(clusters) == 1, (clusters, cluster_name)
  handle = clusters[0].get('handle')
  # Best effort to download and stream the logs.
- self._download_log_and_stream(task_id, handle,
- job_id_on_pool_cluster)
+ await context_utils.to_thread(
+ self._download_log_and_stream, task_id, handle,
+ job_id_on_pool_cluster)
  except Exception as e: # pylint: disable=broad-except
  # We don't want to crash here, so just log and continue.
  logger.warning(
@@ -347,7 +523,14 @@ class JobsController:
  exc_info=True)
  # Only clean up the cluster, not the storages, because tasks may
  # share storages.
- self._cleanup_cluster(cluster_name)
+ await self._cleanup_cluster(cluster_name)
+
+ task_total_time = time.time() - task_start_time
+ monitoring_time = time.time() - monitoring_start_time
+ logger.info(f'Task {task_id} completed successfully in '
+ f'{task_total_time:.2f}s '
+ f'(monitoring time: {monitoring_time:.2f}s, '
+ f'status checks: {status_check_count})')
  return True
 
  # For single-node jobs, non-terminated job_status indicates a
@@ -363,7 +546,7 @@ class JobsController:
  if job_status in job_lib.JobStatus.user_code_failure_states():
  # Add a grace period before the check of preemption to avoid
  # false alarm for job failure.
- time.sleep(5)
+ await asyncio.sleep(5)
 
  # Pull the actual cluster status from the cloud provider to
  # determine whether the cluster is preempted or failed.
@@ -394,14 +577,18 @@ class JobsController:
  in job_lib.JobStatus.user_code_failure_states() or
  job_status == job_lib.JobStatus.FAILED_DRIVER):
  # The user code has probably crashed, fail immediately.
- end_time = managed_job_utils.try_to_get_job_end_time(
+ logger.info(
+ f'Task {task_id} failed with status: {job_status}')
+ end_time = await context_utils.to_thread(
+ managed_job_utils.try_to_get_job_end_time,
  self._backend, cluster_name, job_id_on_pool_cluster)
  logger.info(
  f'The user job failed ({job_status}). Please check the '
  'logs below.\n'
  f'== Logs of the user job (ID: {self._job_id}) ==\n')
 
- self._download_log_and_stream(task_id, handle,
+ await context_utils.to_thread(self._download_log_and_stream,
+ task_id, handle,
  job_id_on_pool_cluster)
 
  failure_reason = (
@@ -438,7 +625,9 @@ class JobsController:
  f'[{self._strategy_executor.restart_cnt_on_failure}'
  f'/{max_restarts}]')
  else:
- managed_job_state.set_failed(
+ logger.info(
+ f'Task {task_id} failed and will not be retried')
+ await managed_job_state.set_failed_async(
  self._job_id,
  task_id,
  failure_type=managed_job_status,
@@ -453,7 +642,7 @@ class JobsController:
  failure_reason = (
  f'Unknown job status {job_status}. To see the details, '
  f'run: sky jobs logs --controller {self._job_id}')
- managed_job_state.set_failed(
+ await managed_job_state.set_failed_async(
  self._job_id,
  task_id,
  failure_type=managed_job_state.ManagedJobStatus.
@@ -489,84 +678,115 @@ class JobsController:
  # those clusters again may fail.
  logger.info('Cleaning up the preempted or failed cluster'
  '...')
- self._cleanup_cluster(cluster_name)
+ await self._cleanup_cluster(cluster_name)
 
  # Try to recover the managed jobs, when the cluster is preempted or
  # failed or the job status is failed to be fetched.
- managed_job_state.set_recovering(
+ logger.info(f'Starting recovery for task {task_id}, '
+ f'it is currently {job_status}')
+ await managed_job_state.set_recovering_async(
  job_id=self._job_id,
  task_id=task_id,
  force_transit_to_recovering=force_transit_to_recovering,
  callback_func=callback_func)
- recovered_time = self._strategy_executor.recover()
+
+ recovered_time = await self._strategy_executor.recover()
+
  if self._pool is not None:
  cluster_name, job_id_on_pool_cluster = (
- managed_job_state.get_pool_submit_info(self._job_id))
+ await
+ managed_job_state.get_pool_submit_info_async(self._job_id))
  assert cluster_name is not None
- managed_job_state.set_recovered(self._job_id,
- task_id,
- recovered_time=recovered_time,
- callback_func=callback_func)
+ await managed_job_state.set_recovered_async(
+ self._job_id,
+ task_id,
+ recovered_time=recovered_time,
+ callback_func=callback_func)
 
- def run(self):
+ async def run(self):
  """Run controller logic and handle exceptions."""
+ logger.info(f'Starting JobsController run for job {self._job_id}')
  task_id = 0
+ cancelled = False
+
  try:
  succeeded = True
  # We support chain DAGs only for now.
  for task_id, task in enumerate(self._dag.tasks):
- succeeded = self._run_one_task(task_id, task)
+ logger.info(
+ f'Processing task {task_id}/{len(self._dag.tasks)-1}: '
+ f'{task.name}')
+ task_start = time.time()
+ succeeded = await self._run_one_task(task_id, task)
+ task_time = time.time() - task_start
+ logger.info(f'Task {task_id} completed in {task_time:.2f}s '
+ f'with success={succeeded}')
+
  if not succeeded:
+ logger.info(f'Task {task_id} failed, stopping execution')
  break
+
  except exceptions.ProvisionPrechecksError as e:
  # Please refer to the docstring of self._run for the cases when
  # this exception can occur.
+ logger.error(f'Provision prechecks failed for task {task_id}')
  failure_reason = ('; '.join(
  common_utils.format_exception(reason, use_bracket=True)
  for reason in e.reasons))
  logger.error(failure_reason)
- self._update_failed_task_state(
+ await self._update_failed_task_state(
  task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
  failure_reason)
  except exceptions.ManagedJobReachedMaxRetriesError as e:
  # Please refer to the docstring of self._run for the cases when
  # this exception can occur.
+ logger.error(f'Managed job reached max retries for task {task_id}')
  failure_reason = common_utils.format_exception(e)
  logger.error(failure_reason)
  # The managed job should be marked as FAILED_NO_RESOURCE, as the
  # managed job may be able to launch next time.
- self._update_failed_task_state(
+ await self._update_failed_task_state(
  task_id, managed_job_state.ManagedJobStatus.FAILED_NO_RESOURCE,
  failure_reason)
+ except asyncio.CancelledError: # pylint: disable=try-except-raise
+ # have this here to avoid getting caught by the general except block
+ # below.
+ cancelled = True
+ raise
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
+ logger.error(
+ f'Unexpected error in JobsController run for task {task_id}')
  with ux_utils.enable_traceback():
  logger.error(traceback.format_exc())
  msg = ('Unexpected error occurred: ' +
  common_utils.format_exception(e, use_bracket=True))
  logger.error(msg)
- self._update_failed_task_state(
+ await self._update_failed_task_state(
  task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
  msg)
  finally:
- # This will set all unfinished tasks to CANCELLING, and will not
- # affect the jobs in terminal states.
- # We need to call set_cancelling before set_cancelled to make sure
- # the table entries are correctly set.
  callback_func = managed_job_utils.event_callback_func(
  job_id=self._job_id,
  task_id=task_id,
  task=self._dag.tasks[task_id])
- managed_job_state.set_cancelling(job_id=self._job_id,
- callback_func=callback_func)
- managed_job_state.set_cancelled(job_id=self._job_id,
- callback_func=callback_func)
+ await managed_job_state.set_cancelling_async(
+ job_id=self._job_id, callback_func=callback_func)
+ if not cancelled:
+ # the others haven't been run yet so we can set them to
+ # cancelled immediately (no resources to clean up).
+ # if we are running and get cancelled, we need to clean up the
+ # resources first so this will be done later.
+ await managed_job_state.set_cancelled_async(
+ job_id=self._job_id, callback_func=callback_func)
 
- def _update_failed_task_state(
+ async def _update_failed_task_state(
  self, task_id: int,
  failure_type: managed_job_state.ManagedJobStatus,
  failure_reason: str):
  """Update the state of the failed task."""
- managed_job_state.set_failed(
+ logger.info(f'Updating failed task state: task_id={task_id}, '
+ f'failure_type={failure_type}')
+ await managed_job_state.set_failed_async(
  self._job_id,
  task_id=task_id,
  failure_type=failure_type,
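Editor's note: the task_cleanup helper introduced in the next hunk keeps going when an individual cleanup step fails, remembers the error, and re-raises it only after every step has been attempted. A generic, hedged sketch of that best-effort pattern (the step names are illustrative, not SkyPilot APIs):

from typing import Callable, List, Optional


def best_effort_cleanup(steps: List[Callable[[], None]]) -> None:
    """Attempt every step; remember a failure and re-raise it at the end."""
    error: Optional[BaseException] = None
    for step in steps:
        try:
            step()
        except Exception as e:  # pylint: disable=broad-except
            error = e  # keep cleaning up whatever else we can
    if error is not None:
        raise error


best_effort_cleanup([
    lambda: print('terminate cluster'),
    lambda: print('teardown ephemeral storage'),
    lambda: print('remove local file mounts'),
])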
@@ -577,199 +797,421 @@ class JobsController:
577
797
  task=self._dag.tasks[task_id]))
578
798
 
579
799
 
580
- def _run_controller(job_id: int, dag_yaml: str, pool: Optional[str]):
581
- """Runs the controller in a remote process for interruption."""
582
- # The controller needs to be instantiated in the remote process, since
583
- # the controller is not serializable.
584
- jobs_controller = JobsController(job_id, dag_yaml, pool)
585
- jobs_controller.run()
586
-
587
-
588
- def _handle_signal(job_id):
589
- """Handle the signal if the user sent it."""
590
- signal_file = pathlib.Path(
591
- managed_job_utils.SIGNAL_FILE_PREFIX.format(job_id))
592
- user_signal = None
593
- if signal_file.exists():
594
- # Filelock is needed to prevent race condition with concurrent
595
- # signal writing.
596
- with filelock.FileLock(str(signal_file) + '.lock'):
597
- with signal_file.open(mode='r', encoding='utf-8') as f:
598
- user_signal = f.read().strip()
599
- try:
600
- user_signal = managed_job_utils.UserSignal(user_signal)
601
- except ValueError:
602
- logger.warning(
603
- f'Unknown signal received: {user_signal}. Ignoring.')
604
- user_signal = None
605
- # Remove the signal file, after reading the signal.
606
- signal_file.unlink()
607
- if user_signal is None:
608
- # None or empty string.
609
- return
610
- assert user_signal == managed_job_utils.UserSignal.CANCEL, (
611
- f'Only cancel signal is supported, but {user_signal} got.')
612
- raise exceptions.ManagedJobUserCancelledError(
613
- f'User sent {user_signal.value} signal.')
614
-
615
-
616
- def _cleanup(job_id: int, dag_yaml: str, pool: Optional[str]):
617
- """Clean up the cluster(s) and storages.
618
-
619
- (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
620
- to be cleaned up after the whole job is finished, as the tasks
621
- may share the same storage.
622
- (2) Clean up the cluster(s) that are not cleaned up yet, which can happen
623
- when the task failed or cancelled. At most one cluster should be left
624
- when reaching here, as we currently only support chain DAGs, and only
625
- task is executed at a time.
800
+ class ControllerManager:
801
+ """Main loop for a job controller process.
802
+
803
+ Many jobs will be handled by this, each by a single JobController.
626
804
  """
627
- # Cleanup the HA recovery script first as it is possible that some error
628
- # was raised when we construct the task object (e.g.,
629
- # sky.exceptions.ResourcesUnavailableError).
630
- managed_job_state.remove_ha_recovery_script(job_id)
631
- dag, _ = _get_dag_and_name(dag_yaml)
632
- for task in dag.tasks:
633
- assert task.name is not None, task
634
- if pool is None:
635
- cluster_name = managed_job_utils.generate_managed_job_cluster_name(
636
- task.name, job_id)
637
- managed_job_utils.terminate_cluster(cluster_name)
638
- else:
639
- cluster_name, job_id_on_pool_cluster = (
640
- managed_job_state.get_pool_submit_info(job_id))
641
- if cluster_name is not None:
642
- if job_id_on_pool_cluster is not None:
643
- core.cancel(cluster_name=cluster_name,
644
- job_ids=[job_id_on_pool_cluster],
645
- _try_cancel_if_cluster_is_init=True)
646
-
647
- # Clean up Storages with persistent=False.
648
- # TODO(zhwu): this assumes the specific backend.
649
- backend = cloud_vm_ray_backend.CloudVmRayBackend()
650
- # Need to re-construct storage object in the controller process
651
- # because when SkyPilot API server machine sends the yaml config to the
652
- # controller machine, only storage metadata is sent, not the storage
653
- # object itself.
654
- for storage in task.storage_mounts.values():
655
- storage.construct()
656
- backend.teardown_ephemeral_storage(task)
657
-
658
- # Clean up any files mounted from the local disk, such as two-hop file
659
- # mounts.
660
- for file_mount in (task.file_mounts or {}).values():
805
+
806
+ def __init__(self, controller_uuid: str) -> None:
807
+ self._controller_uuid = controller_uuid
808
+ # Global state for active jobs
809
+ self.job_tasks: Dict[int, asyncio.Task] = {}
810
+ self.starting: Set[int] = set()
811
+
812
+ # Lock for synchronizing access to global state dictionary
813
+ # Must always hold _job_tasks_lock when accessing the _starting_signal.
814
+ self._job_tasks_lock = asyncio.Lock()
815
+ # We signal whenever a job leaves the api server launching state. Feel
816
+ # free to signal as much as you want to be safe from leaks (if you
817
+ # do not signal enough there may be some jobs forever waiting to
818
+ # launch).
819
+ self._starting_signal = asyncio.Condition(lock=self._job_tasks_lock)
820
+
821
+ self._pid = os.getpid()
822
+ self._pid_started_at = psutil.Process(self._pid).create_time()
823
+
824
+ async def _cleanup(self, job_id: int, pool: Optional[str] = None):
825
+ """Clean up the cluster(s) and storages.
826
+
827
+ (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
828
+ to be cleaned up after the whole job is finished, as the tasks
829
+ may share the same storage.
830
+ (2) Clean up the cluster(s) that are not cleaned up yet, which can
831
+ happen when the task failed or cancelled. At most one cluster
832
+ should be left when reaching here, as we currently only support
833
+ chain DAGs, and only one task is executed at a time.
834
+ """
835
+ # Cleanup the HA recovery script first as it is possible that some error
836
+ # was raised when we construct the task object (e.g.,
837
+ # sky.exceptions.ResourcesUnavailableError).
838
+ await managed_job_state.remove_ha_recovery_script_async(job_id)
839
+
840
+ def task_cleanup(task: 'sky.Task', job_id: int):
841
+ assert task.name is not None, task
842
+ error = None
843
+
661
844
  try:
662
- # For consolidation mode, there is no two-hop file mounts
663
- # and the file path here represents the real user data.
664
- # We skip the cleanup for consolidation mode.
665
- if (not data_utils.is_cloud_store_url(file_mount) and
666
- not managed_job_utils.is_consolidation_mode()):
667
- path = os.path.expanduser(file_mount)
668
- if os.path.isdir(path):
669
- shutil.rmtree(path)
670
- else:
671
- os.remove(path)
845
+ if pool is None:
846
+ cluster_name = (
847
+ managed_job_utils.generate_managed_job_cluster_name(
848
+ task.name, job_id))
849
+ managed_job_utils.terminate_cluster(cluster_name)
850
+ status = core.status(cluster_names=[cluster_name],
851
+ all_users=True)
852
+ assert (len(status) == 0 or
853
+ status[0]['status'] == sky.ClusterStatus.STOPPED), (
854
+ f'{cluster_name} is not down: {status}')
855
+ logger.info(f'{cluster_name} is down')
856
+ else:
857
+ cluster_name, job_id_on_pool_cluster = (
858
+ managed_job_state.get_pool_submit_info(job_id))
859
+ if cluster_name is not None:
860
+ if job_id_on_pool_cluster is not None:
861
+ core.cancel(cluster_name=cluster_name,
862
+ job_ids=[job_id_on_pool_cluster],
863
+ _try_cancel_if_cluster_is_init=True)
672
864
  except Exception as e: # pylint: disable=broad-except
865
+ error = e
866
+ logger.warning(
867
+ f'Failed to terminate cluster {cluster_name}: {e}')
868
+ # we continue to try cleaning up whatever else we can.
869
+ # Clean up Storages with persistent=False.
870
+ # TODO(zhwu): this assumes the specific backend.
871
+ backend = cloud_vm_ray_backend.CloudVmRayBackend()
872
+ # We need to re-construct the storage object in the controller process
873
+ # because when the SkyPilot API server sends the YAML config to the
874
+ # controller machine, only the storage metadata is sent, not the
875
+ # storage object itself.
876
+ try:
877
+ for storage in task.storage_mounts.values():
878
+ storage.construct()
879
+ except (exceptions.StorageSpecError, exceptions.StorageError) as e:
673
880
  logger.warning(
674
- f'Failed to clean up file mount {file_mount}: {e}')
881
+ f'Failed to construct storage object for teardown: {e}\n'
882
+ 'This may happen because storage construction already '
883
+ 'failed during launch, storage was deleted externally, '
884
+ 'credentials expired/changed, or network connectivity '
885
+ 'issues.')
886
+ try:
887
+ backend.teardown_ephemeral_storage(task)
888
+ except Exception as e: # pylint: disable=broad-except
889
+ error = e
890
+ logger.warning(f'Failed to teardown ephemeral storage: {e}')
891
+ # we continue to try cleaning up whatever else we can.
892
+
893
+ # Clean up any files mounted from the local disk, such as two-hop
894
+ # file mounts.
895
+ for file_mount in (task.file_mounts or {}).values():
896
+ try:
897
+ # In consolidation mode, there are no two-hop file mounts and the
898
+ # file path here refers to the real user data, so we skip the
899
+ # cleanup in that case.
900
+ if (not data_utils.is_cloud_store_url(file_mount) and
901
+ not managed_job_utils.is_consolidation_mode()):
902
+ path = os.path.expanduser(file_mount)
903
+ if os.path.isdir(path):
904
+ shutil.rmtree(path)
905
+ else:
906
+ os.remove(path)
907
+ except Exception as e: # pylint: disable=broad-except
908
+ logger.warning(
909
+ f'Failed to clean up file mount {file_mount}: {e}')
675
910
 
911
+ if error is not None:
912
+ raise error
676
913
 
677
- def start(job_id, dag_yaml, pool):
678
- """Start the controller."""
679
- controller_process = None
680
- cancelling = False
681
- task_id = None
682
- try:
683
- _handle_signal(job_id)
684
- # TODO(suquark): In theory, we should make controller process a
685
- # daemon process so it will be killed after this process exits,
686
- # however daemon process cannot launch subprocesses, explained here:
687
- # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.Process.daemon # pylint: disable=line-too-long
688
- # So we can only enable daemon after we no longer need to
689
- # start daemon processes like Ray.
690
- controller_process = multiprocessing.Process(target=_run_controller,
691
- args=(job_id, dag_yaml,
692
- pool))
693
- controller_process.start()
694
- while controller_process.is_alive():
695
- _handle_signal(job_id)
696
- time.sleep(1)
697
- except exceptions.ManagedJobUserCancelledError:
698
- dag, _ = _get_dag_and_name(dag_yaml)
699
- task_id, _ = managed_job_state.get_latest_task_id_status(job_id)
700
- assert task_id is not None, job_id
701
- logger.info(
702
- f'Cancelling managed job, job_id: {job_id}, task_id: {task_id}')
703
- managed_job_state.set_cancelling(
704
- job_id=job_id,
705
- callback_func=managed_job_utils.event_callback_func(
706
- job_id=job_id, task_id=task_id, task=dag.tasks[task_id]))
707
- cancelling = True
708
- finally:
709
- if controller_process is not None:
710
- logger.info(f'Killing controller process {controller_process.pid}.')
711
- # NOTE: it is ok to kill or join a killed process.
712
- # Kill the controller process first; if its child process is
713
- # killed first, then the controller process will raise errors.
714
- # Kill any possible remaining children processes recursively.
715
- subprocess_utils.kill_children_processes(
716
- parent_pids=[controller_process.pid], force=True)
717
- controller_process.join()
718
- logger.info(f'Controller process {controller_process.pid} killed.')
719
-
720
- logger.info(f'Cleaning up any cluster for job {job_id}.')
721
- # NOTE: Originally, we send an interruption signal to the controller
722
- # process and the controller process handles cleanup. However, we
723
- # figure out the behavior differs from cloud to cloud
724
- # (e.g., GCP ignores 'SIGINT'). A possible explanation is
725
- # https://unix.stackexchange.com/questions/356408/strange-problem-with-trap-and-sigint
726
- # But anyway, a clean solution is killing the controller process
727
- # directly, and then cleanup the cluster job_state.
728
- _cleanup(job_id, dag_yaml=dag_yaml, pool=pool)
729
- logger.info(f'Cluster of managed job {job_id} has been cleaned up.')
730
-
731
- if cancelling:
732
- assert task_id is not None, job_id # Since it's set with cancelling
733
- managed_job_state.set_cancelled(
914
+ dag = _get_dag(job_id)
915
+ error = None
916
+ for task in dag.tasks:
917
+ # Most of the work in this function is blocking, so run it in a thread.
918
+ try:
919
+ await context_utils.to_thread(task_cleanup, task, job_id)
920
+ except Exception as e: # pylint: disable=broad-except
921
+ error = e
922
+
923
+ if error is not None:
924
+ # We only raise the last error that occurred; it's fine to lose the
925
+ # earlier errors here.
926
+ raise error
927
+
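_cleanup above attempts every cleanup step even when earlier ones fail, runs the blocking work in a thread, and re-raises only the last error. A condensed sketch of that shape, using the standard asyncio.to_thread and placeholder step callables (not the actual SkyPilot helpers):

    import asyncio
    from typing import Callable, List, Optional

    async def best_effort_cleanup(steps: List[Callable[[], None]]) -> None:
        """Attempt every blocking cleanup step; re-raise only the last failure."""
        error: Optional[BaseException] = None
        for step in steps:
            try:
                # Terminating clusters, tearing down storage, and removing
                # local mounts all block, so keep the event loop responsive.
                await asyncio.to_thread(step)
            except Exception as e:  # pylint: disable=broad-except
                error = e
        if error is not None:
            raise error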
928
+ # Use context.contextual to enable per-job output redirection and env var
929
+ # isolation.
930
+ @context.contextual_async
931
+ async def run_job_loop(self,
932
+ job_id: int,
933
+ log_file: str,
934
+ pool: Optional[str] = None):
935
+ """Background task that runs the job loop."""
936
+ ctx = context.get()
937
+ assert ctx is not None, 'Context is not initialized'
938
+ ctx.redirect_log(pathlib.Path(log_file))
939
+
940
+ logger.info(f'Starting job loop for {job_id}')
941
+ logger.info(f' log_file={log_file}')
942
+ logger.info(f' pool={pool}')
943
+ logger.info(f'From controller {self._controller_uuid}')
944
+ logger.info(f' pid={self._pid}')
945
+
946
+ env_content = file_content_utils.get_job_env_content(job_id)
947
+ if env_content:
948
+ try:
949
+ env_vars = dotenv.dotenv_values(stream=io.StringIO(env_content))
950
+ logger.info('Loading %d environment variables for job %s',
951
+ len(env_vars), job_id)
952
+ if ctx is not None:
953
+ for key, value in env_vars.items():
954
+ if value is not None:
955
+ ctx.override_envs({key: value})
956
+ logger.debug('Set environment variable: %s=%s', key,
957
+ value)
958
+
959
+ # Restore config file if needed
960
+ file_content_utils.restore_job_config_file(job_id)
961
+
962
+ skypilot_config.reload_config()
963
+ else: # pragma: no cover - defensive
964
+ logger.error('Context is None, cannot set environment '
965
+ 'variables')
966
+ except Exception as e: # pylint: disable=broad-except
967
+ logger.error(
968
+ 'Failed to load environment variables for job %s: '
969
+ '%s', job_id, e)
970
+
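The env-restoration block above parses dotenv-formatted text stored for the job rather than a .env file on disk; python-dotenv accepts an in-memory stream for this. A small illustration with made-up variable content:

    import io

    import dotenv

    env_content = 'MY_VAR=hello\nANOTHER_VAR=world\n'  # stand-in for stored content
    env_vars = dotenv.dotenv_values(stream=io.StringIO(env_content))
    assert env_vars == {'MY_VAR': 'hello', 'ANOTHER_VAR': 'world'}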
971
+ cancelling = False
972
+ try:
973
+ controller = JobController(job_id, self.starting,
974
+ self._job_tasks_lock,
975
+ self._starting_signal, pool)
976
+
977
+ async with self._job_tasks_lock:
978
+ if job_id in self.job_tasks:
979
+ logger.error(f'Job {job_id} already exists in job_tasks')
980
+ raise ValueError(f'Job {job_id} already exists')
981
+
982
+ # Create the task and store it. asyncio.create_task returns
983
+ # immediately, so the job loop starts running in the background;
984
+ # we await its completion below.
985
+ task = asyncio.create_task(controller.run())
986
+ self.job_tasks[job_id] = task
987
+ await task
988
+ except asyncio.CancelledError:
989
+ logger.info(f'Job {job_id} was cancelled')
990
+ dag = _get_dag(job_id)
991
+ task_id, _ = await (
992
+ managed_job_state.get_latest_task_id_status_async(job_id))
993
+ assert task_id is not None, job_id
994
+ logger.info(f'Cancelling managed job, job_id: {job_id}, '
995
+ f'task_id: {task_id}')
996
+ await managed_job_state.set_cancelling_async(
734
997
  job_id=job_id,
735
998
  callback_func=managed_job_utils.event_callback_func(
736
999
  job_id=job_id, task_id=task_id, task=dag.tasks[task_id]))
1000
+ cancelling = True
1001
+ raise
1002
+ except Exception as e:
1003
+ logger.error(f'Unexpected error in job loop for {job_id}: '
1004
+ f'{common_utils.format_exception(e)}')
1005
+ raise
1006
+ finally:
1007
+ try:
1008
+ await self._cleanup(job_id, pool=pool)
1009
+ logger.info(
1010
+ f'Cluster of managed job {job_id} has been cleaned up.')
1011
+ except Exception as e: # pylint: disable=broad-except
1012
+ failure_reason = ('Failed to clean up: '
1013
+ f'{common_utils.format_exception(e)}')
1014
+ await managed_job_state.set_failed_async(
1015
+ job_id,
1016
+ task_id=None,
1017
+ failure_type=managed_job_state.ManagedJobStatus.
1018
+ FAILED_CONTROLLER,
1019
+ failure_reason=failure_reason,
1020
+ override_terminal=True)
1021
+
1022
+ if cancelling:
1023
+ # Since it's set with cancelling
1024
+ assert task_id is not None, job_id
1025
+ await managed_job_state.set_cancelled_async(
1026
+ job_id=job_id,
1027
+ callback_func=managed_job_utils.event_callback_func(
1028
+ job_id=job_id, task_id=task_id,
1029
+ task=dag.tasks[task_id]))
1030
+
1031
+ # We should check job status after 'set_cancelled', otherwise
1032
+ # the job status is not terminal.
1033
+ job_status = await managed_job_state.get_status_async(job_id)
1034
+ assert job_status is not None
1035
+ # The job can be non-terminal if the controller exited abnormally,
1036
+ # e.g. failed to launch cluster after reaching the MAX_RETRY.
1037
+ if not job_status.is_terminal():
1038
+ logger.info(f'Previous job status: {job_status.value}')
1039
+ await managed_job_state.set_failed_async(
1040
+ job_id,
1041
+ task_id=None,
1042
+ failure_type=managed_job_state.ManagedJobStatus.
1043
+ FAILED_CONTROLLER,
1044
+ failure_reason=(
1045
+ 'Unexpected error occurred. For details, '
1046
+ f'run: sky jobs logs --controller {job_id}'))
1047
+
1048
+ await scheduler.job_done_async(job_id)
1049
+
1050
+ async with self._job_tasks_lock:
1051
+ try:
1052
+ # just in case we were cancelled or some other error
1053
+ # occurred during launch
1054
+ self.starting.remove(job_id)
1055
+ # its fine if we notify again, better to wake someone up
1056
+ # and have them go to sleep again, then have some stuck
1057
+ # sleeping.
1058
+ self._starting_signal.notify()
1059
+ except KeyError:
1060
+ pass
1061
+
1062
+ # Remove the job from the job_tasks dictionary.
1063
+ async with self._job_tasks_lock:
1064
+ if job_id in self.job_tasks:
1065
+ del self.job_tasks[job_id]
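The @context.contextual_async decorator and ctx.redirect_log call at the top of run_job_loop give each job its own log file and environment overrides without touching process-wide state. The sky context module is not shown in this hunk; the sketch below uses plain contextvars to illustrate the same per-task isolation idea, with hypothetical log paths.

    import asyncio
    import contextvars
    import pathlib

    # Each asyncio task sees its own value of this variable.
    _log_path: contextvars.ContextVar[pathlib.Path] = contextvars.ContextVar('log_path')

    def log(message: str) -> None:
        # Write to whichever file the current task's context points at.
        with _log_path.get().open('a', encoding='utf-8') as f:
            f.write(message + '\n')

    async def job(job_id: int) -> None:
        _log_path.set(pathlib.Path(f'/tmp/job-{job_id}.log'))  # hypothetical path
        log(f'job {job_id} started')

    async def main() -> None:
        # asyncio.gather wraps each coroutine in a task; every task copies
        # the current context, so one job's setting never leaks into another.
        await asyncio.gather(job(1), job(2))

    asyncio.run(main())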
737
1066
 
738
- # We should check job status after 'set_cancelled', otherwise
739
- # the job status is not terminal.
740
- job_status = managed_job_state.get_status(job_id)
741
- assert job_status is not None
742
- # The job can be non-terminal if the controller exited abnormally,
743
- # e.g. failed to launch cluster after reaching the MAX_RETRY.
744
- if not job_status.is_terminal():
745
- logger.info(f'Previous job status: {job_status.value}')
746
- managed_job_state.set_failed(
747
- job_id,
748
- task_id=None,
749
- failure_type=managed_job_state.ManagedJobStatus.
750
- FAILED_CONTROLLER,
751
- failure_reason=('Unexpected error occurred. For details, '
752
- f'run: sky jobs logs --controller {job_id}'))
753
-
754
- scheduler.job_done(job_id)
1067
+ async def start_job(
1068
+ self,
1069
+ job_id: int,
1070
+ pool: Optional[str] = None,
1071
+ ):
1072
+ """Start a new job.
1073
+
1074
+ Args:
1075
+ job_id: The ID of the job to start.
1076
+ pool: The pool to use for the job, if any.
+ """
1077
+ # Create log file path for job output redirection
1078
+ log_dir = os.path.expanduser(jobs_constants.JOBS_CONTROLLER_LOGS_DIR)
1079
+ os.makedirs(log_dir, exist_ok=True)
1080
+ log_file = os.path.join(log_dir, f'{job_id}.log')
1081
+
1082
+ logger.info(f'Starting job {job_id} with log_file={log_file}')
1083
+
1084
+ async with self._job_tasks_lock:
1085
+ self.starting.add(job_id)
1086
+ await create_background_task(self.run_job_loop(job_id, log_file, pool))
1087
+
1088
+ logger.info(f'Job {job_id} started successfully')
1089
+
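start_job schedules run_job_loop through create_background_task, which is defined earlier in this file and not shown in this hunk. Assuming it follows the usual asyncio fire-and-forget pattern, it has to keep a strong reference to the task so the event loop does not garbage-collect it mid-run; a sketch of such a helper under that assumption:

    import asyncio
    from typing import Coroutine, Set

    _background_tasks: Set[asyncio.Task] = set()

    async def create_background_task(coro: Coroutine) -> asyncio.Task:
        """Schedule coro and keep a strong reference until it finishes."""
        task = asyncio.get_running_loop().create_task(coro)
        _background_tasks.add(task)
        # Drop the reference once the task completes.
        task.add_done_callback(_background_tasks.discard)
        return task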
1090
+ async def cancel_job(self):
1091
+ """Cancel an existing job."""
1092
+ while True:
1093
+ cancels = os.listdir(jobs_constants.CONSOLIDATED_SIGNAL_PATH)
1094
+ for cancel in cancels:
1095
+ async with self._job_tasks_lock:
1096
+ job_id = int(cancel)
1097
+ if job_id in self.job_tasks:
1098
+ logger.info(f'Cancelling job {job_id}')
1099
+
1100
+ task = self.job_tasks[job_id]
1101
+
1102
+ # Run the cancellation in the background, so we can
1103
+ # return immediately.
1104
+ task.cancel()
1105
+ logger.info(f'Job {job_id} cancelled successfully')
1106
+
1107
+ os.remove(f'{jobs_constants.CONSOLIDATED_SIGNAL_PATH}/'
1108
+ f'{job_id}')
1109
+ await asyncio.sleep(15)
1110
+
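cancel_job treats a file named after a job id inside CONSOLIDATED_SIGNAL_PATH as a cancellation request and deletes it once handled. The producer side is not shown in this hunk; the sketch below shows what filing such a request could look like, with a hypothetical signal directory standing in for the real constant:

    import pathlib

    SIGNAL_DIR = pathlib.Path('/tmp/skypilot-cancel-signals')  # hypothetical path

    def request_cancel(job_id: int) -> None:
        """Drop a marker file; the polling loop above notices it within ~15s."""
        SIGNAL_DIR.mkdir(parents=True, exist_ok=True)
        (SIGNAL_DIR / str(job_id)).touch()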
1111
+ async def monitor_loop(self):
1112
+ """Monitor the job loop."""
1113
+ logger.info(f'Starting monitor loop for pid {self._pid}...')
1114
+
1115
+ while True:
1116
+ async with self._job_tasks_lock:
1117
+ running_tasks = [
1118
+ task for task in self.job_tasks.values() if not task.done()
1119
+ ]
1120
+
1121
+ async with self._job_tasks_lock:
1122
+ starting_count = len(self.starting)
1123
+
1124
+ if starting_count >= controller_utils.LAUNCHES_PER_WORKER:
1125
+ # Launching a job takes around 1 minute, so let's wait half that
1126
+ # time.
1127
+ await asyncio.sleep(30)
1128
+ continue
1129
+
1130
+ # Normally, up to 200 jobs can run on each controller. But if there
1131
+ # are many controllers, we need to cap the number of jobs that can
1132
+ # run on each controller so that the total across all controllers
1133
+ # stays at 2000.
1134
+ max_jobs = min(controller_utils.MAX_JOBS_PER_WORKER,
1135
+ (controller_utils.MAX_TOTAL_RUNNING_JOBS //
1136
+ controller_utils.get_number_of_jobs_controllers()))
1137
+
1138
+ if len(running_tasks) >= max_jobs:
1139
+ logger.info('Too many jobs running, waiting for 60 seconds')
1140
+ await asyncio.sleep(60)
1141
+ continue
1142
+
1143
+ # Check if there are any jobs that are waiting to launch
1144
+ try:
1145
+ waiting_job = await managed_job_state.get_waiting_job_async(
1146
+ pid=self._pid, pid_started_at=self._pid_started_at)
1147
+ except Exception as e: # pylint: disable=broad-except
1148
+ logger.error(f'Failed to get waiting job: {e}')
1149
+ await asyncio.sleep(5)
1150
+ continue
1151
+
1152
+ if waiting_job is None:
1153
+ logger.info('No waiting job, waiting for 10 seconds')
1154
+ await asyncio.sleep(10)
1155
+ continue
1156
+
1157
+ logger.info(f'Claiming job {waiting_job["job_id"]}')
1158
+ job_id = waiting_job['job_id']
1159
+ pool = waiting_job.get('pool', None)
1160
+
1161
+ cancels = os.listdir(jobs_constants.CONSOLIDATED_SIGNAL_PATH)
1162
+ if str(job_id) in cancels:
1163
+ status = await managed_job_state.get_status_async(job_id)
1164
+ if status == managed_job_state.ManagedJobStatus.PENDING:
1165
+ logger.info(f'Job {job_id} cancelled')
1166
+ os.remove(f'{jobs_constants.CONSOLIDATED_SIGNAL_PATH}/'
1167
+ f'{job_id}')
1168
+ await managed_job_state.set_cancelling_async(
1169
+ job_id=job_id,
1170
+ callback_func=managed_job_utils.event_callback_func(
1171
+ job_id=job_id, task_id=None, task=None))
1172
+ await managed_job_state.set_cancelled_async(
1173
+ job_id=job_id,
1174
+ callback_func=managed_job_utils.event_callback_func(
1175
+ job_id=job_id, task_id=None, task=None))
1176
+ continue
1177
+
1178
+ await self.start_job(job_id, pool)
1179
+
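To make the per-controller cap in monitor_loop concrete, here is the same arithmetic with the values mentioned in the comment (200 jobs per worker, 2000 total) and a hypothetical fleet of 20 controllers:

    MAX_JOBS_PER_WORKER = 200      # per the comment above
    MAX_TOTAL_RUNNING_JOBS = 2000  # per the comment above
    num_controllers = 20           # hypothetical deployment size

    max_jobs = min(MAX_JOBS_PER_WORKER,
                   MAX_TOTAL_RUNNING_JOBS // num_controllers)
    # min(200, 2000 // 20) == min(200, 100) == 100 jobs per controller.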
1180
+
1181
+ async def main(controller_uuid: str):
1182
+ logger.info(f'Starting controller {controller_uuid}')
1183
+
1184
+ context_utils.hijack_sys_attrs()
1185
+
1186
+ controller = ControllerManager(controller_uuid)
1187
+
1188
+ # This may run multiple times; that is harmless.
1189
+ os.makedirs(jobs_constants.CONSOLIDATED_SIGNAL_PATH, exist_ok=True)
1190
+
1191
+ # Increase the number of files we can open.
1192
+ soft = hard = None
1193
+ try:
1194
+ soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
1195
+ logger.info(f'Current rlimits for NOFILE: soft={soft}, hard={hard}')
1196
+ logger.info(f'Increasing soft limit to {hard}')
1197
+ resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
1198
+ except OSError as e:
1199
+ logger.warning(f'Failed to increase number of files we can open: {e}\n'
1200
+ f'Current soft limit: {soft}, hard limit: {hard}')
1201
+
1202
+ # These loop forever; run them as background tasks.
1203
+ cancel_job_task = asyncio.create_task(controller.cancel_job())
1204
+ monitor_loop_task = asyncio.create_task(controller.monitor_loop())
1205
+ # Run the garbage collector in a dedicated daemon thread to avoid affecting
1206
+ # the main event loop.
1207
+ gc_thread = threading.Thread(target=log_gc.elect_for_log_gc, daemon=True)
1208
+ gc_thread.start()
1209
+ try:
1210
+ await asyncio.gather(cancel_job_task, monitor_loop_task)
1211
+ except Exception as e: # pylint: disable=broad-except
1212
+ logger.error(f'Controller server crashed: {e}')
1213
+ sys.exit(1)
755
1214
 
756
1215
 
757
1216
  if __name__ == '__main__':
758
- parser = argparse.ArgumentParser()
759
- parser.add_argument('--job-id',
760
- required=True,
761
- type=int,
762
- help='Job id for the controller job.')
763
- parser.add_argument('dag_yaml',
764
- type=str,
765
- help='The path to the user job yaml file.')
766
- parser.add_argument('--pool',
767
- required=False,
768
- default=None,
769
- type=str,
770
- help='The pool to use for the controller job.')
771
- args = parser.parse_args()
772
- # We start process with 'spawn', because 'fork' could result in weird
773
- # behaviors; 'spawn' is also cross-platform.
774
- multiprocessing.set_start_method('spawn', force=True)
775
- start(args.job_id, args.dag_yaml, args.pool)
1217
+ asyncio.run(main(sys.argv[1]))