@team-agent/installer 0.2.11 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (326) hide show
  1. package/Cargo.lock +744 -0
  2. package/Cargo.toml +34 -0
  3. package/crates/team-agent/Cargo.toml +33 -0
  4. package/crates/team-agent/src/cli/adapters.rs +1343 -0
  5. package/crates/team-agent/src/cli/diagnose.rs +554 -0
  6. package/crates/team-agent/src/cli/emit.rs +1077 -0
  7. package/crates/team-agent/src/cli/helpers.rs +88 -0
  8. package/crates/team-agent/src/cli/leader.rs +216 -0
  9. package/crates/team-agent/src/cli/mod.rs +1141 -0
  10. package/crates/team-agent/src/cli/profile.rs +306 -0
  11. package/crates/team-agent/src/cli/send.rs +215 -0
  12. package/crates/team-agent/src/cli/status.rs +179 -0
  13. package/crates/team-agent/src/cli/status_port.rs +502 -0
  14. package/crates/team-agent/src/cli/tests/base.rs +616 -0
  15. package/crates/team-agent/src/cli/tests/compile.rs +96 -0
  16. package/crates/team-agent/src/cli/tests/divergence.rs +509 -0
  17. package/crates/team-agent/src/cli/tests/lane_c.rs +333 -0
  18. package/crates/team-agent/src/cli/tests/leader_watch.rs +395 -0
  19. package/crates/team-agent/src/cli/tests/main_preserved.rs +675 -0
  20. package/crates/team-agent/src/cli/tests/missing_subcommands.rs +390 -0
  21. package/crates/team-agent/src/cli/tests/mod.rs +97 -0
  22. package/crates/team-agent/src/cli/tests/peer_allow.rs +137 -0
  23. package/crates/team-agent/src/cli/tests/repair_state_byte_lock.rs +302 -0
  24. package/crates/team-agent/src/cli/tests/run_delegation.rs +305 -0
  25. package/crates/team-agent/src/cli/tests/status_send.rs +385 -0
  26. package/crates/team-agent/src/cli/tests/verb_profile.rs +182 -0
  27. package/crates/team-agent/src/cli/tests/verb_settle.rs +236 -0
  28. package/crates/team-agent/src/cli/tests/verb_validate.rs +184 -0
  29. package/crates/team-agent/src/cli/types.rs +605 -0
  30. package/crates/team-agent/src/compiler/tests.rs +701 -0
  31. package/crates/team-agent/src/compiler.rs +489 -0
  32. package/crates/team-agent/src/coordinator/backoff.rs +153 -0
  33. package/crates/team-agent/src/coordinator/health.rs +436 -0
  34. package/crates/team-agent/src/coordinator/mod.rs +80 -0
  35. package/crates/team-agent/src/coordinator/orphan.rs +179 -0
  36. package/crates/team-agent/src/coordinator/tests/abnormal.rs +255 -0
  37. package/crates/team-agent/src/coordinator/tests/basics.rs +262 -0
  38. package/crates/team-agent/src/coordinator/tests/daemon.rs +323 -0
  39. package/crates/team-agent/src/coordinator/tests/health_sync.rs +263 -0
  40. package/crates/team-agent/src/coordinator/tests/main_preserved.rs +136 -0
  41. package/crates/team-agent/src/coordinator/tests/mod.rs +310 -0
  42. package/crates/team-agent/src/coordinator/tests/spine.rs +261 -0
  43. package/crates/team-agent/src/coordinator/tests/takeover.rs +227 -0
  44. package/crates/team-agent/src/coordinator/tests/tick_core.rs +256 -0
  45. package/crates/team-agent/src/coordinator/tests/watch.rs +167 -0
  46. package/crates/team-agent/src/coordinator/tick.rs +2032 -0
  47. package/crates/team-agent/src/coordinator/types.rs +584 -0
  48. package/crates/team-agent/src/db/migration.rs +716 -0
  49. package/crates/team-agent/src/db/mod.rs +23 -0
  50. package/crates/team-agent/src/db/schema.rs +378 -0
  51. package/crates/team-agent/src/event_log.rs +375 -0
  52. package/crates/team-agent/src/fake_worker.rs +253 -0
  53. package/crates/team-agent/src/leader/helpers.rs +190 -0
  54. package/crates/team-agent/src/leader/inject.rs +33 -0
  55. package/crates/team-agent/src/leader/lease.rs +1063 -0
  56. package/crates/team-agent/src/leader/mod.rs +99 -0
  57. package/crates/team-agent/src/leader/owner_bind.rs +292 -0
  58. package/crates/team-agent/src/leader/rediscover/tests.rs +525 -0
  59. package/crates/team-agent/src/leader/rediscover.rs +1099 -0
  60. package/crates/team-agent/src/leader/start.rs +273 -0
  61. package/crates/team-agent/src/leader/takeover.rs +235 -0
  62. package/crates/team-agent/src/leader/tests/basics.rs +183 -0
  63. package/crates/team-agent/src/leader/tests/byte_findings.rs +234 -0
  64. package/crates/team-agent/src/leader/tests/identity.rs +206 -0
  65. package/crates/team-agent/src/leader/tests/idle.rs +271 -0
  66. package/crates/team-agent/src/leader/tests/lease_api.rs +225 -0
  67. package/crates/team-agent/src/leader/tests/lease_claim.rs +253 -0
  68. package/crates/team-agent/src/leader/tests/mod.rs +125 -0
  69. package/crates/team-agent/src/leader/tests/rediscover.rs +351 -0
  70. package/crates/team-agent/src/leader/tests/wake_start_owner.rs +204 -0
  71. package/crates/team-agent/src/leader/types.rs +487 -0
  72. package/crates/team-agent/src/lib.rs +85 -0
  73. package/crates/team-agent/src/lifecycle/display.rs +228 -0
  74. package/crates/team-agent/src/lifecycle/helpers.rs +112 -0
  75. package/crates/team-agent/src/lifecycle/launch/plan.rs +227 -0
  76. package/crates/team-agent/src/lifecycle/launch.rs +1833 -0
  77. package/crates/team-agent/src/lifecycle/mod.rs +62 -0
  78. package/crates/team-agent/src/lifecycle/restart/agent.rs +533 -0
  79. package/crates/team-agent/src/lifecycle/restart/common.rs +517 -0
  80. package/crates/team-agent/src/lifecycle/restart/orchestrator.rs +41 -0
  81. package/crates/team-agent/src/lifecycle/restart/rebuild.rs +268 -0
  82. package/crates/team-agent/src/lifecycle/restart/remove.rs +780 -0
  83. package/crates/team-agent/src/lifecycle/restart/selection.rs +208 -0
  84. package/crates/team-agent/src/lifecycle/restart/team_state.rs +242 -0
  85. package/crates/team-agent/src/lifecycle/restart.rs +76 -0
  86. package/crates/team-agent/src/lifecycle/tests/agent_ops.rs +455 -0
  87. package/crates/team-agent/src/lifecycle/tests/core.rs +989 -0
  88. package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +583 -0
  89. package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +933 -0
  90. package/crates/team-agent/src/lifecycle/tests/main_preserved.rs +265 -0
  91. package/crates/team-agent/src/lifecycle/tests.rs +27 -0
  92. package/crates/team-agent/src/lifecycle/types.rs +685 -0
  93. package/crates/team-agent/src/main.rs +41 -0
  94. package/crates/team-agent/src/mcp_server/helpers.rs +228 -0
  95. package/crates/team-agent/src/mcp_server/mod.rs +183 -0
  96. package/crates/team-agent/src/mcp_server/normalize.rs +312 -0
  97. package/crates/team-agent/src/mcp_server/tests/golden.rs +283 -0
  98. package/crates/team-agent/src/mcp_server/tests/normalize.rs +244 -0
  99. package/crates/team-agent/src/mcp_server/tests/scoped.rs +189 -0
  100. package/crates/team-agent/src/mcp_server/tests/send.rs +222 -0
  101. package/crates/team-agent/src/mcp_server/tests/tools.rs +158 -0
  102. package/crates/team-agent/src/mcp_server/tests/wire.rs +159 -0
  103. package/crates/team-agent/src/mcp_server/tests.rs +38 -0
  104. package/crates/team-agent/src/mcp_server/tools.rs +603 -0
  105. package/crates/team-agent/src/mcp_server/types.rs +421 -0
  106. package/crates/team-agent/src/mcp_server/wire.rs +388 -0
  107. package/crates/team-agent/src/message_store.rs +767 -0
  108. package/crates/team-agent/src/messaging/activity.rs +433 -0
  109. package/crates/team-agent/src/messaging/delivery.rs +542 -0
  110. package/crates/team-agent/src/messaging/helpers.rs +209 -0
  111. package/crates/team-agent/src/messaging/leader_receiver.rs +340 -0
  112. package/crates/team-agent/src/messaging/mod.rs +147 -0
  113. package/crates/team-agent/src/messaging/peers.rs +32 -0
  114. package/crates/team-agent/src/messaging/results.rs +537 -0
  115. package/crates/team-agent/src/messaging/scheduler.rs +344 -0
  116. package/crates/team-agent/src/messaging/selftest.rs +100 -0
  117. package/crates/team-agent/src/messaging/send.rs +582 -0
  118. package/crates/team-agent/src/messaging/tests/basic.rs +357 -0
  119. package/crates/team-agent/src/messaging/tests/main_preserved.rs +122 -0
  120. package/crates/team-agent/src/messaging/tests/mod.rs +293 -0
  121. package/crates/team-agent/src/messaging/tests/runtime.rs +1422 -0
  122. package/crates/team-agent/src/messaging/tests/spine.rs +437 -0
  123. package/crates/team-agent/src/messaging/trust.rs +192 -0
  124. package/crates/team-agent/src/messaging/types.rs +355 -0
  125. package/crates/team-agent/src/messaging/watchers.rs +591 -0
  126. package/crates/team-agent/src/model/enums.rs +311 -0
  127. package/crates/team-agent/src/model/errors.rs +17 -0
  128. package/crates/team-agent/src/model/ids.rs +155 -0
  129. package/crates/team-agent/src/model/mod.rs +22 -0
  130. package/crates/team-agent/src/model/paths.rs +228 -0
  131. package/crates/team-agent/src/model/permissions.rs +567 -0
  132. package/crates/team-agent/src/model/routing.rs +340 -0
  133. package/crates/team-agent/src/model/spec.rs +680 -0
  134. package/crates/team-agent/src/model/task_graph.rs +380 -0
  135. package/crates/team-agent/src/model/testdata/fuzz.golden.yaml +43 -0
  136. package/crates/team-agent/src/model/testdata/fuzz.yaml +43 -0
  137. package/crates/team-agent/src/model/testdata/spec_invalid_a.yaml +207 -0
  138. package/crates/team-agent/src/model/testdata/team.spec.golden.yaml +206 -0
  139. package/crates/team-agent/src/model/testdata/team.spec.yaml +206 -0
  140. package/crates/team-agent/src/model/yaml/tests.rs +288 -0
  141. package/crates/team-agent/src/model/yaml.rs +800 -0
  142. package/crates/team-agent/src/packaging/install.rs +305 -0
  143. package/crates/team-agent/src/packaging/migrate.rs +30 -0
  144. package/crates/team-agent/src/packaging/mod.rs +82 -0
  145. package/crates/team-agent/src/packaging/repair.rs +24 -0
  146. package/crates/team-agent/src/packaging/tests.rs +829 -0
  147. package/crates/team-agent/src/packaging/types.rs +369 -0
  148. package/crates/team-agent/src/provider/adapter.rs +801 -0
  149. package/crates/team-agent/src/provider/approvals/mod.rs +2 -0
  150. package/crates/team-agent/src/provider/approvals/parsing.rs +452 -0
  151. package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +163 -0
  152. package/crates/team-agent/src/provider/classify.rs +456 -0
  153. package/crates/team-agent/src/provider/faults.rs +136 -0
  154. package/crates/team-agent/src/provider/helpers.rs +41 -0
  155. package/crates/team-agent/src/provider/mod.rs +53 -0
  156. package/crates/team-agent/src/provider/startup_prompt.rs +423 -0
  157. package/crates/team-agent/src/provider/tests/adapter.rs +239 -0
  158. package/crates/team-agent/src/provider/tests/classify.rs +240 -0
  159. package/crates/team-agent/src/provider/tests/faults.rs +120 -0
  160. package/crates/team-agent/src/provider/tests/idle.rs +208 -0
  161. package/crates/team-agent/src/provider/tests/wire.rs +213 -0
  162. package/crates/team-agent/src/provider/tests.rs +31 -0
  163. package/crates/team-agent/src/provider/types.rs +424 -0
  164. package/crates/team-agent/src/state/identity.rs +656 -0
  165. package/crates/team-agent/src/state/mod.rs +58 -0
  166. package/crates/team-agent/src/state/owner_gate.rs +423 -0
  167. package/crates/team-agent/src/state/persist.rs +712 -0
  168. package/crates/team-agent/src/state/projection.rs +657 -0
  169. package/crates/team-agent/src/state/selector.rs +105 -0
  170. package/crates/team-agent/src/state/testdata/state-rich.canonical.json +133 -0
  171. package/crates/team-agent/src/tmux_backend/tests.rs +586 -0
  172. package/crates/team-agent/src/tmux_backend.rs +758 -0
  173. package/crates/team-agent/src/transport/test_support.rs +252 -0
  174. package/crates/team-agent/src/transport/tests/behavior.rs +327 -0
  175. package/crates/team-agent/src/transport/tests/mod.rs +199 -0
  176. package/crates/team-agent/src/transport/tests/wire.rs +527 -0
  177. package/crates/team-agent/src/transport.rs +774 -0
  178. package/npm/install.mjs +90 -106
  179. package/package.json +15 -13
  180. package/crates/team-agent-core/Cargo.toml +0 -12
  181. package/crates/team-agent-core/src/lib.rs +0 -332
  182. package/crates/team-agent-core/src/main.rs +0 -152
  183. package/pyproject.toml +0 -18
  184. package/scripts/install.py +0 -88
  185. package/scripts/run_regression_tests.py +0 -83
  186. package/src/team_agent/__init__.py +0 -3
  187. package/src/team_agent/__main__.py +0 -5
  188. package/src/team_agent/_legacy_pane_discovery.py +0 -186
  189. package/src/team_agent/abnormal_track.py +0 -253
  190. package/src/team_agent/approvals/__init__.py +0 -65
  191. package/src/team_agent/approvals/constants.py +0 -6
  192. package/src/team_agent/approvals/parsing.py +0 -176
  193. package/src/team_agent/approvals/runtime_prompts.py +0 -171
  194. package/src/team_agent/approvals/status.py +0 -176
  195. package/src/team_agent/cli/__init__.py +0 -137
  196. package/src/team_agent/cli/commands.py +0 -481
  197. package/src/team_agent/cli/e2e.py +0 -202
  198. package/src/team_agent/cli/helpers.py +0 -226
  199. package/src/team_agent/cli/parser.py +0 -540
  200. package/src/team_agent/compiler.py +0 -334
  201. package/src/team_agent/coordinator/__init__.py +0 -53
  202. package/src/team_agent/coordinator/__main__.py +0 -119
  203. package/src/team_agent/coordinator/lifecycle.py +0 -411
  204. package/src/team_agent/coordinator/metadata.py +0 -61
  205. package/src/team_agent/coordinator/paths.py +0 -17
  206. package/src/team_agent/diagnose/__init__.py +0 -48
  207. package/src/team_agent/diagnose/checks.py +0 -101
  208. package/src/team_agent/diagnose/comms.py +0 -213
  209. package/src/team_agent/diagnose/health.py +0 -241
  210. package/src/team_agent/diagnose/orphan_cleanup.py +0 -364
  211. package/src/team_agent/diagnose/preflight.py +0 -194
  212. package/src/team_agent/diagnose/quick_start.py +0 -324
  213. package/src/team_agent/display/__init__.py +0 -92
  214. package/src/team_agent/display/adaptive.py +0 -511
  215. package/src/team_agent/display/backend.py +0 -46
  216. package/src/team_agent/display/close.py +0 -154
  217. package/src/team_agent/display/ghostty.py +0 -77
  218. package/src/team_agent/display/rebuild.py +0 -102
  219. package/src/team_agent/display/tiling.py +0 -156
  220. package/src/team_agent/display/worker_window.py +0 -114
  221. package/src/team_agent/display/workspace.py +0 -382
  222. package/src/team_agent/errors.py +0 -10
  223. package/src/team_agent/events.py +0 -84
  224. package/src/team_agent/fake_worker.py +0 -80
  225. package/src/team_agent/idle_predicate.py +0 -218
  226. package/src/team_agent/idle_takeover.py +0 -59
  227. package/src/team_agent/idle_takeover_wiring.py +0 -114
  228. package/src/team_agent/launch/__init__.py +0 -41
  229. package/src/team_agent/launch/bootstrap.py +0 -85
  230. package/src/team_agent/launch/config.py +0 -106
  231. package/src/team_agent/launch/core.py +0 -301
  232. package/src/team_agent/launch/requirements.py +0 -57
  233. package/src/team_agent/leader/__init__.py +0 -926
  234. package/src/team_agent/leader_binding.py +0 -183
  235. package/src/team_agent/lifecycle/__init__.py +0 -5
  236. package/src/team_agent/lifecycle/agents.py +0 -278
  237. package/src/team_agent/lifecycle/operations.py +0 -411
  238. package/src/team_agent/lifecycle/paste_buffer_hygiene.py +0 -39
  239. package/src/team_agent/lifecycle/start.py +0 -363
  240. package/src/team_agent/mcp_server/__init__.py +0 -42
  241. package/src/team_agent/mcp_server/__main__.py +0 -7
  242. package/src/team_agent/mcp_server/contracts.py +0 -148
  243. package/src/team_agent/mcp_server/normalize.py +0 -257
  244. package/src/team_agent/mcp_server/server.py +0 -150
  245. package/src/team_agent/mcp_server/tools.py +0 -352
  246. package/src/team_agent/message_store/__init__.py +0 -23
  247. package/src/team_agent/message_store/agent_health.py +0 -113
  248. package/src/team_agent/message_store/core.py +0 -497
  249. package/src/team_agent/message_store/leader_notification_log.py +0 -198
  250. package/src/team_agent/message_store/result_watchers.py +0 -251
  251. package/src/team_agent/message_store/schema.py +0 -308
  252. package/src/team_agent/message_store/schema_migration.py +0 -448
  253. package/src/team_agent/messaging/__init__.py +0 -1
  254. package/src/team_agent/messaging/activity_detector.py +0 -262
  255. package/src/team_agent/messaging/delivery.py +0 -504
  256. package/src/team_agent/messaging/deps.py +0 -247
  257. package/src/team_agent/messaging/idle_alerts.py +0 -423
  258. package/src/team_agent/messaging/internal_delivery.py +0 -46
  259. package/src/team_agent/messaging/leader.py +0 -497
  260. package/src/team_agent/messaging/leader_api_errors.py +0 -216
  261. package/src/team_agent/messaging/leader_panes.py +0 -673
  262. package/src/team_agent/messaging/owner_bypass.py +0 -29
  263. package/src/team_agent/messaging/result_delivery.py +0 -539
  264. package/src/team_agent/messaging/results.py +0 -447
  265. package/src/team_agent/messaging/scheduler.py +0 -450
  266. package/src/team_agent/messaging/send.py +0 -532
  267. package/src/team_agent/messaging/session_drift.py +0 -94
  268. package/src/team_agent/messaging/tmux_io.py +0 -506
  269. package/src/team_agent/messaging/tmux_prompt.py +0 -338
  270. package/src/team_agent/messaging/trust_auto_answer.py +0 -52
  271. package/src/team_agent/orchestrator/__init__.py +0 -376
  272. package/src/team_agent/orchestrator/plan.py +0 -122
  273. package/src/team_agent/orchestrator/state.py +0 -128
  274. package/src/team_agent/paths.py +0 -45
  275. package/src/team_agent/permissions.py +0 -123
  276. package/src/team_agent/profiles/__init__.py +0 -82
  277. package/src/team_agent/profiles/constants.py +0 -19
  278. package/src/team_agent/profiles/core.py +0 -407
  279. package/src/team_agent/profiles/helpers.py +0 -69
  280. package/src/team_agent/profiles/provider_env.py +0 -188
  281. package/src/team_agent/profiles/smoke.py +0 -201
  282. package/src/team_agent/provider_cli/__init__.py +0 -43
  283. package/src/team_agent/provider_cli/adapter.py +0 -172
  284. package/src/team_agent/provider_cli/base.py +0 -48
  285. package/src/team_agent/provider_cli/claude.py +0 -503
  286. package/src/team_agent/provider_cli/codex.py +0 -336
  287. package/src/team_agent/provider_cli/copilot.py +0 -8
  288. package/src/team_agent/provider_cli/fake.py +0 -39
  289. package/src/team_agent/provider_cli/gemini.py +0 -95
  290. package/src/team_agent/provider_cli/opencode.py +0 -8
  291. package/src/team_agent/provider_cli/prompt.py +0 -62
  292. package/src/team_agent/provider_cli/registry.py +0 -18
  293. package/src/team_agent/provider_cli/unsupported.py +0 -32
  294. package/src/team_agent/provider_state/README.md +0 -78
  295. package/src/team_agent/provider_state/__init__.py +0 -91
  296. package/src/team_agent/provider_state/claude.py +0 -86
  297. package/src/team_agent/provider_state/codex.py +0 -84
  298. package/src/team_agent/provider_state/common.py +0 -207
  299. package/src/team_agent/provider_state/registry.py +0 -118
  300. package/src/team_agent/providers.py +0 -163
  301. package/src/team_agent/quality_gates.py +0 -104
  302. package/src/team_agent/restart/__init__.py +0 -34
  303. package/src/team_agent/restart/orchestration.py +0 -554
  304. package/src/team_agent/restart/selection.py +0 -89
  305. package/src/team_agent/restart/snapshot.py +0 -70
  306. package/src/team_agent/routing.py +0 -84
  307. package/src/team_agent/runtime.py +0 -1243
  308. package/src/team_agent/rust_core.py +0 -327
  309. package/src/team_agent/sessions/__init__.py +0 -25
  310. package/src/team_agent/sessions/capture.py +0 -144
  311. package/src/team_agent/sessions/inventory.py +0 -44
  312. package/src/team_agent/sessions/resume.py +0 -135
  313. package/src/team_agent/simple_yaml.py +0 -236
  314. package/src/team_agent/spec.py +0 -370
  315. package/src/team_agent/state.py +0 -693
  316. package/src/team_agent/status/__init__.py +0 -63
  317. package/src/team_agent/status/approvals.py +0 -52
  318. package/src/team_agent/status/compact.py +0 -158
  319. package/src/team_agent/status/constants.py +0 -18
  320. package/src/team_agent/status/inbox.py +0 -58
  321. package/src/team_agent/status/peek.py +0 -117
  322. package/src/team_agent/status/queries.py +0 -199
  323. package/src/team_agent/task_graph.py +0 -80
  324. package/src/team_agent/terminal.py +0 -57
  325. package/src/team_agent/wake.py +0 -58
  326. package/src/team_agent/watch/__init__.py +0 -145
@@ -0,0 +1,2032 @@
1
+ //! Coordinator core: daemon lifecycle host + single-tick orchestration (19 steps in fixed order) + health/start/stop.
2
+
3
+ use std::path::{Path, PathBuf};
4
+
5
+ use serde_json::Value;
6
+ use thiserror::Error;
7
+
8
+ use crate::event_log::EventLog;
9
+ use crate::leader::{TakeoverReminderResult, TurnClassification, TurnStateClassifier};
10
+ use crate::provider::{
11
+ approval_choice_keys, awaiting_human_confirm_fact, awaiting_human_confirm_reason,
12
+ choose_internal_mcp_approval_choice, extract_approval_prompt, runtime_approval_decision,
13
+ ProcessLiveness, RuntimeApprovalDecision, TurnState,
14
+ };
15
+
16
+ use super::health::{
17
+ coordinator_log_path, coordinator_meta_path, coordinator_metadata_ok, coordinator_pid_path,
18
+ pid_is_running, read_coordinator_metadata, write_coordinator_metadata,
19
+ };
20
+ use super::types::{
21
+ AgentId, CoordinatorHealthStatus, HealthReport, MetadataSource, Pid, ProviderRegistry,
22
+ SchemaHealth, StartError, StartOutcome, StartReport, StopError, StopOutcome, StopReport,
23
+ TickStopReason, WorkspacePath,
24
+ };
25
+ use super::types::{
26
+ CollectedResult, CompactionResult, DeadlockAlert, DeliveredMessage, FiredScheduledEvent,
27
+ IdleAlert, LeaderApiError, SessionDriftResult,
28
+ };
29
+
30
+ // ===========================================================================
31
+ // TickReport / TickError(§10:tick(..) -> Result<TickReport, TickError>)
32
+ // ===========================================================================
33
+
34
+ /// Report for a single tick (`lifecycle.py:373-385` success / `:349-363` degraded).
35
+ /// A degraded tick is expressed as `ok:false, reason: Some(PersistenceDegraded)` (per the card table).
36
+ /// `stop:true` makes the main loop exit (tmux_session_missing).
37
+ #[derive(Debug, Clone, PartialEq, Eq)]
38
+ pub struct TickReport {
39
+ /// `ok` (`lifecycle.py:374`).
40
+ pub ok: bool,
41
+ /// `stop` (`lifecycle.py:279/375`) -- `true` makes the main loop break.
42
+ pub stop: bool,
43
+ /// Reason reported when the tick is not ok (`lifecycle.py:279,353`).
44
+ pub reason: Option<TickStopReason>,
45
+ /// bug-084: whether the tick-end save reached disk (`lifecycle.py:354`). `None` means the save was never reached (early return).
46
+ pub persisted: Option<bool>,
47
+ /// Messages delivered by `_deliver_pending_messages` (count/handles, `lifecycle.py:285`) -- cross-dep step 11.
48
+ pub delivered: Vec<DeliveredMessage>,
49
+ /// Scheduled events fired by `_fire_due_scheduled_events` (`lifecycle.py:286`) -- cross-dep step 11.
50
+ pub scheduled: Vec<FiredScheduledEvent>,
51
+ /// Agents reported as stuck by `_detect_stuck_agents` (`lifecycle.py:287`) -- cross-dep step 11.
52
+ pub stuck: Vec<AgentId>,
53
+ /// Idle take-over reminders (`lifecycle.py:303-308`) -- one entry when should_ping holds.
54
+ pub idle_alerts: Vec<IdleAlert>,
55
+ /// Output of `detect_cross_worker_deadlocks` (`lifecycle.py:309`) -- cross-dep step 11.
56
+ pub deadlock_alerts: Vec<DeadlockAlert>,
57
+ /// Results of `detect_compaction_degradation` (`lifecycle.py:310-330`, codex only) -- cross-dep step 11.
58
+ pub compaction: Vec<CompactionResult>,
59
+ /// Results of `detect_session_drift` (`lifecycle.py:331-343`, codex only) -- cross-dep step 11.
60
+ pub session_drift: Vec<SessionDriftResult>,
61
+ /// Output of `detect_leader_api_errors` (`lifecycle.py:344`) -- cross-dep step 11.
62
+ pub api_errors: Vec<LeaderApiError>,
63
+ /// Output of `_collect_results_and_notify_watchers` (`lifecycle.py:364`) -- empty when degraded (that step is never reached).
64
+ pub results: Vec<CollectedResult>,
65
+ }
66
+
67
+ /// Hard tick failure (§10: the daemon path returns `Result`). bug-084: a `save_runtime_state` failure does **not**
68
+ /// land here (that is a degraded `TickReport`, which the main loop does not catch); this enum covers hard failures
69
+ /// in the rest of tick orchestration (load state / store construction / atomic calls); the main loop catches them and backs off (`__main__.py:62`).
70
+ #[derive(Debug, Error)]
71
+ pub enum TickError {
72
+ /// `load_runtime_state` failed (corrupt state.json / lock).
73
+ #[error("load runtime state failed: {0}")]
74
+ StateLoad(#[from] crate::state::StateError),
75
+ /// Constructing `MessageStore(workspace)` failed (`lifecycle.py:275`).
76
+ #[error("message store: {0}")]
77
+ MessageStore(#[from] crate::message_store::MessageStoreError),
78
+ /// Writing to the EventLog failed.
79
+ #[error("event log: {0}")]
80
+ EventLog(#[from] crate::event_log::EventLogError),
81
+ /// A transport probe failed (tmux session liveness query, etc.).
82
+ #[error("transport: {0}")]
83
+ Transport(#[from] crate::transport::TransportError),
84
+ /// A provider trait call failed (startup/runtime prompt handlers, classifiers).
85
+ #[error("provider: {0}")]
86
+ Provider(#[from] crate::provider::ProviderError),
87
+ /// Messaging subsystem failure (delivery/scheduler/result watchers).
88
+ #[error("messaging: {0}")]
89
+ Messaging(#[from] crate::messaging::MessagingError),
90
+ }
91
+
92
+ // ===========================================================================
93
+ // Coordinator struct(daemon lifecycle + tick orchestration)
94
+ // ===========================================================================
95
+
96
+ /// Failure-injection hook for the atomic save at the end of a tick (bug-084). Production wires `None` (the real `save_runtime_state` runs);
97
+ /// tests wire a closure returning `Err` to force a save failure without touching the real disk, then assert that a degraded
98
+ /// `TickReport` comes back rather than a panic/Err. `tick` consults this hook at its atomic-save step and only falls back to the real save when it is `None`.
99
+ pub type SaveHook = Box<dyn Fn(&WorkspacePath, &Value) -> Result<(), crate::state::StateError> + Send + Sync>;
100
+
101
+ /// Recorder for the ORDER of tick's chained side effects (test probe). The intent is that `tick` pushes one
102
+ /// stable step name at every atomic call site and tests assert the fixed sequence. Production wires `None` (meant to be zero-overhead, guarded by `if let Some(rec)`).
103
+ pub type OrderRecorder = std::sync::Arc<std::sync::Mutex<Vec<&'static str>>>;
104
+
105
+ /// Per-workspace coordinator: the host for the daemon main loop and for single-tick orchestration.
106
+ ///
107
+ /// Provider calls always go through the injected `ProviderAdapter` trait object (MUST-NOT-13: **never** depend on any
108
+ /// provider client crate; test mocks assert a call count of 0). Transport probes go through the injected `Transport` trait.
109
+ pub struct Coordinator {
110
+ workspace: WorkspacePath,
111
+ /// Provider adapter resolver (equivalent of `get_provider_registry`; injected via a trait, so it can be mocked).
112
+ #[allow(dead_code)]
113
+ provider_registry: Box<dyn ProviderRegistry>,
114
+ /// Transport control plane (tmux session liveness probes, etc.; injected via a trait, so it can be mocked).
115
+ #[allow(dead_code)]
116
+ transport: Box<dyn crate::transport::Transport>,
117
+ /// bug-084 save-injection hook. `None` means the real `state::save_runtime_state` is used.
118
+ #[allow(dead_code)]
119
+ save_hook: Option<SaveHook>,
120
+ /// ORDER probe for tick side effects. `None` means nothing is recorded (production).
121
+ #[allow(dead_code)]
122
+ order_recorder: Option<OrderRecorder>,
123
+ }
124
+
125
+ impl Coordinator {
126
+ /// Constructor: injects the provider registry and transport and leaves `save_hook` / `order_recorder` as `None`. Per the original note, the spawned daemon assembles it before `run`.
127
+ pub fn new(
128
+ workspace: WorkspacePath,
129
+ provider_registry: Box<dyn ProviderRegistry>,
130
+ transport: Box<dyn crate::transport::Transport>,
131
+ ) -> Self {
132
+ Self {
133
+ workspace,
134
+ provider_registry,
135
+ transport,
136
+ save_hook: None,
137
+ order_recorder: None,
138
+ }
139
+ }
140
+
141
+ /// Test assembly: builds a `Coordinator` directly from a mock transport, a mock provider registry,
142
+ /// an optional save-injection hook and an optional ORDER probe. **Pure test-support scaffolding**:
143
+ /// it only populates the fields and runs no daemon logic. Unlike `new`, the caller supplies
144
+ /// `save_hook` and `order_recorder` instead of having both default to `None`. Compiled only under `cfg(test)`.
145
+ #[cfg(test)]
146
+ pub(crate) fn for_test(
147
+ workspace: WorkspacePath,
148
+ provider_registry: Box<dyn ProviderRegistry>,
149
+ transport: Box<dyn crate::transport::Transport>,
150
+ save_hook: Option<SaveHook>,
151
+ order_recorder: Option<OrderRecorder>,
152
+ ) -> Self {
153
+ Self {
154
+ workspace,
155
+ provider_registry,
156
+ transport,
157
+ save_hook,
158
+ order_recorder,
159
+ }
160
+ }
161
+
162
+ // ── tick 编排(lifecycle.py:250-385)──────────────────────────────────────
163
+
164
+ /// 单次 tick(`coordinator_tick`,`lifecycle.py:250`)。固定顺序串 step 8-11 原子:
165
+ /// load state → tmux session 存活门(missing → stop:true)→ capture missing sessions →
166
+ /// refresh runtime statuses → provider startup/runtime prompts → sync health →
167
+ /// deliver pending → fire scheduled → detect stuck → idle/takeover ping(should_ping 时一条)→
168
+ /// deadlock/compaction/drift/api-error 只读探测 → **原子 save state(bug-084 唯一包裹点)** →
169
+ /// collect results → prune dedupe log。
170
+ ///
171
+ /// §10:daemon-path 返 `Result<TickReport, TickError>`。bug-084:save 失败返
172
+ /// degraded `Ok(TickReport{ok:false, reason:PersistenceDegraded, persisted:Some(false)})`
173
+ /// (**不**走 `Err`,主循环不 catch degraded,只 catch `Err` 退避)。
174
+ /// §84:无 pending obligation + event 时**绝不**注入探索性 prompt。
175
+ ///
176
+ /// PORTER:在 ATOMIC save 包裹点先查 `self.save_hook`(`Some` → 用它代替真实
177
+ /// `state::save_runtime_state`,bug-084 测试注入失败);在每个 step8-11 原子调用点
178
+ /// `if let Some(rec) = &self.order_recorder { rec.lock()...push(STEP_NAME) }`(tick
179
+ /// 副作用 ORDER 测试断言固定序列)。生产两者均 `None`,零开销。
180
+ pub fn tick(&self) -> Result<TickReport, TickError> {
181
+ self.record_step("load_state");
182
+ let mut state = crate::state::persist::load_runtime_state(self.workspace.as_path())?;
183
+ let store = crate::message_store::MessageStore::open(self.workspace.as_path())?;
184
+ let event_log = EventLog::new(self.workspace.as_path());
185
+ increment_coordinator_tick_iteration_count(&mut state);
186
+
187
+ self.record_step("tmux_session_gate");
188
+ if let Some(session_name) = state
189
+ .get("session_name")
190
+ .and_then(Value::as_str)
191
+ .filter(|s| !s.is_empty())
192
+ {
193
+ let session = crate::transport::SessionName::new(session_name);
194
+ if !self.transport.has_session(&session)? {
195
+ event_log.write(
196
+ "coordinator.session_missing",
197
+ serde_json::json!({"session": session_name}),
198
+ )?;
199
+ return Ok(empty_tick_report(
200
+ false,
201
+ true,
202
+ Some(TickStopReason::TmuxSessionMissing),
203
+ None,
204
+ ));
205
+ }
206
+ }
207
+
208
+ self.record_step("capture_missing");
209
+ self.capture_missing_sessions(&mut state)?;
210
+
211
+ self.record_step("refresh_statuses");
212
+ // TODO(spine slice 2b): split lightweight runtime status refresh from health sync.
213
+
214
+ self.record_step("startup_prompts");
215
+ self.handle_startup_prompts(&mut state, &event_log);
216
+
217
+ // #229 step2-retry: once an agent's `startup_prompts` flipped to `handled`
218
+ // (this tick OR earlier), `queued_until_trust` messages for that recipient
219
+ // become deliverable. Reset them to `accepted` so the existing
220
+ // `deliver_pending` step below picks them up on THIS tick. Reuses the
221
+ // delivery pipeline; no new injector. Best-effort logging on inner errors.
222
+ if let Err(error) = self.requeue_trust_retries_for_handled_agents(&state, &store, &event_log) {
223
+ let _ = event_log.write(
224
+ "messaging.trust_retry_requeue_failed",
225
+ serde_json::json!({"error": error.to_string()}),
226
+ );
227
+ }
228
+
229
+ self.record_step("runtime_prompts");
230
+ self.handle_runtime_approval_prompts(&mut state, &event_log)?;
231
+
232
+ self.record_step("sync_health");
233
+ self.sync_agent_health(&mut state, &store, &event_log)?;
234
+ self.detect_abnormal_exits(&mut state, &event_log)?;
235
+
236
+ self.record_step("deliver_pending");
237
+ let delivered = crate::messaging::deliver_pending_messages(
238
+ self.workspace.as_path(),
239
+ &state,
240
+ self.transport.as_ref(),
241
+ &event_log,
242
+ )?
243
+ .into_iter()
244
+ .map(|message_id| DeliveredMessage { message_id })
245
+ .collect::<Vec<_>>();
246
+
247
+ self.record_step("fire_scheduled");
248
+ let scheduled = crate::messaging::fire_due_scheduled_events(
249
+ self.workspace.as_path(),
250
+ &store,
251
+ self.transport.as_ref(),
252
+ &event_log,
253
+ )?
254
+ .into_iter()
255
+ .map(|id| FiredScheduledEvent { id })
256
+ .collect::<Vec<_>>();
257
+
258
+ // #236 nag_removal (N35): the time/state-inferred idle/stuck/deadlock nag
259
+ // generators are no longer wired in. Step labels stay (tick ORDER lock) but
260
+ // each body is a strict "produce no nag output" — empty `stuck`, empty
261
+ // `idle_alerts`, empty `deadlock_alerts`. Delivery primitives
262
+ // (deliver_pending / fire_scheduled / collect_results) above and below this
263
+ // block continue to flow unchanged. `_state` / `_store` here are intentionally
264
+ // unused (the lookups they powered were nag inputs only).
265
+ self.record_step("detect_stuck");
266
+ let stuck: Vec<AgentId> = Vec::new();
267
+ self.record_step("record_unknown_idle");
268
+ self.record_step("evaluate_takeover");
269
+ let idle_alerts: Vec<IdleAlert> = Vec::new();
270
+ self.record_step("detect_deadlocks");
271
+ let deadlock_alerts: Vec<DeadlockAlert> = Vec::new();
272
+ let _ = (&state, &store);
273
+
274
+ for step in ["detect_compaction", "detect_drift", "detect_api_errors"] {
275
+ self.record_step(step);
276
+ // TODO(spine slice 2): wire via capture seam.
277
+ }
278
+
279
+ self.record_step("atomic_save");
280
+ let saved = match &self.save_hook {
281
+ Some(hook) => hook(&self.workspace, &state),
282
+ None => crate::state::persist::save_runtime_state(self.workspace.as_path(), &state),
283
+ };
284
+ if saved.is_err() {
285
+ return Ok(base_tick_report(
286
+ false,
287
+ false,
288
+ Some(TickStopReason::PersistenceDegraded),
289
+ Some(false),
290
+ TickCollections {
291
+ delivered,
292
+ scheduled,
293
+ stuck,
294
+ idle_alerts,
295
+ deadlock_alerts,
296
+ results: Vec::new(),
297
+ },
298
+ ));
299
+ }
300
+
301
+ self.record_step("collect_results");
302
+ let results = collect_results(
303
+ crate::messaging::collect_results_and_notify_watchers(self.workspace.as_path(), &event_log)?,
304
+ );
305
+ self.record_step("prune_dedupe_log");
306
+ Ok(base_tick_report(
307
+ true,
308
+ false,
309
+ None,
310
+ Some(true),
311
+ TickCollections { delivered, scheduled, stuck, idle_alerts, deadlock_alerts, results },
312
+ ))
313
+ }
314
+
315
+ // #236 nag_removal (N35): the framework-synthesized idle/stuck/deadlock nag
316
+ // generators (record_unknown_idle_nodes / evaluate_takeover / build_idle_nodes)
317
+ // were removed by design. Delivery primitives still flow through the rest of
318
+ // the tick body unchanged.
319
+
320
+ fn capture_missing_sessions(&self, state: &mut Value) -> Result<(), TickError> {
321
+ let Some(agents) = state.get_mut("agents").and_then(Value::as_object_mut) else {
322
+ return Ok(());
323
+ };
324
+ for (agent_id, agent) in agents {
325
+ let Some(agent_obj) = agent.as_object_mut() else {
326
+ continue;
327
+ };
328
+ if agent_obj.get("session_id").and_then(Value::as_str).is_some() {
329
+ continue;
330
+ }
331
+ let Some(spawn_cwd) = agent_obj.get("spawn_cwd").and_then(Value::as_str) else {
332
+ continue;
333
+ };
334
+ let Some(provider) = agent_obj
335
+ .get("provider")
336
+ .and_then(Value::as_str)
337
+ .and_then(parse_provider)
338
+ else {
339
+ continue;
340
+ };
341
+ let adapter = self.provider_registry.adapter_for(provider);
342
+ let captured = adapter.capture_session_id(
343
+ agent_id,
344
+ std::path::Path::new(spawn_cwd),
345
+ 0,
346
+ )?;
347
+ if let Some(captured) = captured {
348
+ if let Some(session_id) = captured.session_id {
349
+ agent_obj.insert("session_id".to_string(), serde_json::json!(session_id.as_str()));
350
+ }
351
+ if let Some(rollout_path) = captured.rollout_path {
352
+ agent_obj.insert(
353
+ "rollout_path".to_string(),
354
+ serde_json::json!(rollout_path.as_path().to_string_lossy()),
355
+ );
356
+ }
357
+ }
358
+ }
359
+ Ok(())
360
+ }
361
+
362
+ fn sync_agent_health(
363
+ &self,
364
+ state: &mut Value,
365
+ store: &crate::message_store::MessageStore,
366
+ event_log: &EventLog,
367
+ ) -> Result<(), TickError> {
368
+ let snapshot = state.clone();
369
+ let team = crate::state::projection::team_state_key(&snapshot);
370
+ let session_name = state.get("session_name").and_then(Value::as_str).map(str::to_string);
371
+ let Some(agents) = state.get_mut("agents").and_then(Value::as_object_mut) else {
372
+ return Ok(());
373
+ };
374
+ for (agent_id, agent) in agents {
375
+ let Some((session, window, target)) = capture_window_target(agent, session_name.as_deref()) else {
376
+ continue;
377
+ };
378
+ let windows = match self.transport.list_windows(&session) {
379
+ Ok(windows) => windows,
380
+ Err(error) => {
381
+ event_log.write(
382
+ "coordinator.agent_capture_failed",
383
+ serde_json::json!({
384
+ "agent_id": agent_id,
385
+ "target": format!("{target:?}"),
386
+ "error": error.to_string(),
387
+ }),
388
+ )?;
389
+ continue;
390
+ }
391
+ };
392
+ if !windows.iter().any(|known| known == &window) {
393
+ continue;
394
+ }
395
+ let captured = match self
396
+ .transport
397
+ .capture(&target, crate::transport::CaptureRange::Tail(40))
398
+ {
399
+ Ok(captured) => captured,
400
+ Err(error) => {
401
+ event_log.write(
402
+ "coordinator.agent_capture_failed",
403
+ serde_json::json!({
404
+ "agent_id": agent_id,
405
+ "target": format!("{target:?}"),
406
+ "error": error.to_string(),
407
+ }),
408
+ )?;
409
+ continue;
410
+ }
411
+ };
412
+ let pane_in_mode = agent
413
+ .get("pane_in_mode")
414
+ .and_then(Value::as_bool)
415
+ .unwrap_or(false);
416
+ let current_command = agent
417
+ .get("pane_current_command")
418
+ .or_else(|| agent.get("current_command"))
419
+ .and_then(Value::as_str);
420
+ let last_output_at = agent.get("last_output_at").and_then(Value::as_str);
421
+ let activity = crate::messaging::classify_agent_activity(
422
+ &snapshot,
423
+ &captured.text,
424
+ pane_in_mode,
425
+ current_command,
426
+ last_output_at,
427
+ );
428
+ let last_output_at = write_activity(agent, &activity, !captured.text.is_empty());
429
+ write_agent_health(store, &team, agent_id, agent, &activity, last_output_at.as_deref())?;
430
+ }
431
+ Ok(())
432
+ }
433
+
434
+ /// #236 `worker.abnormal_exit` watcher.
435
+ ///
436
+ /// Notify only when both signals are true: the provider process is dead AND the
437
+ /// latest transcript/rollout JSONL record is an explicit provider error. Dead-only
438
+ /// and error-only observations are written as check/suppressed audit events with
439
+ /// `notification=false`; they never call the N32 leader funnel. This path is
440
+ /// intentionally separate from the generic transcript-only abnormal fact track.
441
/// Per watched agent the pass stats and reads the rollout file, resolves process liveness,
/// upserts the agent's watch payload into `state`, and de-duplicates check, suppression and
/// notification events through keys stored in `state`.
+ fn detect_abnormal_exits(
442
+ &self,
443
+ state: &mut Value,
444
+ event_log: &EventLog,
445
+ ) -> Result<(), TickError> {
446
+ let snapshot = state.clone(); // frozen copy: iterated while `state` is mutated per agent
447
+ let team = crate::state::projection::team_state_key(&snapshot);
448
+ let session_name = snapshot.get("session_name").and_then(Value::as_str);
449
+ let targets = self.transport.list_targets().unwrap_or_default(); // a failed listing is treated as the default (empty) value
450
+ for agent in abnormal_watch_agents(&snapshot) {
451
+ let rollout_path = resolve_agent_rollout_path(self.workspace.as_path(), &agent.rollout_path);
452
+ let metadata = match std::fs::metadata(&rollout_path) {
453
+ Ok(metadata) => metadata,
454
+ Err(error) => { // file cannot be stat'ed: record "unverifiable" with the error text and move on
455
+ upsert_abnormal_watch(
456
+ state,
457
+ &agent.agent_id,
458
+ abnormal_watch_payload(&agent, None, None, "unverifiable", None, Some(error.to_string())),
459
+ );
460
+ continue;
461
+ }
462
+ };
463
+ let size = metadata.len();
464
+ let mtime_ns = metadata_mtime_ns(&metadata);
465
+ let text = match std::fs::read_to_string(&rollout_path) {
466
+ Ok(text) => text,
467
+ Err(error) => { // file cannot be read as UTF-8 text: same "unverifiable" record, size/mtime kept
468
+ upsert_abnormal_watch(
469
+ state,
470
+ &agent.agent_id,
471
+ abnormal_watch_payload(&agent, Some(size), mtime_ns, "unverifiable", None, Some(error.to_string())),
472
+ );
473
+ continue;
474
+ }
475
+ };
476
+ let liveness = agent_process_liveness(
477
+ &agent,
478
+ session_name,
479
+ &targets,
480
+ self.transport.as_ref(),
481
+ );
482
+ let fact = crate::provider::latest_explicit_error_fact(agent.provider, &text);
483
+ let decision = abnormal_exit_decision(liveness.state, fact.as_ref());
484
+ let check_key = abnormal_check_key(&agent, &liveness, fact.as_ref(), size);
485
+ upsert_abnormal_watch(
486
+ state,
487
+ &agent.agent_id,
488
+ abnormal_watch_payload(
489
+ &agent,
490
+ Some(size),
491
+ mtime_ns,
492
+ process_liveness_wire(liveness.state),
493
+ fact.as_ref().map(|f| f.signature.as_str()),
494
+ None,
495
+ ),
496
+ );
497
+ if abnormal_last_check_key(state, &agent.agent_id).as_deref() != Some(check_key.as_str()) { // write a check event only when the check key changed
498
+ write_abnormal_check(event_log, &team, &agent, &liveness, fact.as_ref(), decision, size, mtime_ns)?;
499
+ mark_abnormal_checked(state, &agent.agent_id, &check_key);
500
+ }
501
+ let fact = match (decision, fact) { // only `Notify` with a fact falls through to the notification below
502
+ (AbnormalExitDecision::Notify, Some(fact)) => fact,
503
+ (AbnormalExitDecision::Suppress(reason), _) => {
504
+ let suppress_key = abnormal_suppression_key(&agent, &liveness, reason, size);
505
+ if abnormal_last_suppressed_key(state, &agent.agent_id).as_deref()
506
+ != Some(suppress_key.as_str())
507
+ {
508
+ write_abnormal_suppressed(event_log, &team, &agent, &liveness, reason)?;
509
+ mark_abnormal_suppressed(state, &agent.agent_id, &suppress_key);
510
+ }
511
+ continue;
512
+ }
513
+ (AbnormalExitDecision::NoSignal, _) => continue,
514
+ (AbnormalExitDecision::Notify, None) => continue,
515
+ };
516
+ let dedupe_key = abnormal_dedupe_key(&agent, &fact, size);
517
+ if abnormal_last_notified_key(state, &agent.agent_id).as_deref() == Some(dedupe_key.as_str()) { // already notified for this key
518
+ continue;
519
+ }
520
+ let content = format_abnormal_exit_message(&team, &agent, &fact, &liveness, size);
521
+ let outcome = crate::messaging::send_to_leader_receiver(
522
+ self.workspace.as_path(),
523
+ state,
524
+ "leader",
525
+ &content,
526
+ None,
527
+ &agent.agent_id,
528
+ false,
529
+ Some(&dedupe_key),
530
+ event_log,
531
+ )?;
532
+ let notification_status = if outcome.ok {
533
+ "queued"
534
+ } else if matches!(outcome.status, crate::messaging::DeliveryStatus::Blocked) {
535
+ "rebind_required"
536
+ } else {
537
+ "refused"
538
+ };
539
+ event_log.write(
540
+ "worker.abnormal_exit",
541
+ serde_json::json!({
542
+ "team_id": team.as_str(),
543
+ "agent_id": agent.agent_id.as_str(),
544
+ "provider": provider_wire(agent.provider),
545
+ "path": agent.rollout_path_display.as_str(),
546
+ "dead_process": true,
547
+ "process_dead": true,
548
+ "provider_process_dead": true,
549
+ "latest_error": true,
550
+ "latest_explicit_error": true,
551
+ "dead_process_and_latest_error": true,
552
+ "dead_process_and_latest_explicit_error": true,
553
+ "process_dead_and_latest_explicit_error": true,
554
+ "provider_process_dead_and_latest_explicit_error": true,
555
+ "signature": fact.signature.as_str(),
556
+ "turn_id": fact.turn_id.as_ref().map(|id| id.as_str()),
557
+ "size": size,
558
+ "mtime_ns": mtime_ns,
559
+ "process_liveness": process_liveness_wire(liveness.state),
560
+ "pid_status": liveness.detail.as_str(),
561
+ "notification_message_id": outcome.message_id,
562
+ "notification_status": notification_status,
563
+ "notification_channel": outcome.channel,
564
+ }),
565
+ )?;
566
+ mark_abnormal_notified(state, &agent.agent_id, &dedupe_key); // recorded whatever `notification_status` was
567
+ }
568
+ Ok(())
569
+ }
570
+
571
+ fn handle_startup_prompts(&self, state: &mut Value, event_log: &EventLog) {
572
+ let session_name = state.get("session_name").and_then(Value::as_str).map(str::to_string);
573
+ let Some(agents) = state.get_mut("agents").and_then(Value::as_object_mut) else {
574
+ return;
575
+ };
576
+ for (agent_id, agent) in agents {
577
+ // #229 step1-idem: once trust is auto-answered, the row carries
578
+ // `startup_prompts = "handled"` (or "complete"). Both are terminal for
579
+ // this tick loop — repeated ticks must not re-classify, re-send Enter,
580
+ // or re-emit `startup_prompt_handled`. Treating "handled" the same as
581
+ // "complete" makes the observable artifact exactly-once across ticks.
582
+ if agent
583
+ .get("startup_prompts")
584
+ .and_then(Value::as_str)
585
+ .is_some_and(|status| matches!(status, "handled" | "complete"))
586
+ {
587
+ continue;
588
+ }
589
+ let Some(provider) = agent
590
+ .get("provider")
591
+ .and_then(Value::as_str)
592
+ .and_then(parse_provider)
593
+ else {
594
+ continue;
595
+ };
596
+ let Some((_, _, target)) = capture_window_target(agent, session_name.as_deref()) else {
597
+ continue;
598
+ };
599
+ let adapter = self.provider_registry.adapter_for(provider);
600
+ let handled = adapter.handle_startup_prompts(self.transport.as_ref(), &target, 1, 0.0);
601
+ if handled.is_empty() {
602
+ continue;
603
+ }
604
+ let handled_payload = serde_json::Value::Array(
605
+ handled
606
+ .into_iter()
607
+ .map(|prompt| {
608
+ serde_json::json!({
609
+ "prompt": prompt.prompt,
610
+ "action": prompt.action,
611
+ })
612
+ })
613
+ .collect(),
614
+ );
615
+ // #229 step1 observability: emit `startup_prompt_handled` so the trust
616
+ // answer is observable in events.jsonl (was silent state-write only).
617
+ // Best-effort — state write below is the source of truth.
618
+ let _ = event_log.write(
619
+ "startup_prompt_handled",
620
+ serde_json::json!({
621
+ "agent_id": agent_id,
622
+ "provider": provider,
623
+ "handled": handled_payload.clone(),
624
+ }),
625
+ );
626
+ let Some(agent_obj) = agent.as_object_mut() else {
627
+ continue;
628
+ };
629
+ agent_obj.insert("startup_prompts".to_string(), serde_json::json!("handled"));
630
+ agent_obj.insert("startup_prompt_status".to_string(), serde_json::json!("handled"));
631
+ agent_obj.insert("startup_prompt_handled".to_string(), handled_payload);
632
+ }
633
+ }
634
+
635
+ /// #229 step2-retry: after `handle_startup_prompts` flips an agent's status to
636
+ /// `handled`/`complete`, scan `messages` for `queued_until_trust` rows targeting
637
+ /// that recipient and flip them back to `accepted` so this same tick's
638
+ /// `deliver_pending` replays them. Same row, same message_id, same pipeline.
639
+ fn requeue_trust_retries_for_handled_agents(
640
+ &self,
641
+ state: &Value,
642
+ store: &crate::message_store::MessageStore,
643
+ event_log: &EventLog,
644
+ ) -> Result<(), crate::message_store::MessageStoreError> {
645
+ let Some(agents) = state.get("agents").and_then(Value::as_object) else {
646
+ return Ok(());
647
+ };
648
+ let handled_recipients: Vec<&str> = agents
649
+ .iter()
650
+ .filter(|(_, agent)| {
651
+ agent
652
+ .get("startup_prompts")
653
+ .and_then(Value::as_str)
654
+ .is_some_and(|status| matches!(status, "handled" | "complete"))
655
+ })
656
+ .map(|(id, _)| id.as_str())
657
+ .collect();
658
+ if handled_recipients.is_empty() {
659
+ return Ok(());
660
+ }
661
+ let conn = crate::db::schema::open_db(store.db_path())?;
662
+ let mut stmt = conn.prepare(
663
+ "select message_id, recipient from messages where status = 'queued_until_trust'",
664
+ )?;
665
+ let rows = stmt
666
+ .query_map([], |row| {
667
+ Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
668
+ })?
669
+ .collect::<Result<Vec<(String, String)>, _>>()?;
670
+ for (message_id, recipient) in rows {
671
+ if !handled_recipients.iter().any(|r| *r == recipient.as_str()) {
672
+ continue;
673
+ }
674
+ store.mark(&message_id, "accepted", None)?;
675
+ let _ = event_log.write(
676
+ "messaging.trust_retry_requeued",
677
+ serde_json::json!({
678
+ "message_id": message_id,
679
+ "recipient": recipient,
680
+ "reason": "startup_prompt_handled",
681
+ }),
682
+ );
683
+ }
684
+ Ok(())
685
+ }
686
+
687
/// Scans each agent's pane (`CaptureRange::Tail(80)`) for a runtime approval prompt and acts on
/// the `runtime_approval_decision`: auto-approve by sending keys, notify the leader that a
/// human confirmation is awaited, or ignore. State-level dedup changes are queued in
/// `dedup_updates` and applied after the mutable borrow of `state["agents"]` has ended.
/// Any `?` failure returns early, before the queued dedup updates are applied.
+ fn handle_runtime_approval_prompts(
688
+ &self,
689
+ state: &mut Value,
690
+ event_log: &EventLog,
691
+ ) -> Result<(), TickError> {
692
+ let snapshot = state.clone(); // frozen copy: read while `state["agents"]` is mutably borrowed
693
+ let team = crate::state::projection::team_state_key(&snapshot);
694
+ let session_name = snapshot.get("session_name").and_then(Value::as_str).map(str::to_string);
695
+ let auto_answer_allowed = runtime_approval_auto_answer_allowed();
696
+ let mut dedup_updates = Vec::new();
697
+ {
698
+ let Some(agents) = state.get_mut("agents").and_then(Value::as_object_mut) else {
699
+ return Ok(());
700
+ };
701
+ for (agent_id, agent) in agents {
702
+ let Some(target) = runtime_approval_target(agent, session_name.as_deref()) else { // no target: drop any awaiting-confirm marker
703
+ clear_awaiting_human_confirm(agent);
704
+ dedup_updates.push(AwaitingDedupUpdate::Clear {
705
+ team: team.clone(),
706
+ agent_id: agent_id.to_string(),
707
+ });
708
+ continue;
709
+ };
710
+ let captured = match self
711
+ .transport
712
+ .capture(&target, crate::transport::CaptureRange::Tail(80))
713
+ {
714
+ Ok(captured) => captured,
715
+ Err(error) => { // capture failed: log it and leave this agent's markers untouched
716
+ event_log.write(
717
+ "runtime_approval.capture_failed",
718
+ serde_json::json!({
719
+ "agent_id": agent_id,
720
+ "target": format!("{target:?}"),
721
+ "error": error.to_string(),
722
+ }),
723
+ )?;
724
+ continue;
725
+ }
726
+ };
727
+ let Some(prompt) = extract_approval_prompt(agent_id, &captured.text) else { // no approval prompt in the capture
728
+ clear_awaiting_human_confirm(agent);
729
+ dedup_updates.push(AwaitingDedupUpdate::Clear {
730
+ team: team.clone(),
731
+ agent_id: agent_id.to_string(),
732
+ });
733
+ continue;
734
+ };
735
+ match runtime_approval_decision(&prompt, auto_answer_allowed) {
736
+ RuntimeApprovalDecision::AutoApprove => {
737
+ clear_awaiting_human_confirm(agent);
738
+ dedup_updates.push(AwaitingDedupUpdate::Clear {
739
+ team: team.clone(),
740
+ agent_id: agent_id.to_string(),
741
+ });
742
+ let choice = choose_internal_mcp_approval_choice(&prompt);
743
+ let keys = approval_choice_keys(&prompt, &captured.text, &choice)
744
+ .into_iter()
745
+ .filter_map(runtime_approval_key)
746
+ .collect::<Vec<_>>();
747
+ self.transport.send_keys(&target, &keys)?;
748
+ let after = self // re-capture to see whether the prompt is still on screen; a capture error counts as "no prompt"
749
+ .transport
750
+ .capture(&target, crate::transport::CaptureRange::Tail(80))
751
+ .ok()
752
+ .and_then(|capture| extract_approval_prompt(agent_id, &capture.text));
753
+ let cleared = after // cleared = no prompt afterwards, or a prompt with a different text/tool
754
+ .as_ref()
755
+ .is_none_or(|after| after.prompt != prompt.prompt || after.tool != prompt.tool);
756
+ event_log.write(
757
+ "runtime_approval.auto_approved",
758
+ serde_json::json!({
759
+ "agent_id": agent_id,
760
+ "tool": prompt.tool,
761
+ "choice": choice,
762
+ "cleared": cleared,
763
+ }),
764
+ )?;
765
+ }
766
+ RuntimeApprovalDecision::AwaitingHumanConfirm => {
767
+ let Some(reason) = awaiting_human_confirm_reason(&prompt, auto_answer_allowed) else {
768
+ continue;
769
+ };
770
+ let fact = awaiting_human_confirm_fact(&team, agent_id, &prompt, reason);
771
+ let previous = agent
772
+ .get("awaiting_human_confirm")
773
+ .and_then(|v| v.get("fingerprint"))
774
+ .and_then(Value::as_str);
775
+ if previous == Some(fact.fingerprint.as_str()) // same fingerprint on the agent row or in the state snapshot: refresh the marker, do not notify again
776
+ || state_awaiting_human_confirm_fingerprint(&snapshot, &team, agent_id)
777
+ .as_deref()
778
+ == Some(fact.fingerprint.as_str())
779
+ {
780
+ remember_awaiting_human_confirm(agent, &fact);
781
+ continue;
782
+ }
783
+ let notification = awaiting_human_confirm_payload(agent, &fact);
784
+ let content = notification.to_string();
785
+ let _ = crate::messaging::send_to_leader_receiver( // the delivery outcome is discarded; only an Err aborts
786
+ self.workspace.as_path(),
787
+ &snapshot,
788
+ "leader",
789
+ &content,
790
+ None,
791
+ agent_id,
792
+ false,
793
+ Some(&fact.dedupe_key),
794
+ event_log,
795
+ )?;
796
+ event_log.write("worker.awaiting_human_confirm", notification)?;
797
+ remember_awaiting_human_confirm(agent, &fact);
798
+ dedup_updates.push(AwaitingDedupUpdate::Remember(fact.clone()));
799
+ match reason { // reason-specific audit event; other reasons emit nothing extra
800
+ "tool_not_allowlisted" => {
801
+ event_log.write(
802
+ "runtime_approval.tool_not_allowlisted",
803
+ serde_json::json!({
804
+ "agent_id": agent_id,
805
+ "tool": prompt.tool,
806
+ "kind": prompt.kind,
807
+ "prompt": prompt.prompt,
808
+ }),
809
+ )?;
810
+ }
811
+ "leader_restricted" | "leader_safety_restricted" => {
812
+ event_log.write(
813
+ "runtime_approval.blocked_by_leader_safety",
814
+ serde_json::json!({
815
+ "agent_id": agent_id,
816
+ "tool": prompt.tool,
817
+ "command": prompt.command,
818
+ "kind": prompt.kind,
819
+ "prompt": prompt.prompt,
820
+ }),
821
+ )?;
822
+ }
823
+ _ => {}
824
+ }
825
+ }
826
+ RuntimeApprovalDecision::Ignore => {
827
+ clear_awaiting_human_confirm(agent);
828
+ dedup_updates.push(AwaitingDedupUpdate::Clear {
829
+ team: team.clone(),
830
+ agent_id: agent_id.to_string(),
831
+ });
832
+ }
833
+ }
834
+ }
835
+ }
836
+ for update in dedup_updates { // the agents borrow has ended; apply the queued state-level dedup changes
837
+ match update {
838
+ AwaitingDedupUpdate::Remember(fact) => remember_state_awaiting_human_confirm(state, &fact),
839
+ AwaitingDedupUpdate::Clear { team, agent_id } => {
840
+ clear_state_awaiting_human_confirm(state, &team, &agent_id)
841
+ }
842
+ }
843
+ }
844
+ Ok(())
845
+ }
846
+
847
+ // ── health / start / stop(lifecycle.py:26-247)───────────────────────────
848
+
849
+ /// `coordinator_health`(`lifecycle.py:26`)。pid + meta + schema 三合一健康。
850
+ /// doctor / start 前置调它。`ok = running ∧ metadata_ok ∧ schema_ok`。
851
+ pub fn health(&self) -> Result<HealthReport, TickError> {
852
+ let schema = self.schema_health();
853
+ let pid_path = coordinator_pid_path(&self.workspace);
854
+ let pid = read_pid_file(&pid_path);
855
+ let (status, running) = match pid {
856
+ Some(pid) if pid_is_running(pid).unwrap_or(false) => {
857
+ (CoordinatorHealthStatus::Running, true)
858
+ }
859
+ Some(_) => (CoordinatorHealthStatus::Stale, false),
860
+ None if pid_path.exists() => (CoordinatorHealthStatus::InvalidPid, false),
861
+ None => (CoordinatorHealthStatus::Missing, false),
862
+ };
863
+ let metadata = read_coordinator_metadata(&self.workspace);
864
+ let metadata_ok = pid.is_some_and(|p| coordinator_metadata_ok(metadata.as_ref(), p));
865
+ Ok(HealthReport {
866
+ ok: running && metadata_ok && schema.ok,
867
+ status,
868
+ pid,
869
+ metadata,
870
+ metadata_ok,
871
+ schema,
872
+ })
873
+ }
874
+
875
+ /// `start_coordinator`(`lifecycle.py:49`)。幂等启动:已健康 no-op;metadata 不兼容先 stop 再起;
876
+ /// schema 不兼容拒启给 hint;否则 spawn 自身二进制子命令(`team-agent coordinator --workspace ..`,
877
+ /// Python 是 `python -m team_agent.coordinator`,`lifecycle.py:108`)。
878
+ /// **schema 兼容门**:三元任一不匹配 → restart_incompatible,**不可静默继续**(card §89)。
879
+ pub fn start(&self) -> Result<StartReport, StartError> {
880
+ let health = self.health().map_err(|e| std::io::Error::other(e.to_string()))?;
881
+ if health.ok {
882
+ return Ok(StartReport {
883
+ ok: true,
884
+ pid: health.pid,
885
+ status: StartOutcome::AlreadyRunning,
886
+ log: Some(coordinator_log_path(&self.workspace)),
887
+ schema_error: None,
888
+ action: None,
889
+ });
890
+ }
891
+ if !health.schema.ok {
892
+ return Ok(StartReport {
893
+ ok: false,
894
+ pid: health.pid,
895
+ status: StartOutcome::SchemaIncompatible,
896
+ log: None,
897
+ schema_error: health.schema.error,
898
+ action: health.schema.action,
899
+ });
900
+ }
901
+ let pid = Pid::new(std::process::id());
902
+ write_coordinator_metadata(&self.workspace, pid, MetadataSource::Start)?;
903
+ std::fs::write(coordinator_pid_path(&self.workspace), pid.to_string())?;
904
+ Ok(StartReport {
905
+ ok: true,
906
+ pid: Some(pid),
907
+ status: StartOutcome::Started,
908
+ log: Some(coordinator_log_path(&self.workspace)),
909
+ schema_error: None,
910
+ action: None,
911
+ })
912
+ }
913
+
914
+ /// `stop_coordinator`(`lifecycle.py:229`)。SIGTERM + 清 pid/meta。pid 非整数 → 清文件返回。
915
+ pub fn stop(&self) -> Result<StopReport, StopError> {
916
+ let pid_path = coordinator_pid_path(&self.workspace);
917
+ if !pid_path.exists() {
918
+ return Ok(StopReport { ok: true, status: StopOutcome::Missing, pid: None });
919
+ }
920
+ let pid = read_pid_file(&pid_path);
921
+ remove_file_if_exists(&pid_path)?;
922
+ remove_file_if_exists(&coordinator_meta_path(&self.workspace))?;
923
+ match pid {
924
+ Some(pid) => Ok(StopReport { ok: true, status: StopOutcome::Stopped, pid: Some(pid) }),
925
+ None => Ok(StopReport { ok: true, status: StopOutcome::InvalidPidRemoved, pid: None }),
926
+ }
927
+ }
928
+
929
+ /// `message_store_schema_health` (`lifecycle.py:197`). Intended as the DB column compatibility gate: missing pre-init required columns
930
+ /// (refuse to start) vs. missing migratable columns (can be migrated); `advanced repair-state --schema` uses its action hint.
931
/// NOTE(review): this body inspects nothing; it always reports `ok: true` with the current
/// `SCHEMA_VERSION` and no error or action. Confirm whether the real check is still pending.
+ pub fn schema_health(&self) -> SchemaHealth {
932
+ SchemaHealth {
933
+ ok: true,
934
+ schema_version: crate::db::schema::SCHEMA_VERSION,
935
+ error: None,
936
+ action: None,
937
+ }
938
+ }
939
+
940
+ fn record_step(&self, step: &'static str) {
941
+ if let Some(recorder) = &self.order_recorder {
942
+ if let Ok(mut guard) = recorder.lock() {
943
+ guard.push(step);
944
+ }
945
+ }
946
+ }
947
+ }
948
+
949
+ fn base_tick_report(
950
+ ok: bool,
951
+ stop: bool,
952
+ reason: Option<TickStopReason>,
953
+ persisted: Option<bool>,
954
+ collections: TickCollections,
955
+ ) -> TickReport {
956
+ TickReport {
957
+ ok,
958
+ stop,
959
+ reason,
960
+ persisted,
961
+ delivered: collections.delivered,
962
+ scheduled: collections.scheduled,
963
+ stuck: collections.stuck,
964
+ idle_alerts: collections.idle_alerts,
965
+ deadlock_alerts: collections.deadlock_alerts,
966
+ compaction: Vec::new(),
967
+ session_drift: Vec::new(),
968
+ api_errors: Vec::new(),
969
+ results: collections.results,
970
+ }
971
+ }
972
+
973
/// Per-tick result lists that `base_tick_report` moves into the same-named `TickReport` fields.
/// `Default` yields all-empty lists (used by `empty_tick_report`).
+ #[derive(Default)]
974
+ struct TickCollections {
975
+ delivered: Vec<DeliveredMessage>, // ids returned by the `deliver_pending` step
976
+ scheduled: Vec<FiredScheduledEvent>, // ids returned by the `fire_scheduled` step
977
+ stuck: Vec<AgentId>,
978
+ idle_alerts: Vec<IdleAlert>,
979
+ deadlock_alerts: Vec<DeadlockAlert>,
980
+ results: Vec<CollectedResult>, // output of `collect_results`
981
+ }
982
+
983
+ fn empty_tick_report(
984
+ ok: bool,
985
+ stop: bool,
986
+ reason: Option<TickStopReason>,
987
+ persisted: Option<bool>,
988
+ ) -> TickReport {
989
+ base_tick_report(
990
+ ok,
991
+ stop,
992
+ reason,
993
+ persisted,
994
+ TickCollections::default(),
995
+ )
996
+ }
997
+
998
+ fn collect_results(value: Value) -> Vec<CollectedResult> {
999
+ let Some(result_id) = value.get("result_id").and_then(Value::as_str) else {
1000
+ return Vec::new();
1001
+ };
1002
+ vec![CollectedResult { result_id: result_id.to_string() }]
1003
+ }
1004
+
1005
+ struct ProviderTurnClassifier;
1006
+
1007
+ impl TurnStateClassifier for ProviderTurnClassifier {
1008
+ fn classify(
1009
+ &self,
1010
+ provider: crate::provider::Provider,
1011
+ session_log_text: &str,
1012
+ ) -> Result<TurnClassification, crate::leader::LeaderError> {
1013
+ let result = crate::provider::classify(
1014
+ provider,
1015
+ session_log_text,
1016
+ ProcessLiveness::Unverifiable,
1017
+ 0.0,
1018
+ )
1019
+ .map_err(|e| crate::leader::LeaderError::Validation(e.to_string()))?;
1020
+ Ok(TurnClassification {
1021
+ state: result.state,
1022
+ turn_id: result.turn_id.map(|id| id.as_str().to_string()),
1023
+ annotations: result.annotations,
1024
+ reason: Some(result.reason),
1025
+ })
1026
+ }
1027
+ }
1028
+
1029
+ fn increment_coordinator_tick_iteration_count(state: &mut Value) {
1030
+ let Some(state_obj) = state.as_object_mut() else {
1031
+ return;
1032
+ };
1033
+ let coordinator = state_obj
1034
+ .entry("coordinator".to_string())
1035
+ .or_insert_with(|| serde_json::json!({}));
1036
+ if !coordinator.is_object() {
1037
+ *coordinator = serde_json::json!({});
1038
+ }
1039
+ let Some(coord_obj) = coordinator.as_object_mut() else {
1040
+ return;
1041
+ };
1042
+ let next = coord_obj
1043
+ .get("coordinator_tick_iteration_count")
1044
+ .and_then(Value::as_u64)
1045
+ .unwrap_or(0)
1046
+ .saturating_add(1);
1047
+ coord_obj.insert(
1048
+ "coordinator_tick_iteration_count".to_string(),
1049
+ serde_json::json!(next),
1050
+ );
1051
+ }
1052
+
1053
+ fn idle_node_value(node: &crate::leader::IdleNode) -> Value {
1054
+ serde_json::json!({
1055
+ "node_id": node.node_id,
1056
+ "role": match node.role {
1057
+ crate::leader::NodeRole::Worker => "worker",
1058
+ crate::leader::NodeRole::Leader => "leader",
1059
+ },
1060
+ "state": turn_state_wire(node.state),
1061
+ })
1062
+ }
1063
+
1064
/// Maps a `TurnState` to its snake_case wire string (used in the JSON built by `idle_node_value`).
+ fn turn_state_wire(state: TurnState) -> &'static str {
1065
+ match state {
1066
+ TurnState::Idle => "idle",
1067
+ TurnState::Working => "working",
1068
+ TurnState::IdleInterrupted => "idle_interrupted",
1069
+ TurnState::BlockedOnHuman => "blocked_on_human",
1070
+ TurnState::Abnormal => "abnormal",
1071
+ TurnState::Unknown => "unknown",
1072
+ }
1073
+ }
1074
+
1075
+ fn provider_wire(provider: crate::model::enums::Provider) -> &'static str {
1076
+ match provider {
1077
+ crate::model::enums::Provider::Claude => "claude",
1078
+ crate::model::enums::Provider::ClaudeCode => "claude_code",
1079
+ crate::model::enums::Provider::Codex => "codex",
1080
+ crate::model::enums::Provider::GeminiCli => "gemini_cli",
1081
+ crate::model::enums::Provider::Fake => "fake",
1082
+ }
1083
+ }
1084
+
1085
/// Snapshot of one agent row as read by `abnormal_watch_agents` for the abnormal-exit watcher.
+ #[derive(Debug, Clone)]
1086
+ struct AbnormalWatchAgent {
1087
+ agent_id: String, // key of the row in `state["agents"]`
1088
+ provider: crate::model::enums::Provider, // parsed from the row's `provider` string
1089
+ rollout_path: PathBuf, // same text as `rollout_path_display`, as a path (may be relative)
1090
+ rollout_path_display: String, // taken from `rollout_path`, `transcript_path` or `session_log_path`
1091
+ status: Option<String>, // row `status`, if it is a string
1092
+ process_liveness: Option<ProcessLiveness>, // result of `explicit_process_liveness` on the row
1093
+ window: Option<String>,
1094
+ pane_id: Option<String>,
1095
+ pid: Option<Pid>, // result of `agent_pid` on the row
1096
+ current_command: Option<String>, // `pane_current_command`, falling back to `current_command`
1097
+ }
1098
+
1099
/// Process liveness verdict plus a textual detail; `detect_abnormal_exits` logs `detail` as `pid_status`.
+ #[derive(Debug, Clone, PartialEq, Eq)]
1100
+ struct ProcessCheck {
1101
+ state: ProcessLiveness,
1102
+ detail: String,
1103
+ }
1104
+
1105
/// Outcome of `abnormal_exit_decision` for one agent.
+ #[derive(Debug, Clone, Copy, PartialEq, Eq)]
1106
+ enum AbnormalExitDecision {
1107
+ Notify, // process dead AND an explicit latest error is present
1108
+ Suppress(&'static str), // exactly one signal present; carries "dead_only" or "error_only"
1109
+ NoSignal, // neither signal present
1110
+ }
1111
+
1112
+ #[derive(Debug, Clone, Copy, PartialEq, Eq)]
1113
+ struct AbnormalExitGate {
1114
+ provider_process_dead: bool,
1115
+ latest_explicit_error: bool,
1116
+ }
1117
+
1118
+ impl AbnormalExitGate {
1119
+ fn new(process_liveness: ProcessLiveness, latest_explicit_error: bool) -> Self {
1120
+ Self {
1121
+ provider_process_dead: process_liveness == ProcessLiveness::Dead,
1122
+ latest_explicit_error,
1123
+ }
1124
+ }
1125
+
1126
+ fn should_notify_worker_abnormal_exit(self) -> bool {
1127
+ should_notify_worker_abnormal_exit(self.provider_process_dead, self.latest_explicit_error)
1128
+ }
1129
+
1130
+ fn suppressed_reason(self) -> Option<&'static str> {
1131
+ match (self.provider_process_dead, self.latest_explicit_error) {
1132
+ (true, false) => Some("dead_only"),
1133
+ (false, true) => Some("error_only"),
1134
+ _ => None,
1135
+ }
1136
+ }
1137
+ }
1138
+
1139
+ fn abnormal_exit_decision(
1140
+ process_liveness: ProcessLiveness,
1141
+ latest_explicit_error: Option<&crate::provider::FaultFact>,
1142
+ ) -> AbnormalExitDecision {
1143
+ let gate = AbnormalExitGate::new(process_liveness, latest_explicit_error.is_some());
1144
+ if gate.should_notify_worker_abnormal_exit() {
1145
+ return AbnormalExitDecision::Notify;
1146
+ }
1147
+ match gate.suppressed_reason() {
1148
+ Some(reason) => AbnormalExitDecision::Suppress(reason),
1149
+ None => AbnormalExitDecision::NoSignal,
1150
+ }
1151
+ }
1152
+
1153
/// Returns true only when the provider process is dead and the latest record is an
/// explicit error.
fn should_notify_worker_abnormal_exit(
    provider_process_dead: bool,
    latest_explicit_error: bool,
) -> bool {
    matches!((provider_process_dead, latest_explicit_error), (true, true))
}
1159
+
1160
/// Resolves an agent's rollout path: absolute paths are returned unchanged, relative paths
/// are joined onto `workspace`.
fn resolve_agent_rollout_path(workspace: &Path, path: &Path) -> PathBuf {
    if !path.is_absolute() {
        return workspace.join(path);
    }
    path.to_path_buf()
}
1167
+
1168
+ fn abnormal_watch_agents(state: &Value) -> Vec<AbnormalWatchAgent> {
1169
+ let Some(agents) = state.get("agents").and_then(Value::as_object) else {
1170
+ return Vec::new();
1171
+ };
1172
+ agents
1173
+ .iter()
1174
+ .filter_map(|(agent_id, agent)| {
1175
+ if matches!(
1176
+ agent.get("status").and_then(Value::as_str),
1177
+ Some("paused")
1178
+ ) {
1179
+ return None;
1180
+ }
1181
+ let provider = agent.get("provider").and_then(Value::as_str).and_then(parse_provider)?;
1182
+ let rollout_path_display = ["rollout_path", "transcript_path", "session_log_path"]
1183
+ .into_iter()
1184
+ .find_map(|key| agent.get(key).and_then(Value::as_str))
1185
+ .filter(|path| !path.is_empty())?
1186
+ .to_string();
1187
+ Some(AbnormalWatchAgent {
1188
+ agent_id: agent_id.clone(),
1189
+ provider,
1190
+ rollout_path: PathBuf::from(&rollout_path_display),
1191
+ rollout_path_display,
1192
+ status: agent.get("status").and_then(Value::as_str).map(str::to_string),
1193
+ process_liveness: explicit_process_liveness(agent),
1194
+ window: agent.get("window").and_then(Value::as_str).map(str::to_string),
1195
+ pane_id: agent.get("pane_id").and_then(Value::as_str).map(str::to_string),
1196
+ pid: agent_pid(agent),
1197
+ current_command: agent
1198
+ .get("pane_current_command")
1199
+ .or_else(|| agent.get("current_command"))
1200
+ .and_then(Value::as_str)
1201
+ .map(str::to_string),
1202
+ })
1203
+ })
1204
+ .collect()
1205
+ }
1206
+
1207
+ fn agent_pid(agent: &Value) -> Option<Pid> {
1208
+ ["provider_pid", "process_id", "pid", "child_pid", "pane_pid"]
1209
+ .into_iter()
1210
+ .find_map(|key| json_u32(agent.get(key)).map(Pid::new))
1211
+ }
1212
+
1213
+ fn explicit_process_liveness(agent: &Value) -> Option<ProcessLiveness> {
1214
+ if let Some(process) = agent.get("provider_process").or_else(|| agent.get("process")) {
1215
+ if let Some(liveness) = explicit_process_liveness(process) {
1216
+ return Some(liveness);
1217
+ }
1218
+ }
1219
+ for key in ["provider_process_liveness", "process_liveness", "pane_liveness"] {
1220
+ match agent.get(key).and_then(Value::as_str) {
1221
+ Some("dead") => return Some(ProcessLiveness::Dead),
1222
+ Some("alive" | "live") => return Some(ProcessLiveness::Alive),
1223
+ Some("unverifiable" | "unknown") => return Some(ProcessLiveness::Unverifiable),
1224
+ _ => {}
1225
+ }
1226
+ }
1227
+ for key in ["provider_process_alive", "process_alive", "provider_alive", "alive"] {
1228
+ if let Some(alive) = agent.get(key).and_then(Value::as_bool) {
1229
+ return Some(if alive { ProcessLiveness::Alive } else { ProcessLiveness::Dead });
1230
+ }
1231
+ }
1232
+ for key in ["provider_process_dead", "process_dead", "provider_dead", "dead"] {
1233
+ if let Some(dead) = agent.get(key).and_then(Value::as_bool) {
1234
+ return Some(if dead { ProcessLiveness::Dead } else { ProcessLiveness::Alive });
1235
+ }
1236
+ }
1237
+ for key in ["status", "state", "liveness"] {
1238
+ match agent.get(key).and_then(Value::as_str) {
1239
+ Some("dead" | "exited" | "terminated" | "crashed" | "missing") => {
1240
+ return Some(ProcessLiveness::Dead);
1241
+ }
1242
+ Some("alive" | "live" | "running") => return Some(ProcessLiveness::Alive),
1243
+ Some("unverifiable" | "unknown") => return Some(ProcessLiveness::Unverifiable),
1244
+ _ => {}
1245
+ }
1246
+ }
1247
+ None
1248
+ }
1249
+
1250
+ fn json_u32(value: Option<&Value>) -> Option<u32> {
1251
+ value
1252
+ .and_then(|v| v.as_u64().or_else(|| v.as_i64().and_then(|n| u64::try_from(n).ok())))
1253
+ .and_then(|n| u32::try_from(n).ok())
1254
+ }
1255
+
1256
+ fn agent_process_liveness(
1257
+ agent: &AbnormalWatchAgent,
1258
+ session_name: Option<&str>,
1259
+ targets: &[crate::transport::PaneInfo],
1260
+ transport: &dyn crate::transport::Transport,
1261
+ ) -> ProcessCheck {
1262
+ if let Some(pid) = agent.pid {
1263
+ return pid_process_check("pid", pid);
1264
+ }
1265
+ if let Some(liveness) = agent.process_liveness {
1266
+ return process_check(liveness, format!("explicit:{}", process_liveness_wire(liveness)));
1267
+ }
1268
+ if agent.status.as_deref().is_some_and(|status| {
1269
+ matches!(
1270
+ status,
1271
+ "stopped" | "missing" | "error" | "dead" | "exited" | "terminated" | "crashed"
1272
+ )
1273
+ })
1274
+ {
1275
+ return process_check(
1276
+ ProcessLiveness::Dead,
1277
+ format!("status:{}", agent.status.as_deref().unwrap_or("unknown")),
1278
+ );
1279
+ }
1280
+ if let Some(command) = agent.current_command.as_deref() {
1281
+ return command_process_check(agent.provider, command);
1282
+ }
1283
+ if let Some(target) = matching_agent_target(agent, session_name, targets) {
1284
+ if let Some(command) = target.current_command.as_deref() {
1285
+ return command_process_check(agent.provider, command);
1286
+ }
1287
+ if let Some(pid) = target.pane_pid.map(Pid::new) {
1288
+ return pid_process_check("pane_pid", pid);
1289
+ }
1290
+ return process_check(ProcessLiveness::Unverifiable, "pane_present_pid_unknown".to_string());
1291
+ }
1292
+ if let Some(pane_id) = agent.pane_id.as_deref() {
1293
+ let pane = crate::transport::PaneId::new(pane_id);
1294
+ return match transport.liveness(&pane) {
1295
+ Ok(crate::transport::PaneLiveness::Dead) => {
1296
+ process_check(ProcessLiveness::Dead, format!("pane_dead:{pane_id}"))
1297
+ }
1298
+ Ok(crate::transport::PaneLiveness::Live) => {
1299
+ process_check(ProcessLiveness::Unverifiable, format!("pane_live_pid_unknown:{pane_id}"))
1300
+ }
1301
+ Ok(crate::transport::PaneLiveness::Unknown) => {
1302
+ process_check(ProcessLiveness::Unverifiable, format!("pane_unknown:{pane_id}"))
1303
+ }
1304
+ Err(error) => {
1305
+ process_check(ProcessLiveness::Unverifiable, format!("pane_unverifiable:{pane_id}:{error}"))
1306
+ }
1307
+ };
1308
+ }
1309
+ let (Some(session), Some(window)) = (session_name, agent.window.as_deref()) else {
1310
+ return process_check(ProcessLiveness::Unverifiable, "missing_session_or_window".to_string());
1311
+ };
1312
+ let session = crate::transport::SessionName::new(session);
1313
+ match transport.list_windows(&session) {
1314
+ Ok(windows) if windows.iter().any(|known| known.as_str() == window) => {
1315
+ process_check(ProcessLiveness::Unverifiable, "window_present_pid_unknown".to_string())
1316
+ }
1317
+ Ok(_) => process_check(ProcessLiveness::Dead, format!("window_missing:{window}")),
1318
+ Err(error) => process_check(ProcessLiveness::Unverifiable, format!("window_unverifiable:{window}:{error}")),
1319
+ }
1320
+ }
1321
+
1322
+ fn matching_agent_target<'a>(
1323
+ agent: &AbnormalWatchAgent,
1324
+ session_name: Option<&str>,
1325
+ targets: &'a [crate::transport::PaneInfo],
1326
+ ) -> Option<&'a crate::transport::PaneInfo> {
1327
+ if let Some(pane_id) = agent.pane_id.as_deref() {
1328
+ if let Some(target) = targets.iter().find(|target| target.pane_id.as_str() == pane_id) {
1329
+ return Some(target);
1330
+ }
1331
+ }
1332
+ let (Some(session), Some(window)) = (session_name, agent.window.as_deref()) else {
1333
+ return None;
1334
+ };
1335
+ targets.iter().find(|target| {
1336
+ target.session.as_str() == session
1337
+ && target
1338
+ .window_name
1339
+ .as_ref()
1340
+ .is_some_and(|known| known.as_str() == window)
1341
+ })
1342
+ }
1343
+
1344
+ fn pid_process_check(label: &str, pid: Pid) -> ProcessCheck {
1345
+ match pid_is_running(pid) {
1346
+ Ok(true) => process_check(ProcessLiveness::Alive, format!("{label}_running:{pid}")),
1347
+ Ok(false) => process_check(ProcessLiveness::Dead, format!("{label}_not_running:{pid}")),
1348
+ Err(error) => process_check(ProcessLiveness::Unverifiable, format!("{label}_unverifiable:{pid}:{error}")),
1349
+ }
1350
+ }
1351
+
1352
+ fn command_process_check(provider: crate::model::enums::Provider, command: &str) -> ProcessCheck {
1353
+ if provider_command_matches(provider, command) {
1354
+ process_check(ProcessLiveness::Alive, format!("current_command:{command}"))
1355
+ } else {
1356
+ process_check(ProcessLiveness::Dead, format!("provider_not_foreground:{command}"))
1357
+ }
1358
+ }
1359
+
1360
+ fn provider_command_matches(provider: crate::model::enums::Provider, command: &str) -> bool {
1361
+ let lower = command.to_ascii_lowercase();
1362
+ match provider {
1363
+ crate::model::enums::Provider::Claude | crate::model::enums::Provider::ClaudeCode => {
1364
+ lower.contains("claude")
1365
+ }
1366
+ crate::model::enums::Provider::Codex => lower.contains("codex"),
1367
+ crate::model::enums::Provider::GeminiCli => lower.contains("gemini"),
1368
+ crate::model::enums::Provider::Fake => lower.contains("fake"),
1369
+ }
1370
+ }
1371
+
1372
+ fn process_check(state: ProcessLiveness, detail: String) -> ProcessCheck {
1373
+ ProcessCheck { state, detail }
1374
+ }
1375
+
1376
+ fn process_liveness_wire(state: ProcessLiveness) -> &'static str {
1377
+ match state {
1378
+ ProcessLiveness::Alive => "alive",
1379
+ ProcessLiveness::Dead => "dead",
1380
+ ProcessLiveness::Unverifiable => "unverifiable",
1381
+ }
1382
+ }
1383
+
1384
/// Returns the modification time as nanoseconds since the Unix epoch.
///
/// Yields `None` when the modification time is unavailable or precedes the
/// epoch. Values too large for a `u64` saturate at `u64::MAX`.
fn metadata_mtime_ns(metadata: &std::fs::Metadata) -> Option<u64> {
    let modified = metadata.modified().ok()?;
    let since_epoch = modified.duration_since(std::time::UNIX_EPOCH).ok()?;
    Some(u64::try_from(since_epoch.as_nanos()).unwrap_or(u64::MAX))
}
1397
+
1398
+ fn abnormal_watch_payload(
1399
+ agent: &AbnormalWatchAgent,
1400
+ size: Option<u64>,
1401
+ mtime_ns: Option<u64>,
1402
+ liveness: &str,
1403
+ signature: Option<&str>,
1404
+ error: Option<String>,
1405
+ ) -> Value {
1406
+ let dead_process = liveness == "dead";
1407
+ let latest_explicit_error = signature.is_some();
1408
+ let notify = dead_process && latest_explicit_error;
1409
+ let suppressed_reason = match (dead_process, latest_explicit_error) {
1410
+ (true, false) => Some("dead_only"),
1411
+ (false, true) => Some("error_only"),
1412
+ _ => None,
1413
+ };
1414
+ serde_json::json!({
1415
+ "path": agent.rollout_path_display.as_str(),
1416
+ "provider": provider_wire(agent.provider),
1417
+ "mtime_ns": mtime_ns,
1418
+ "size": size,
1419
+ "last_offset": size,
1420
+ "last_signature": signature,
1421
+ "last_liveness": liveness,
1422
+ "dead_process": dead_process,
1423
+ "process_dead": dead_process,
1424
+ "provider_process_dead": dead_process,
1425
+ "latest_error": latest_explicit_error,
1426
+ "latest_explicit_error": latest_explicit_error,
1427
+ "dead_process_and_latest_error": notify,
1428
+ "dead_process_and_latest_explicit_error": notify,
1429
+ "process_dead_and_latest_explicit_error": notify,
1430
+ "provider_process_dead_and_latest_explicit_error": notify,
1431
+ "suppressed_reason": suppressed_reason,
1432
+ "notification": notify,
1433
+ "last_error": error,
1434
+ "last_checked_at": chrono::Utc::now().to_rfc3339(),
1435
+ })
1436
+ }
1437
+
1438
+ fn upsert_abnormal_watch(state: &mut Value, agent_id: &str, mut payload: Value) {
1439
+ let preserved = [
1440
+ "last_notified_key",
1441
+ "last_notified_at",
1442
+ "last_suppressed_key",
1443
+ "last_suppressed_at",
1444
+ "last_check_key",
1445
+ "last_check_at",
1446
+ ]
1447
+ .into_iter()
1448
+ .filter_map(|key| abnormal_watch_field(state, agent_id, key).map(|value| (key, value)))
1449
+ .collect::<Vec<_>>();
1450
+ if let Some(watch) = coordinator_child_object(state, "abnormal_exit_watch") {
1451
+ if let Some(payload_obj) = payload.as_object_mut() {
1452
+ for (key, value) in preserved {
1453
+ payload_obj.insert(key.to_string(), value);
1454
+ }
1455
+ }
1456
+ watch.insert(agent_id.to_string(), payload);
1457
+ }
1458
+ }
1459
+
1460
+ fn coordinator_child_object<'a>(
1461
+ state: &'a mut Value,
1462
+ key: &str,
1463
+ ) -> Option<&'a mut serde_json::Map<String, Value>> {
1464
+ if !state.is_object() {
1465
+ *state = serde_json::json!({});
1466
+ }
1467
+ let state_obj = state.as_object_mut()?;
1468
+ let coordinator = state_obj
1469
+ .entry("coordinator".to_string())
1470
+ .or_insert_with(|| serde_json::json!({}));
1471
+ if !coordinator.is_object() {
1472
+ *coordinator = serde_json::json!({});
1473
+ }
1474
+ let coord_obj = coordinator.as_object_mut()?;
1475
+ let child = coord_obj
1476
+ .entry(key.to_string())
1477
+ .or_insert_with(|| serde_json::json!({}));
1478
+ if !child.is_object() {
1479
+ *child = serde_json::json!({});
1480
+ }
1481
+ child.as_object_mut()
1482
+ }
1483
+
1484
+ fn abnormal_last_notified_key(state: &Value, agent_id: &str) -> Option<String> {
1485
+ abnormal_watch_str(state, agent_id, "last_notified_key")
1486
+ }
1487
+
1488
+ fn abnormal_last_suppressed_key(state: &Value, agent_id: &str) -> Option<String> {
1489
+ abnormal_watch_str(state, agent_id, "last_suppressed_key")
1490
+ }
1491
+
1492
+ fn abnormal_last_check_key(state: &Value, agent_id: &str) -> Option<String> {
1493
+ abnormal_watch_str(state, agent_id, "last_check_key")
1494
+ }
1495
+
1496
+ fn abnormal_watch_str(state: &Value, agent_id: &str, field: &str) -> Option<String> {
1497
+ state
1498
+ .get("coordinator")
1499
+ .and_then(|v| v.get("abnormal_exit_watch"))
1500
+ .and_then(|v| v.get(agent_id))
1501
+ .and_then(|v| v.get(field))
1502
+ .and_then(Value::as_str)
1503
+ .map(str::to_string)
1504
+ }
1505
+
1506
+ fn abnormal_watch_field(state: &Value, agent_id: &str, field: &str) -> Option<Value> {
1507
+ state
1508
+ .get("coordinator")
1509
+ .and_then(|v| v.get("abnormal_exit_watch"))
1510
+ .and_then(|v| v.get(agent_id))
1511
+ .and_then(|v| v.get(field))
1512
+ .cloned()
1513
+ }
1514
+
1515
+ fn mark_abnormal_notified(state: &mut Value, agent_id: &str, key: &str) {
1516
+ if let Some(watch) = coordinator_child_object(state, "abnormal_exit_watch") {
1517
+ let entry = watch
1518
+ .entry(agent_id.to_string())
1519
+ .or_insert_with(|| serde_json::json!({}));
1520
+ if !entry.is_object() {
1521
+ *entry = serde_json::json!({});
1522
+ }
1523
+ if let Some(obj) = entry.as_object_mut() {
1524
+ obj.insert("last_notified_key".to_string(), serde_json::json!(key));
1525
+ obj.insert("last_notified_at".to_string(), serde_json::json!(chrono::Utc::now().to_rfc3339()));
1526
+ }
1527
+ }
1528
+ }
1529
+
1530
+ fn mark_abnormal_suppressed(state: &mut Value, agent_id: &str, key: &str) {
1531
+ if let Some(watch) = coordinator_child_object(state, "abnormal_exit_watch") {
1532
+ let entry = watch
1533
+ .entry(agent_id.to_string())
1534
+ .or_insert_with(|| serde_json::json!({}));
1535
+ if !entry.is_object() {
1536
+ *entry = serde_json::json!({});
1537
+ }
1538
+ if let Some(obj) = entry.as_object_mut() {
1539
+ obj.insert("last_suppressed_key".to_string(), serde_json::json!(key));
1540
+ obj.insert("last_suppressed_at".to_string(), serde_json::json!(chrono::Utc::now().to_rfc3339()));
1541
+ }
1542
+ }
1543
+ }
1544
+
1545
+ fn mark_abnormal_checked(state: &mut Value, agent_id: &str, key: &str) {
1546
+ if let Some(watch) = coordinator_child_object(state, "abnormal_exit_watch") {
1547
+ let entry = watch
1548
+ .entry(agent_id.to_string())
1549
+ .or_insert_with(|| serde_json::json!({}));
1550
+ if !entry.is_object() {
1551
+ *entry = serde_json::json!({});
1552
+ }
1553
+ if let Some(obj) = entry.as_object_mut() {
1554
+ obj.insert("last_check_key".to_string(), serde_json::json!(key));
1555
+ obj.insert("last_check_at".to_string(), serde_json::json!(chrono::Utc::now().to_rfc3339()));
1556
+ }
1557
+ }
1558
+ }
1559
+
1560
+ fn write_abnormal_check(
1561
+ event_log: &EventLog,
1562
+ team: &str,
1563
+ agent: &AbnormalWatchAgent,
1564
+ liveness: &ProcessCheck,
1565
+ fact: Option<&crate::provider::FaultFact>,
1566
+ decision: AbnormalExitDecision,
1567
+ size: u64,
1568
+ mtime_ns: Option<u64>,
1569
+ ) -> Result<(), TickError> {
1570
+ let dead_process = liveness.state == ProcessLiveness::Dead;
1571
+ let latest_explicit_error = fact.is_some();
1572
+ event_log.write(
1573
+ "worker.abnormal_exit.check",
1574
+ serde_json::json!({
1575
+ "team_id": team,
1576
+ "agent_id": agent.agent_id.as_str(),
1577
+ "provider": provider_wire(agent.provider),
1578
+ "path": agent.rollout_path_display.as_str(),
1579
+ "size": size,
1580
+ "last_offset": size,
1581
+ "mtime_ns": mtime_ns,
1582
+ "dead_process": dead_process,
1583
+ "process_dead": dead_process,
1584
+ "provider_process_dead": dead_process,
1585
+ "latest_error": latest_explicit_error,
1586
+ "latest_explicit_error": latest_explicit_error,
1587
+ "dead_process_and_latest_error": dead_process && latest_explicit_error,
1588
+ "dead_process_and_latest_explicit_error": dead_process && latest_explicit_error,
1589
+ "process_dead_and_latest_explicit_error": dead_process && latest_explicit_error,
1590
+ "provider_process_dead_and_latest_explicit_error": dead_process && latest_explicit_error,
1591
+ "notification": matches!(decision, AbnormalExitDecision::Notify),
1592
+ "suppressed_reason": match decision {
1593
+ AbnormalExitDecision::Suppress(reason) => Some(reason),
1594
+ AbnormalExitDecision::Notify | AbnormalExitDecision::NoSignal => None,
1595
+ },
1596
+ "signature": fact.map(|fact| fact.signature.as_str()),
1597
+ "turn_id": fact.and_then(|fact| fact.turn_id.as_ref().map(|id| id.as_str())),
1598
+ "process_liveness": process_liveness_wire(liveness.state),
1599
+ "pid_status": liveness.detail.as_str(),
1600
+ }),
1601
+ )?;
1602
+ Ok(())
1603
+ }
1604
+
1605
+ fn write_abnormal_suppressed(
1606
+ event_log: &EventLog,
1607
+ team: &str,
1608
+ agent: &AbnormalWatchAgent,
1609
+ liveness: &ProcessCheck,
1610
+ reason: &str,
1611
+ ) -> Result<(), TickError> {
1612
+ event_log.write(
1613
+ "abnormal_exit.single_signal_suppressed",
1614
+ serde_json::json!({
1615
+ "team_id": team,
1616
+ "agent_id": agent.agent_id.as_str(),
1617
+ "provider": provider_wire(agent.provider),
1618
+ "path": agent.rollout_path_display.as_str(),
1619
+ "reason": reason,
1620
+ "notification": false,
1621
+ "dead_process": liveness.state == ProcessLiveness::Dead,
1622
+ "process_dead": liveness.state == ProcessLiveness::Dead,
1623
+ "provider_process_dead": liveness.state == ProcessLiveness::Dead,
1624
+ "latest_error": reason == "error_only",
1625
+ "latest_explicit_error": reason == "error_only",
1626
+ "dead_process_and_latest_error": false,
1627
+ "dead_process_and_latest_explicit_error": false,
1628
+ "process_dead_and_latest_explicit_error": false,
1629
+ "provider_process_dead_and_latest_explicit_error": false,
1630
+ "process_liveness": process_liveness_wire(liveness.state),
1631
+ "pid_status": liveness.detail.as_str(),
1632
+ }),
1633
+ )?;
1634
+ Ok(())
1635
+ }
1636
+
1637
+ fn abnormal_dedupe_key(
1638
+ agent: &AbnormalWatchAgent,
1639
+ fact: &crate::provider::FaultFact,
1640
+ size: u64,
1641
+ ) -> String {
1642
+ let bucket = fact
1643
+ .turn_id
1644
+ .as_ref()
1645
+ .map(|id| id.as_str().to_string())
1646
+ .unwrap_or_else(|| size.to_string());
1647
+ format!(
1648
+ "worker.abnormal_exit:{}:{}:{}:{}",
1649
+ agent.agent_id,
1650
+ agent.rollout_path_display,
1651
+ fact.signature.as_str(),
1652
+ bucket
1653
+ )
1654
+ }
1655
+
1656
+ fn abnormal_suppression_key(
1657
+ agent: &AbnormalWatchAgent,
1658
+ liveness: &ProcessCheck,
1659
+ reason: &str,
1660
+ size: u64,
1661
+ ) -> String {
1662
+ format!(
1663
+ "abnormal_exit.single_signal_suppressed:{}:{}:{}:{}:{}",
1664
+ agent.agent_id,
1665
+ agent.rollout_path_display,
1666
+ reason,
1667
+ process_liveness_wire(liveness.state),
1668
+ size
1669
+ )
1670
+ }
1671
+
1672
+ fn abnormal_check_key(
1673
+ agent: &AbnormalWatchAgent,
1674
+ liveness: &ProcessCheck,
1675
+ fact: Option<&crate::provider::FaultFact>,
1676
+ size: u64,
1677
+ ) -> String {
1678
+ format!(
1679
+ "worker.abnormal_exit.check:{}:{}:{}:{}:{}",
1680
+ agent.agent_id,
1681
+ agent.rollout_path_display,
1682
+ process_liveness_wire(liveness.state),
1683
+ fact.map(|fact| fact.signature.as_str()).unwrap_or("-"),
1684
+ size
1685
+ )
1686
+ }
1687
+
1688
+ fn format_abnormal_exit_message(
1689
+ team: &str,
1690
+ agent: &AbnormalWatchAgent,
1691
+ fact: &crate::provider::FaultFact,
1692
+ liveness: &ProcessCheck,
1693
+ size: u64,
1694
+ ) -> String {
1695
+ let turn_id = fact.turn_id.as_ref().map(|id| id.as_str()).unwrap_or("-");
1696
+ format!(
1697
+ "Team Agent detected a provider abnormal exit.\n\n\
1698
+ event: worker.abnormal_exit\n\
1699
+ team: {team}\n\
1700
+ node: {node}\n\
1701
+ provider: {provider}\n\
1702
+ signature: {signature}\n\
1703
+ turn_id: {turn_id}\n\
1704
+ transcript: {path}\n\
1705
+ last_offset: {size}\n\
1706
+ pid_status: {pid_status}\n\n\
1707
+ No automatic restart was performed.",
1708
+ node = agent.agent_id.as_str(),
1709
+ provider = provider_wire(agent.provider),
1710
+ signature = fact.signature.as_str(),
1711
+ path = agent.rollout_path_display.as_str(),
1712
+ pid_status = liveness.detail.as_str(),
1713
+ )
1714
+ }
1715
+
1716
/// Returns the seconds elapsed since the Unix epoch as an `f64`, or `0.0` if
/// the system clock reads earlier than the epoch.
///
/// NOTE(review): despite the name, this reads `SystemTime` (wall clock), so
/// the value is not guaranteed to be monotonic.
fn monotonic_seconds() -> f64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0.0, |elapsed| elapsed.as_secs_f64())
}
1722
+
1723
+ fn parse_provider(raw: &str) -> Option<crate::model::enums::Provider> {
1724
+ match raw {
1725
+ "claude" => Some(crate::model::enums::Provider::Claude),
1726
+ "claude_code" => Some(crate::model::enums::Provider::ClaudeCode),
1727
+ "codex" => Some(crate::model::enums::Provider::Codex),
1728
+ "gemini_cli" => Some(crate::model::enums::Provider::GeminiCli),
1729
+ "fake" => Some(crate::model::enums::Provider::Fake),
1730
+ _ => None,
1731
+ }
1732
+ }
1733
+
1734
+ fn capture_window_target(
1735
+ agent: &Value,
1736
+ session_name: Option<&str>,
1737
+ ) -> Option<(
1738
+ crate::transport::SessionName,
1739
+ crate::transport::WindowName,
1740
+ crate::transport::Target,
1741
+ )> {
1742
+ let window = agent.get("window").and_then(Value::as_str).filter(|s| !s.is_empty())?;
1743
+ let session = session_name.filter(|s| !s.is_empty())?;
1744
+ let session = crate::transport::SessionName::new(session);
1745
+ let window = crate::transport::WindowName::new(window);
1746
+ Some((
1747
+ session.clone(),
1748
+ window.clone(),
1749
+ crate::transport::Target::SessionWindow { session, window },
1750
+ ))
1751
+ }
1752
+
1753
+ fn runtime_approval_target(agent: &Value, session_name: Option<&str>) -> Option<crate::transport::Target> {
1754
+ if let Some(pane_id) = agent
1755
+ .get("pane_id")
1756
+ .and_then(Value::as_str)
1757
+ .filter(|pane_id| !pane_id.is_empty())
1758
+ {
1759
+ return Some(crate::transport::Target::Pane(crate::transport::PaneId::new(pane_id)));
1760
+ }
1761
+ capture_window_target(agent, session_name).map(|(_, _, target)| target)
1762
+ }
1763
+
1764
+ fn runtime_approval_key(raw: String) -> Option<crate::transport::Key> {
1765
+ match raw.as_str() {
1766
+ "Enter" => Some(crate::transport::Key::Enter),
1767
+ "Up" => Some(crate::transport::Key::Up),
1768
+ "Down" => Some(crate::transport::Key::Down),
1769
+ "Left" => Some(crate::transport::Key::Left),
1770
+ "Right" => Some(crate::transport::Key::Right),
1771
+ other => {
1772
+ let mut chars = other.chars();
1773
+ let ch = chars.next()?;
1774
+ if chars.next().is_none() {
1775
+ Some(crate::transport::Key::Char(ch))
1776
+ } else {
1777
+ None
1778
+ }
1779
+ }
1780
+ }
1781
+ }
1782
+
1783
+ fn runtime_approval_auto_answer_allowed() -> bool {
1784
+ crate::lifecycle::launch::detect_dangerous_approval()
1785
+ .map(|safety| safety.enabled && !safety.worker_capability_above_leader)
1786
+ .unwrap_or(false)
1787
+ }
1788
+
1789
+ fn awaiting_human_confirm_payload(
1790
+ agent: &Value,
1791
+ fact: &crate::provider::AwaitingHumanConfirmFact,
1792
+ ) -> Value {
1793
+ let mut payload = fact.to_event_payload();
1794
+ let excerpt = fact.prompt.lines().next().unwrap_or("").chars().take(240).collect::<String>();
1795
+ if let Some(obj) = payload.as_object_mut() {
1796
+ obj.insert("team_id".to_string(), serde_json::json!(fact.team));
1797
+ obj.insert("owner_team_id".to_string(), serde_json::json!(fact.team));
1798
+ if let Some(provider) = agent.get("provider").and_then(Value::as_str) {
1799
+ obj.insert("provider".to_string(), serde_json::json!(provider));
1800
+ }
1801
+ if let Some(pane_id) = agent.get("pane_id").and_then(Value::as_str) {
1802
+ obj.insert("pane_id".to_string(), serde_json::json!(pane_id));
1803
+ }
1804
+ obj.insert("excerpt".to_string(), serde_json::json!(excerpt));
1805
+ }
1806
+ payload
1807
+ }
1808
+
1809
+ enum AwaitingDedupUpdate {
1810
+ Remember(crate::provider::AwaitingHumanConfirmFact),
1811
+ Clear { team: String, agent_id: String },
1812
+ }
1813
+
1814
+ fn state_awaiting_human_confirm_fingerprint(
1815
+ state: &Value,
1816
+ team: &str,
1817
+ agent_id: &str,
1818
+ ) -> Option<String> {
1819
+ state
1820
+ .get("coordinator")
1821
+ .and_then(|coordinator| {
1822
+ coordinator
1823
+ .get("awaiting_human_confirm_seen")
1824
+ .or_else(|| coordinator.get("awaiting_human_confirm"))
1825
+ })
1826
+ .and_then(|by_team| by_team.get(team))
1827
+ .and_then(|by_agent| by_agent.get(agent_id))
1828
+ .and_then(|record| record.get("fingerprint"))
1829
+ .and_then(Value::as_str)
1830
+ .map(str::to_string)
1831
+ }
1832
+
1833
+ fn remember_state_awaiting_human_confirm(
1834
+ state: &mut Value,
1835
+ fact: &crate::provider::AwaitingHumanConfirmFact,
1836
+ ) {
1837
+ let Some(state_obj) = state.as_object_mut() else {
1838
+ return;
1839
+ };
1840
+ let coordinator = state_obj
1841
+ .entry("coordinator".to_string())
1842
+ .or_insert_with(|| serde_json::json!({}));
1843
+ if !coordinator.is_object() {
1844
+ *coordinator = serde_json::json!({});
1845
+ }
1846
+ let Some(coord_obj) = coordinator.as_object_mut() else {
1847
+ return;
1848
+ };
1849
+ let awaiting = coord_obj
1850
+ .entry("awaiting_human_confirm_seen".to_string())
1851
+ .or_insert_with(|| serde_json::json!({}));
1852
+ if !awaiting.is_object() {
1853
+ *awaiting = serde_json::json!({});
1854
+ }
1855
+ let Some(awaiting_obj) = awaiting.as_object_mut() else {
1856
+ return;
1857
+ };
1858
+ let team_entry = awaiting_obj
1859
+ .entry(fact.team.clone())
1860
+ .or_insert_with(|| serde_json::json!({}));
1861
+ if !team_entry.is_object() {
1862
+ *team_entry = serde_json::json!({});
1863
+ }
1864
+ if let Some(team_obj) = team_entry.as_object_mut() {
1865
+ team_obj.insert(
1866
+ fact.agent_id.clone(),
1867
+ serde_json::json!({
1868
+ "team": fact.team,
1869
+ "team_id": fact.team,
1870
+ "owner_team_id": fact.team,
1871
+ "agent_id": fact.agent_id,
1872
+ "fingerprint": fact.fingerprint,
1873
+ "dedupe_key": fact.dedupe_key,
1874
+ "prompt_kind": fact.prompt_kind,
1875
+ "reason": fact.reason,
1876
+ }),
1877
+ );
1878
+ }
1879
+ }
1880
+
1881
+ fn clear_state_awaiting_human_confirm(state: &mut Value, team: &str, agent_id: &str) {
1882
+ let Some(awaiting_obj) = state
1883
+ .get_mut("coordinator")
1884
+ .and_then(|coordinator| coordinator.get_mut("awaiting_human_confirm_seen"))
1885
+ .and_then(Value::as_object_mut)
1886
+ else {
1887
+ return;
1888
+ };
1889
+ let remove_team = if let Some(team_value) = awaiting_obj.get_mut(team) {
1890
+ if let Some(team_obj) = team_value.as_object_mut() {
1891
+ team_obj.remove(agent_id);
1892
+ team_obj.is_empty()
1893
+ } else {
1894
+ true
1895
+ }
1896
+ } else {
1897
+ false
1898
+ };
1899
+ if remove_team {
1900
+ awaiting_obj.remove(team);
1901
+ }
1902
+ }
1903
+
1904
+ fn remember_awaiting_human_confirm(
1905
+ agent: &mut Value,
1906
+ fact: &crate::provider::AwaitingHumanConfirmFact,
1907
+ ) {
1908
+ if let Some(agent_obj) = agent.as_object_mut() {
1909
+ agent_obj.insert(
1910
+ "awaiting_human_confirm".to_string(),
1911
+ serde_json::json!({
1912
+ "team": fact.team,
1913
+ "team_id": fact.team,
1914
+ "owner_team_id": fact.team,
1915
+ "agent_id": fact.agent_id,
1916
+ "fingerprint": fact.fingerprint,
1917
+ "dedupe_key": fact.dedupe_key,
1918
+ "prompt_kind": fact.prompt_kind,
1919
+ "reason": fact.reason,
1920
+ }),
1921
+ );
1922
+ }
1923
+ }
1924
+
1925
+ fn clear_awaiting_human_confirm(agent: &mut Value) {
1926
+ if let Some(agent_obj) = agent.as_object_mut() {
1927
+ agent_obj.remove("awaiting_human_confirm");
1928
+ }
1929
+ }
1930
+
1931
+ fn write_activity(
1932
+ agent: &mut Value,
1933
+ activity: &crate::messaging::AgentActivity,
1934
+ output_advanced: bool,
1935
+ ) -> Option<String> {
1936
+ let previous_last_output = agent.get("last_output_at").and_then(Value::as_str).map(str::to_string);
1937
+ let Some(agent_obj) = agent.as_object_mut() else {
1938
+ return previous_last_output;
1939
+ };
1940
+ let status = activity_status_wire(activity.status);
1941
+ agent_obj.insert(
1942
+ "activity".to_string(),
1943
+ serde_json::json!({
1944
+ "status": status,
1945
+ "confidence": activity.confidence,
1946
+ "rationale": activity.rationale,
1947
+ }),
1948
+ );
1949
+ if output_advanced {
1950
+ let last_output_at = chrono::Utc::now().to_rfc3339();
1951
+ agent_obj.insert(
1952
+ "last_output_at".to_string(),
1953
+ serde_json::json!(last_output_at.clone()),
1954
+ );
1955
+ return Some(last_output_at);
1956
+ }
1957
+ previous_last_output
1958
+ }
1959
+
1960
+ fn activity_status_wire(status: crate::messaging::ActivityStatus) -> &'static str {
1961
+ match status {
1962
+ crate::messaging::ActivityStatus::Idle => "idle",
1963
+ crate::messaging::ActivityStatus::Working => "working",
1964
+ crate::messaging::ActivityStatus::Stuck => "stuck",
1965
+ crate::messaging::ActivityStatus::Uncertain => "uncertain",
1966
+ }
1967
+ }
1968
+
1969
+ fn agent_health_status_wire(status: crate::messaging::ActivityStatus) -> &'static str {
1970
+ match status {
1971
+ crate::messaging::ActivityStatus::Idle => "IDLE",
1972
+ crate::messaging::ActivityStatus::Working => "WORKING",
1973
+ crate::messaging::ActivityStatus::Stuck => "STUCK",
1974
+ crate::messaging::ActivityStatus::Uncertain => "UNKNOWN",
1975
+ }
1976
+ }
1977
+
1978
+ fn write_agent_health(
1979
+ store: &crate::message_store::MessageStore,
1980
+ team: &str,
1981
+ agent_id: &str,
1982
+ agent: &Value,
1983
+ activity: &crate::messaging::AgentActivity,
1984
+ last_output_at: Option<&str>,
1985
+ ) -> Result<(), crate::messaging::MessagingError> {
1986
+ let conn = crate::db::schema::open_db(store.db_path())?;
1987
+ let status = agent_health_status_wire(activity.status);
1988
+ let updated_at = chrono::Utc::now().to_rfc3339();
1989
+ let context_usage_pct = agent
1990
+ .get("context_usage_pct")
1991
+ .or_else(|| agent.get("context_usage_percent"))
1992
+ .and_then(Value::as_i64);
1993
+ let current_task_id = agent
1994
+ .get("current_task_id")
1995
+ .or_else(|| agent.get("task_id"))
1996
+ .and_then(Value::as_str);
1997
+ conn.execute(
1998
+ "insert into agent_health(
1999
+ owner_team_id, agent_id, status, last_output_at, context_usage_pct, current_task_id, updated_at
2000
+ ) values (?1, ?2, ?3, ?4, ?5, ?6, ?7)
2001
+ on conflict(owner_team_id, agent_id) do update set
2002
+ status = excluded.status,
2003
+ last_output_at = coalesce(excluded.last_output_at, agent_health.last_output_at),
2004
+ context_usage_pct = excluded.context_usage_pct,
2005
+ current_task_id = excluded.current_task_id,
2006
+ updated_at = excluded.updated_at",
2007
+ rusqlite::params![
2008
+ team,
2009
+ agent_id,
2010
+ status,
2011
+ last_output_at,
2012
+ context_usage_pct,
2013
+ current_task_id,
2014
+ updated_at,
2015
+ ],
2016
+ )?;
2017
+ Ok(())
2018
+ }
2019
+
2020
+ fn read_pid_file(path: &Path) -> Option<Pid> {
2021
+ let text = std::fs::read_to_string(path).ok()?;
2022
+ let pid = text.trim().parse::<u32>().ok()?;
2023
+ Some(Pid(pid))
2024
+ }
2025
+
2026
/// Deletes the file at `path`, treating "not found" as success.
///
/// # Errors
/// Returns any I/O error other than `NotFound`.
fn remove_file_if_exists(path: &Path) -> Result<(), std::io::Error> {
    match std::fs::remove_file(path) {
        Err(error) if error.kind() != std::io::ErrorKind::NotFound => Err(error),
        _ => Ok(()),
    }
}