@ag-eco/agentplate-cli 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (455) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +462 -0
  3. package/agents/ap-co-creation.md +90 -0
  4. package/agents/builder.md +144 -0
  5. package/agents/coordinator.md +377 -0
  6. package/agents/lead.md +435 -0
  7. package/agents/merger.md +164 -0
  8. package/agents/monitor.md +214 -0
  9. package/agents/orchestrator.md +239 -0
  10. package/agents/reviewer.md +140 -0
  11. package/agents/scout.md +125 -0
  12. package/agents/supervisor.md +427 -0
  13. package/package.json +66 -0
  14. package/src/agents/capabilities.test.ts +85 -0
  15. package/src/agents/capabilities.ts +125 -0
  16. package/src/agents/checkpoint.test.ts +88 -0
  17. package/src/agents/checkpoint.ts +101 -0
  18. package/src/agents/copilot-hooks-deployer.test.ts +162 -0
  19. package/src/agents/copilot-hooks-deployer.ts +93 -0
  20. package/src/agents/guard-rules.test.ts +372 -0
  21. package/src/agents/guard-rules.ts +97 -0
  22. package/src/agents/headless-mail-injector.test.ts +709 -0
  23. package/src/agents/headless-mail-injector.ts +377 -0
  24. package/src/agents/headless-prompt.test.ts +102 -0
  25. package/src/agents/headless-prompt.ts +68 -0
  26. package/src/agents/hooks-deployer.test.ts +3119 -0
  27. package/src/agents/hooks-deployer.ts +804 -0
  28. package/src/agents/identity.test.ts +604 -0
  29. package/src/agents/identity.ts +384 -0
  30. package/src/agents/lifecycle.test.ts +196 -0
  31. package/src/agents/lifecycle.ts +183 -0
  32. package/src/agents/mail-poll-detect.test.ts +153 -0
  33. package/src/agents/mail-poll-detect.ts +73 -0
  34. package/src/agents/manifest.test.ts +1026 -0
  35. package/src/agents/manifest.ts +376 -0
  36. package/src/agents/overlay.test.ts +1058 -0
  37. package/src/agents/overlay.ts +490 -0
  38. package/src/agents/scope-detect.test.ts +190 -0
  39. package/src/agents/scope-detect.ts +146 -0
  40. package/src/agents/turn-lock.test.ts +181 -0
  41. package/src/agents/turn-lock.ts +235 -0
  42. package/src/agents/turn-runner-dispatch.test.ts +182 -0
  43. package/src/agents/turn-runner-dispatch.ts +105 -0
  44. package/src/agents/turn-runner.test.ts +2312 -0
  45. package/src/agents/turn-runner.ts +1383 -0
  46. package/src/beads/client.test.ts +217 -0
  47. package/src/beads/client.ts +230 -0
  48. package/src/beads/molecules.test.ts +338 -0
  49. package/src/beads/molecules.ts +198 -0
  50. package/src/commands/agents.test.ts +328 -0
  51. package/src/commands/agents.ts +299 -0
  52. package/src/commands/clean.test.ts +797 -0
  53. package/src/commands/clean.ts +791 -0
  54. package/src/commands/completions.test.ts +348 -0
  55. package/src/commands/completions.ts +981 -0
  56. package/src/commands/coordinator.test.ts +2975 -0
  57. package/src/commands/coordinator.ts +1841 -0
  58. package/src/commands/costs.test.ts +1183 -0
  59. package/src/commands/costs.ts +599 -0
  60. package/src/commands/dashboard.test.ts +954 -0
  61. package/src/commands/dashboard.ts +1212 -0
  62. package/src/commands/discover.test.ts +288 -0
  63. package/src/commands/discover.ts +202 -0
  64. package/src/commands/doctor.test.ts +303 -0
  65. package/src/commands/doctor.ts +311 -0
  66. package/src/commands/ecosystem.test.ts +226 -0
  67. package/src/commands/ecosystem.ts +248 -0
  68. package/src/commands/errors.test.ts +654 -0
  69. package/src/commands/errors.ts +197 -0
  70. package/src/commands/feed.test.ts +709 -0
  71. package/src/commands/feed.ts +260 -0
  72. package/src/commands/group.test.ts +475 -0
  73. package/src/commands/group.ts +546 -0
  74. package/src/commands/hooks.test.ts +458 -0
  75. package/src/commands/hooks.ts +263 -0
  76. package/src/commands/init.test.ts +1011 -0
  77. package/src/commands/init.ts +967 -0
  78. package/src/commands/inspect.test.ts +1239 -0
  79. package/src/commands/inspect.ts +648 -0
  80. package/src/commands/log.test.ts +1913 -0
  81. package/src/commands/log.ts +958 -0
  82. package/src/commands/logs.test.ts +801 -0
  83. package/src/commands/logs.ts +483 -0
  84. package/src/commands/mail.test.ts +1501 -0
  85. package/src/commands/mail.ts +848 -0
  86. package/src/commands/merge.test.ts +864 -0
  87. package/src/commands/merge.ts +381 -0
  88. package/src/commands/metrics.test.ts +458 -0
  89. package/src/commands/metrics.ts +129 -0
  90. package/src/commands/monitor.test.ts +191 -0
  91. package/src/commands/monitor.ts +409 -0
  92. package/src/commands/nudge.test.ts +579 -0
  93. package/src/commands/nudge.ts +646 -0
  94. package/src/commands/orchestrator.ts +42 -0
  95. package/src/commands/prime.test.ts +612 -0
  96. package/src/commands/prime.ts +359 -0
  97. package/src/commands/replay.test.ts +757 -0
  98. package/src/commands/replay.ts +231 -0
  99. package/src/commands/run.test.ts +469 -0
  100. package/src/commands/run.ts +353 -0
  101. package/src/commands/serve/agent-actions.test.ts +210 -0
  102. package/src/commands/serve/agent-actions.ts +192 -0
  103. package/src/commands/serve/build.test.ts +202 -0
  104. package/src/commands/serve/build.ts +206 -0
  105. package/src/commands/serve/coordinator-actions.test.ts +339 -0
  106. package/src/commands/serve/coordinator-actions.ts +410 -0
  107. package/src/commands/serve/dev.test.ts +168 -0
  108. package/src/commands/serve/dev.ts +117 -0
  109. package/src/commands/serve/mail-actions.test.ts +312 -0
  110. package/src/commands/serve/mail-actions.ts +167 -0
  111. package/src/commands/serve/rest.test.ts +1680 -0
  112. package/src/commands/serve/rest.ts +1130 -0
  113. package/src/commands/serve/static.ts +51 -0
  114. package/src/commands/serve/ws.test.ts +361 -0
  115. package/src/commands/serve/ws.ts +332 -0
  116. package/src/commands/serve.test.ts +459 -0
  117. package/src/commands/serve.ts +654 -0
  118. package/src/commands/sling.test.ts +1583 -0
  119. package/src/commands/sling.ts +1351 -0
  120. package/src/commands/spec.test.ts +179 -0
  121. package/src/commands/spec.ts +105 -0
  122. package/src/commands/status.test.ts +614 -0
  123. package/src/commands/status.ts +403 -0
  124. package/src/commands/stop.test.ts +964 -0
  125. package/src/commands/stop.ts +319 -0
  126. package/src/commands/supervisor.test.ts +185 -0
  127. package/src/commands/supervisor.ts +537 -0
  128. package/src/commands/trace.test.ts +762 -0
  129. package/src/commands/trace.ts +205 -0
  130. package/src/commands/update.test.ts +466 -0
  131. package/src/commands/update.ts +263 -0
  132. package/src/commands/upgrade.test.ts +48 -0
  133. package/src/commands/upgrade.ts +240 -0
  134. package/src/commands/watch.test.ts +257 -0
  135. package/src/commands/watch.ts +308 -0
  136. package/src/commands/worktree.test.ts +1297 -0
  137. package/src/commands/worktree.ts +451 -0
  138. package/src/config.test.ts +1535 -0
  139. package/src/config.ts +1064 -0
  140. package/src/doctor/agents.test.ts +523 -0
  141. package/src/doctor/agents.ts +399 -0
  142. package/src/doctor/config-check.test.ts +191 -0
  143. package/src/doctor/config-check.ts +183 -0
  144. package/src/doctor/consistency.test.ts +807 -0
  145. package/src/doctor/consistency.ts +347 -0
  146. package/src/doctor/databases.test.ts +350 -0
  147. package/src/doctor/databases.ts +243 -0
  148. package/src/doctor/dependencies.test.ts +296 -0
  149. package/src/doctor/dependencies.ts +272 -0
  150. package/src/doctor/ecosystem.test.ts +308 -0
  151. package/src/doctor/ecosystem.ts +156 -0
  152. package/src/doctor/logs.test.ts +253 -0
  153. package/src/doctor/logs.ts +295 -0
  154. package/src/doctor/merge-queue.test.ts +315 -0
  155. package/src/doctor/merge-queue.ts +167 -0
  156. package/src/doctor/providers.test.ts +409 -0
  157. package/src/doctor/providers.ts +250 -0
  158. package/src/doctor/serve.test.ts +95 -0
  159. package/src/doctor/serve.ts +86 -0
  160. package/src/doctor/structure.test.ts +423 -0
  161. package/src/doctor/structure.ts +285 -0
  162. package/src/doctor/types.ts +43 -0
  163. package/src/doctor/version.test.ts +241 -0
  164. package/src/doctor/version.ts +132 -0
  165. package/src/doctor/watchdog.test.ts +167 -0
  166. package/src/doctor/watchdog.ts +214 -0
  167. package/src/e2e/init-sling-lifecycle.test.ts +283 -0
  168. package/src/errors.test.ts +350 -0
  169. package/src/errors.ts +217 -0
  170. package/src/events/store.test.ts +660 -0
  171. package/src/events/store.ts +369 -0
  172. package/src/events/tailer.test.ts +719 -0
  173. package/src/events/tailer.ts +332 -0
  174. package/src/events/tool-filter.test.ts +330 -0
  175. package/src/events/tool-filter.ts +126 -0
  176. package/src/index.ts +533 -0
  177. package/src/insights/analyzer.test.ts +466 -0
  178. package/src/insights/analyzer.ts +203 -0
  179. package/src/insights/quality-gates.test.ts +141 -0
  180. package/src/insights/quality-gates.ts +156 -0
  181. package/src/json.test.ts +72 -0
  182. package/src/json.ts +53 -0
  183. package/src/loam/client.test.ts +752 -0
  184. package/src/loam/client.ts +664 -0
  185. package/src/logging/color.test.ts +252 -0
  186. package/src/logging/color.ts +105 -0
  187. package/src/logging/format.test.ts +110 -0
  188. package/src/logging/format.ts +255 -0
  189. package/src/logging/logger.test.ts +814 -0
  190. package/src/logging/logger.ts +266 -0
  191. package/src/logging/reporter.test.ts +259 -0
  192. package/src/logging/reporter.ts +110 -0
  193. package/src/logging/sanitizer.test.ts +190 -0
  194. package/src/logging/sanitizer.ts +57 -0
  195. package/src/logging/theme.ts +140 -0
  196. package/src/mail/broadcast.test.ts +204 -0
  197. package/src/mail/broadcast.ts +92 -0
  198. package/src/mail/client.test.ts +774 -0
  199. package/src/mail/client.ts +236 -0
  200. package/src/mail/store.test.ts +898 -0
  201. package/src/mail/store.ts +425 -0
  202. package/src/merge/lock.test.ts +149 -0
  203. package/src/merge/lock.ts +140 -0
  204. package/src/merge/predict.test.ts +387 -0
  205. package/src/merge/predict.ts +249 -0
  206. package/src/merge/queue.test.ts +426 -0
  207. package/src/merge/queue.ts +246 -0
  208. package/src/merge/resolver.test.ts +1993 -0
  209. package/src/merge/resolver.ts +926 -0
  210. package/src/metrics/pricing.test.ts +258 -0
  211. package/src/metrics/pricing.ts +135 -0
  212. package/src/metrics/store.test.ts +978 -0
  213. package/src/metrics/store.ts +501 -0
  214. package/src/metrics/summary.test.ts +398 -0
  215. package/src/metrics/summary.ts +178 -0
  216. package/src/metrics/transcript.test.ts +483 -0
  217. package/src/metrics/transcript.ts +114 -0
  218. package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
  219. package/src/runtimes/aider.test.ts +124 -0
  220. package/src/runtimes/aider.ts +147 -0
  221. package/src/runtimes/amp.test.ts +164 -0
  222. package/src/runtimes/amp.ts +154 -0
  223. package/src/runtimes/claude.test.ts +1474 -0
  224. package/src/runtimes/claude.ts +579 -0
  225. package/src/runtimes/codex.test.ts +805 -0
  226. package/src/runtimes/codex.ts +273 -0
  227. package/src/runtimes/connections.test.ts +214 -0
  228. package/src/runtimes/connections.ts +103 -0
  229. package/src/runtimes/copilot.test.ts +707 -0
  230. package/src/runtimes/copilot.ts +316 -0
  231. package/src/runtimes/cursor.test.ts +497 -0
  232. package/src/runtimes/cursor.ts +205 -0
  233. package/src/runtimes/gemini.test.ts +537 -0
  234. package/src/runtimes/gemini.ts +243 -0
  235. package/src/runtimes/goose.test.ts +133 -0
  236. package/src/runtimes/goose.ts +157 -0
  237. package/src/runtimes/headless-connection.test.ts +264 -0
  238. package/src/runtimes/headless-connection.ts +158 -0
  239. package/src/runtimes/opencode.test.ts +325 -0
  240. package/src/runtimes/opencode.ts +188 -0
  241. package/src/runtimes/pi-guards.test.ts +486 -0
  242. package/src/runtimes/pi-guards.ts +367 -0
  243. package/src/runtimes/pi.test.ts +789 -0
  244. package/src/runtimes/pi.ts +305 -0
  245. package/src/runtimes/registry.test.ts +196 -0
  246. package/src/runtimes/registry.ts +99 -0
  247. package/src/runtimes/sapling.test.ts +1267 -0
  248. package/src/runtimes/sapling.ts +710 -0
  249. package/src/runtimes/types.ts +266 -0
  250. package/src/schema-consistency.test.ts +246 -0
  251. package/src/sessions/compat.test.ts +281 -0
  252. package/src/sessions/compat.ts +105 -0
  253. package/src/sessions/store.test.ts +1748 -0
  254. package/src/sessions/store.ts +858 -0
  255. package/src/test-helpers.test.ts +124 -0
  256. package/src/test-helpers.ts +145 -0
  257. package/src/test-setup.test.ts +31 -0
  258. package/src/test-setup.ts +28 -0
  259. package/src/tools/loam/api.ts +368 -0
  260. package/src/tools/loam/cli.ts +278 -0
  261. package/src/tools/loam/commands/add.ts +52 -0
  262. package/src/tools/loam/commands/archive.ts +214 -0
  263. package/src/tools/loam/commands/audit.ts +276 -0
  264. package/src/tools/loam/commands/compact.ts +1062 -0
  265. package/src/tools/loam/commands/completions.ts +79 -0
  266. package/src/tools/loam/commands/config.ts +381 -0
  267. package/src/tools/loam/commands/delete-domain.ts +121 -0
  268. package/src/tools/loam/commands/delete.ts +316 -0
  269. package/src/tools/loam/commands/diff.ts +200 -0
  270. package/src/tools/loam/commands/doctor.ts +1113 -0
  271. package/src/tools/loam/commands/edit.ts +226 -0
  272. package/src/tools/loam/commands/init.ts +31 -0
  273. package/src/tools/loam/commands/learn.ts +179 -0
  274. package/src/tools/loam/commands/move.ts +323 -0
  275. package/src/tools/loam/commands/onboard.ts +374 -0
  276. package/src/tools/loam/commands/outcome.ts +185 -0
  277. package/src/tools/loam/commands/prime.ts +688 -0
  278. package/src/tools/loam/commands/prune.ts +614 -0
  279. package/src/tools/loam/commands/query.ts +218 -0
  280. package/src/tools/loam/commands/rank.ts +180 -0
  281. package/src/tools/loam/commands/ready.ts +189 -0
  282. package/src/tools/loam/commands/record.ts +1210 -0
  283. package/src/tools/loam/commands/restore.ts +166 -0
  284. package/src/tools/loam/commands/search.ts +327 -0
  285. package/src/tools/loam/commands/setup.ts +887 -0
  286. package/src/tools/loam/commands/status.ts +103 -0
  287. package/src/tools/loam/commands/sync.ts +298 -0
  288. package/src/tools/loam/commands/update.ts +19 -0
  289. package/src/tools/loam/commands/upgrade.ts +93 -0
  290. package/src/tools/loam/commands/validate.ts +190 -0
  291. package/src/tools/loam/index.ts +62 -0
  292. package/src/tools/loam/log.ts +127 -0
  293. package/src/tools/loam/registry/builtins.ts +409 -0
  294. package/src/tools/loam/registry/custom.ts +431 -0
  295. package/src/tools/loam/registry/init.ts +55 -0
  296. package/src/tools/loam/registry/template.ts +40 -0
  297. package/src/tools/loam/registry/type-registry.ts +113 -0
  298. package/src/tools/loam/schemas/config-schema.ts +489 -0
  299. package/src/tools/loam/schemas/config.ts +245 -0
  300. package/src/tools/loam/schemas/index.ts +18 -0
  301. package/src/tools/loam/schemas/record-schema.ts +191 -0
  302. package/src/tools/loam/schemas/record.ts +115 -0
  303. package/src/tools/loam/utils/active-work.ts +205 -0
  304. package/src/tools/loam/utils/anchor-validity.ts +80 -0
  305. package/src/tools/loam/utils/archive.ts +146 -0
  306. package/src/tools/loam/utils/audit.ts +667 -0
  307. package/src/tools/loam/utils/bm25.ts +238 -0
  308. package/src/tools/loam/utils/budget.ts +142 -0
  309. package/src/tools/loam/utils/config.ts +344 -0
  310. package/src/tools/loam/utils/dir-anchors.ts +62 -0
  311. package/src/tools/loam/utils/domain-rules.ts +114 -0
  312. package/src/tools/loam/utils/expertise.ts +393 -0
  313. package/src/tools/loam/utils/format-helpers.ts +96 -0
  314. package/src/tools/loam/utils/format.ts +1234 -0
  315. package/src/tools/loam/utils/git-context.ts +50 -0
  316. package/src/tools/loam/utils/git.ts +183 -0
  317. package/src/tools/loam/utils/hooks.ts +299 -0
  318. package/src/tools/loam/utils/index.ts +52 -0
  319. package/src/tools/loam/utils/json-output.ts +13 -0
  320. package/src/tools/loam/utils/lock.ts +76 -0
  321. package/src/tools/loam/utils/markers.ts +48 -0
  322. package/src/tools/loam/utils/numeric-flags.ts +20 -0
  323. package/src/tools/loam/utils/palette.ts +44 -0
  324. package/src/tools/loam/utils/prime-ranking.ts +135 -0
  325. package/src/tools/loam/utils/recipe-discovery.ts +195 -0
  326. package/src/tools/loam/utils/runtime-flags.ts +28 -0
  327. package/src/tools/loam/utils/scoring.ts +94 -0
  328. package/src/tools/loam/utils/version.ts +116 -0
  329. package/src/tools/sprout/commands/block.ts +64 -0
  330. package/src/tools/sprout/commands/blocked.ts +86 -0
  331. package/src/tools/sprout/commands/close.ts +129 -0
  332. package/src/tools/sprout/commands/completions.ts +198 -0
  333. package/src/tools/sprout/commands/config.ts +238 -0
  334. package/src/tools/sprout/commands/create.ts +164 -0
  335. package/src/tools/sprout/commands/dep.ts +148 -0
  336. package/src/tools/sprout/commands/doctor.ts +979 -0
  337. package/src/tools/sprout/commands/init.ts +83 -0
  338. package/src/tools/sprout/commands/label.ts +178 -0
  339. package/src/tools/sprout/commands/list.ts +210 -0
  340. package/src/tools/sprout/commands/migrate.ts +133 -0
  341. package/src/tools/sprout/commands/onboard.ts +207 -0
  342. package/src/tools/sprout/commands/plan-show.ts +278 -0
  343. package/src/tools/sprout/commands/plan.ts +2526 -0
  344. package/src/tools/sprout/commands/prime.ts +399 -0
  345. package/src/tools/sprout/commands/ready.ts +245 -0
  346. package/src/tools/sprout/commands/search.ts +221 -0
  347. package/src/tools/sprout/commands/show.ts +277 -0
  348. package/src/tools/sprout/commands/stats.ts +146 -0
  349. package/src/tools/sprout/commands/sync.ts +134 -0
  350. package/src/tools/sprout/commands/tpl.ts +364 -0
  351. package/src/tools/sprout/commands/unblock.ts +115 -0
  352. package/src/tools/sprout/commands/update.ts +257 -0
  353. package/src/tools/sprout/commands/upgrade.ts +91 -0
  354. package/src/tools/sprout/config-schema.ts +152 -0
  355. package/src/tools/sprout/config.ts +355 -0
  356. package/src/tools/sprout/filter.ts +107 -0
  357. package/src/tools/sprout/format.ts +43 -0
  358. package/src/tools/sprout/id.ts +22 -0
  359. package/src/tools/sprout/index.ts +204 -0
  360. package/src/tools/sprout/log.ts +76 -0
  361. package/src/tools/sprout/markers.ts +22 -0
  362. package/src/tools/sprout/output.ts +121 -0
  363. package/src/tools/sprout/plan-backref.ts +93 -0
  364. package/src/tools/sprout/plan-context.ts +81 -0
  365. package/src/tools/sprout/plan-domain.ts +139 -0
  366. package/src/tools/sprout/plan-lifecycle.ts +65 -0
  367. package/src/tools/sprout/plan-loam.ts +207 -0
  368. package/src/tools/sprout/plan-schema.ts +209 -0
  369. package/src/tools/sprout/sort.ts +31 -0
  370. package/src/tools/sprout/store.ts +172 -0
  371. package/src/tools/sprout/types.ts +118 -0
  372. package/src/tools/sprout/validation.ts +119 -0
  373. package/src/tools/sprout/version.ts +1 -0
  374. package/src/tools/sprout/yaml.ts +387 -0
  375. package/src/tools/trellis/commands/archive.ts +87 -0
  376. package/src/tools/trellis/commands/completions.ts +610 -0
  377. package/src/tools/trellis/commands/config.ts +382 -0
  378. package/src/tools/trellis/commands/create.ts +252 -0
  379. package/src/tools/trellis/commands/diff.ts +150 -0
  380. package/src/tools/trellis/commands/doctor.ts +771 -0
  381. package/src/tools/trellis/commands/emit.ts +365 -0
  382. package/src/tools/trellis/commands/history.ts +83 -0
  383. package/src/tools/trellis/commands/import.ts +198 -0
  384. package/src/tools/trellis/commands/init.ts +81 -0
  385. package/src/tools/trellis/commands/list.ts +103 -0
  386. package/src/tools/trellis/commands/onboard.ts +156 -0
  387. package/src/tools/trellis/commands/pin.ts +172 -0
  388. package/src/tools/trellis/commands/prime.ts +193 -0
  389. package/src/tools/trellis/commands/render.ts +122 -0
  390. package/src/tools/trellis/commands/schema.ts +353 -0
  391. package/src/tools/trellis/commands/show.ts +115 -0
  392. package/src/tools/trellis/commands/stats.ts +65 -0
  393. package/src/tools/trellis/commands/sync.ts +112 -0
  394. package/src/tools/trellis/commands/tree.ts +123 -0
  395. package/src/tools/trellis/commands/update.ts +330 -0
  396. package/src/tools/trellis/commands/upgrade.ts +95 -0
  397. package/src/tools/trellis/commands/validate.ts +166 -0
  398. package/src/tools/trellis/config-schema.ts +81 -0
  399. package/src/tools/trellis/config.ts +108 -0
  400. package/src/tools/trellis/frontmatter.ts +348 -0
  401. package/src/tools/trellis/id.ts +24 -0
  402. package/src/tools/trellis/index.ts +209 -0
  403. package/src/tools/trellis/markers.ts +28 -0
  404. package/src/tools/trellis/output.ts +84 -0
  405. package/src/tools/trellis/render.ts +212 -0
  406. package/src/tools/trellis/store.ts +144 -0
  407. package/src/tools/trellis/types.ts +82 -0
  408. package/src/tools/trellis/validate.ts +199 -0
  409. package/src/tools/trellis/yaml.ts +309 -0
  410. package/src/tracker/beads.test.ts +454 -0
  411. package/src/tracker/beads.ts +56 -0
  412. package/src/tracker/factory.test.ts +90 -0
  413. package/src/tracker/factory.ts +65 -0
  414. package/src/tracker/sprout.test.ts +461 -0
  415. package/src/tracker/sprout.ts +182 -0
  416. package/src/tracker/types.ts +52 -0
  417. package/src/trellis/client.test.ts +107 -0
  418. package/src/trellis/client.ts +179 -0
  419. package/src/types.ts +970 -0
  420. package/src/utils/bin.test.ts +10 -0
  421. package/src/utils/bin.ts +37 -0
  422. package/src/utils/browser.test.ts +49 -0
  423. package/src/utils/browser.ts +48 -0
  424. package/src/utils/fs.test.ts +119 -0
  425. package/src/utils/fs.ts +62 -0
  426. package/src/utils/pid.test.ts +152 -0
  427. package/src/utils/pid.ts +130 -0
  428. package/src/utils/process-scan.test.ts +53 -0
  429. package/src/utils/process-scan.ts +76 -0
  430. package/src/utils/time.test.ts +43 -0
  431. package/src/utils/time.ts +37 -0
  432. package/src/utils/version.test.ts +33 -0
  433. package/src/utils/version.ts +70 -0
  434. package/src/version.ts +5 -0
  435. package/src/watchdog/daemon.test.ts +3721 -0
  436. package/src/watchdog/daemon.ts +1257 -0
  437. package/src/watchdog/health.test.ts +830 -0
  438. package/src/watchdog/health.ts +434 -0
  439. package/src/watchdog/triage.test.ts +205 -0
  440. package/src/watchdog/triage.ts +205 -0
  441. package/src/worktree/manager.test.ts +720 -0
  442. package/src/worktree/manager.ts +405 -0
  443. package/src/worktree/process.test.ts +172 -0
  444. package/src/worktree/process.ts +131 -0
  445. package/src/worktree/tmux.test.ts +1616 -0
  446. package/src/worktree/tmux.ts +721 -0
  447. package/templates/CLAUDE.md.tmpl +100 -0
  448. package/templates/copilot-hooks.json.tmpl +13 -0
  449. package/templates/hooks.json.tmpl +109 -0
  450. package/templates/overlay.md.tmpl +88 -0
  451. package/ui/dist/apple-touch-icon-bdy6teep.png +0 -0
  452. package/ui/dist/chunk-8s31f05k.css +1 -0
  453. package/ui/dist/chunk-vm5rz679.js +300 -0
  454. package/ui/dist/favicon-nzb39vza.svg +4 -0
  455. package/ui/dist/index.html +17 -0
@@ -0,0 +1,1257 @@
1
+ /**
2
+ * Tier 0 mechanical process monitoring daemon.
3
+ *
4
+ * Runs on a configurable interval, checking the health of all active agent
5
+ * sessions. Implements progressive nudging for stalled agents instead of
6
+ * immediately escalating to AI triage:
7
+ *
8
+ * Level 0 (warn): Log warning via onHealthCheck callback, no direct action
9
+ * Level 1 (nudge): Send tmux nudge via nudgeAgent()
10
+ * Level 2 (escalate): Invoke Tier 1 AI triage (if tier1Enabled), else skip
11
+ * Level 3 (terminate): Kill tmux session
12
+ *
13
+ * Phase 4 tier numbering:
14
+ * Tier 0 = Mechanical daemon (this file)
15
+ * Tier 1 = Triage agent (triage.ts)
16
+ * Tier 2 = Monitor agent (not yet implemented)
17
+ * Tier 3 = Supervisor monitors (per-project)
18
+ *
19
+ * ZFC Principle: Observable state (tmux alive, pid alive) is the source of
20
+ * truth. See health.ts for the full ZFC documentation.
21
+ */
22
+
23
+ import { join } from "node:path";
24
+ import { isPersistentCapability } from "../agents/capabilities.ts";
25
+ import { nudgeAgent } from "../commands/nudge.ts";
26
+ import { createEventStore } from "../events/store.ts";
27
+ import {
28
+ findLatestStdoutLog,
29
+ startEventTailer,
30
+ type TailerHandle,
31
+ type TailerOptions,
32
+ } from "../events/tailer.ts";
33
+ import { createLoamClient } from "../loam/client.ts";
34
+ import { createMailStore, type MailStore } from "../mail/store.ts";
35
+ import { getConnection, removeConnection } from "../runtimes/connections.ts";
36
+ import type { RuntimeConnection } from "../runtimes/types.ts";
37
+ import { openSessionStore } from "../sessions/compat.ts";
38
+ import { createRunStore } from "../sessions/store.ts";
39
+ import type {
40
+ AgentSession,
41
+ EventStore,
42
+ HealthCheck,
43
+ RunStore,
44
+ WorkerDiedPayload,
45
+ } from "../types.ts";
46
+ import { isProcessAlive, isSessionAlive, killProcessTree, killSession } from "../worktree/tmux.ts";
47
+ import { evaluateHealth, transitionState } from "./health.ts";
48
+ import { type TriageResult, triageAgent } from "./triage.ts";
49
+
/** Highest escalation level; this is the level documented above as "terminate". */
const MAX_ESCALATION_LEVEL = 3;

/**
 * Default registry of live event tailers for headless agents, keyed by
 * agent name (agentName → TailerHandle).
 *
 * Kept at module scope so that tailers started during one daemon tick are
 * still known on later ticks. Callers can substitute their own map through
 * DaemonOptions._tailerRegistry.
 */
const _defaultTailerRegistry = new Map<string, TailerHandle>();
59
+
/**
 * Dedup bookkeeping for the defensive `current-run.txt` read warnings
 * (agentplate-87bf).
 *
 * Once per tick the watchdog reads `.agentplate/current-run.txt` to gate its
 * run-completion checks. When that file is missing, empty, or unreadable, or
 * names an id with no row in the runs table, the check is skipped on every
 * tick. To make that wedged state visible without flooding the log, a single
 * warning is written per cause and later skips stay silent.
 *
 * The state lives at module level on purpose: dedup must hold across ticks
 * within one watchdog process. Tests can inject their own instance through
 * DaemonOptions._runIdWarnState.
 */
export interface RunIdWarnState {
	/** Set to true once the "current-run.txt missing" warning has been written. */
	missingFileWarned: boolean;
	/** Run ids that have already been reported as unknown to the runs table. */
	unknownIds: Set<string>;
}

/** Default warning-dedup state: nothing has been warned about yet. */
const _defaultRunIdWarnState: RunIdWarnState = {
	missingFileWarned: false,
	unknownIds: new Set<string>(),
};
81
+
/**
 * Best-effort write of an agent failure to loam so it can be consulted later.
 *
 * Every error raised while creating the client, assembling the description,
 * or writing the record is swallowed without logging, so the returned promise
 * never rejects and a loam outage cannot disturb the watchdog.
 *
 * @param root - Project root directory passed to createLoamClient
 * @param session - The agent session that failed
 * @param reason - Human-readable failure reason
 * @param tier - Which watchdog tier detected the failure (0 or 1)
 * @param triageSuggestion - Optional triage verdict from Tier 1 AI analysis
 */
async function recordFailure(
	root: string,
	session: AgentSession,
	reason: string,
	tier: 0 | 1,
	triageSuggestion?: string,
): Promise<void> {
	try {
		const client = createLoamClient(root);
		const detectedBy = tier === 0 ? "Tier 0 (process death)" : "Tier 1 (AI triage)";

		const summary: string[] = [
			`Agent: ${session.agentName}`,
			`Capability: ${session.capability}`,
			`Failure reason: ${reason}`,
		];
		// An absent or empty suggestion contributes no line at all.
		if (triageSuggestion) {
			summary.push(`Triage suggestion: ${triageSuggestion}`);
		}
		summary.push(`Detected by: ${detectedBy}`);

		await client.record("agents", {
			type: "failure",
			description: summary.join("\n"),
			tags: ["watchdog", "auto-recorded"],
			// A falsy task id is sent as undefined rather than as an empty value.
			evidenceBead: session.taskId || undefined,
		});
	} catch {
		// Fire-and-forget: a failed recording must never break the watchdog.
	}
}
122
+
/**
 * Load the run id stored in `<agentplateDir>/current-run.txt`.
 *
 * Resolves to the file's trimmed contents, or to null when the file does not
 * exist, cannot be read, or contains only whitespace. Async because the read
 * goes through Bun.file().
 */
async function readCurrentRunId(agentplateDir: string): Promise<string | null> {
	const runFile = Bun.file(join(agentplateDir, "current-run.txt"));

	const present = await runFile.exists();
	if (!present) {
		return null;
	}

	let contents: string;
	try {
		contents = await runFile.text();
	} catch {
		// The file existed a moment ago but could not be read: treat as "no run".
		return null;
	}

	const runId = contents.trim();
	return runId.length === 0 ? null : runId;
}
141
+
/**
 * Defensive lookup of the active run id used by run-completion checks
 * (agentplate-87bf).
 *
 * The id is returned only when `current-run.txt` yields a value AND, if a
 * RunStore is available, that value matches a row in the runs table. For each
 * failure cause a single warning is written to stderr (tracked through
 * `warnState`); after that the function returns null quietly so the caller
 * can skip the check on later ticks without log spam.
 *
 * Deliberately narrow in scope: `readCurrentRunId` itself is untouched and
 * keeps serving event-recording paths, where a stale id is fine as a label.
 *
 * @param agentplateDir - Directory that holds current-run.txt
 * @param runStore - Store used to validate the id, or null when unavailable
 * @param warnState - Mutable dedup state; updated when a warning is emitted
 * @returns The validated run id, or null when the check should be skipped
 */
async function resolveRunIdForCompletionCheck(
	agentplateDir: string,
	runStore: RunStore | null,
	warnState: RunIdWarnState,
): Promise<string | null> {
	const candidate = await readCurrentRunId(agentplateDir);

	if (candidate === null) {
		if (warnState.missingFileWarned) {
			return null;
		}
		// Flip the flag before writing so the warning is attempted only once.
		warnState.missingFileWarned = true;
		process.stderr.write(
			"[WATCHDOG] current-run.txt missing — run-completion checks disabled until restart\n",
		);
		return null;
	}

	if (runStore === null) {
		// RunStore unavailable (rare — sessions.db open failed). Trust the file
		// and let the downstream nudge path proceed; this is no worse than the
		// pre-87bf behavior.
		return candidate;
	}

	// A lookup that throws is handled exactly like a lookup that finds no row.
	const isKnownRun = ((): boolean => {
		try {
			return runStore.getRun(candidate) !== null;
		} catch {
			return false;
		}
	})();

	if (isKnownRun) {
		return candidate;
	}

	if (!warnState.unknownIds.has(candidate)) {
		warnState.unknownIds.add(candidate);
		process.stderr.write(
			`[WATCHDOG] current-run.txt points to unknown run "${candidate}" — run-completion checks disabled until restart\n`,
		);
	}
	return null;
}
191
+
/**
 * Best-effort insert of a watchdog event into the EventStore.
 *
 * Does nothing when no store is supplied. Any error raised while serializing
 * `event.data` or inserting the row is swallowed, so this never throws.
 *
 * @param eventStore - Destination store, or null to skip recording
 * @param event - Event fields; `data` is stored as a JSON string
 */
function recordEvent(
	eventStore: EventStore | null,
	event: {
		runId: string | null;
		agentName: string;
		eventType: "custom" | "mail_sent";
		level: "debug" | "info" | "warn" | "error";
		data: Record<string, unknown>;
	},
): void {
	if (!eventStore) {
		return;
	}

	try {
		const { runId, agentName, eventType, level, data } = event;
		const serialized = JSON.stringify(data);
		eventStore.insert({
			runId,
			agentName,
			// Session and tool fields are always stored as null by this helper.
			sessionId: null,
			eventType,
			toolName: null,
			toolArgs: null,
			toolDurationMs: null,
			level,
			data: serialized,
		});
	} catch {
		// Fire-and-forget: event recording must never break the daemon.
	}
}
222
+
/**
 * Compose the "[WATCHDOG] All N ... " notice sent once every worker in a run
 * has reached a terminal state.
 *
 * A session counts as terminal when it is `completed` (clean exit) or
 * `zombie` (watchdog-killed); agentplate-e130 explains why zombies are
 * run-terminal. If all sessions share one of the known worker capabilities
 * (scout, builder, reviewer, lead, merger), the message names that capability
 * and ends with a phase-specific hint. Otherwise a generic summary listing
 * the sorted capabilities is produced. As soon as one session is a zombie the
 * verb switches from "have completed" to "have terminated" and a
 * "(N completed, M zombie)" qualifier is appended, so the coordinator does
 * not read a partial failure as a clean batch.
 *
 * @param workerSessions - Terminal worker sessions of the run
 * @param runId - Id of the run, embedded verbatim in the message
 * @returns The formatted notification text
 */
export function buildCompletionMessage(
	workerSessions: readonly AgentSession[],
	runId: string,
): string {
	// Phase hint for batches consisting of exactly one known capability.
	const phaseHints = new Map<string, string>([
		["scout", "Ready for next phase."],
		["builder", "Awaiting lead verification."],
		["reviewer", "Reviews done."],
		["lead", "Ready for merge/cleanup."],
		["merger", "Merges done."],
	]);

	const total = workerSessions.length;
	const seenCapabilities = new Set<AgentSession["capability"]>();
	let zombies = 0;
	for (const session of workerSessions) {
		seenCapabilities.add(session.capability);
		if (session.state === "zombie") {
			zombies += 1;
		}
	}

	const anyDied = zombies > 0;
	const verb = anyDied ? "have terminated" : "have completed";
	const qualifier = anyDied ? ` (${total - zombies} completed, ${zombies} zombie)` : "";

	if (seenCapabilities.size === 1) {
		const [only] = seenCapabilities;
		const hint = phaseHints.get(only);
		if (hint !== undefined) {
			return `[WATCHDOG] All ${total} ${only}(s) in run ${runId} ${verb}${qualifier}. ${hint}`;
		}
	}

	const breakdown = [...seenCapabilities].sort().join(", ");
	return `[WATCHDOG] All ${total} worker(s) in run ${runId} ${verb}${qualifier} (${breakdown}). Ready for next steps.`;
}
266
+
267
/**
 * Check if every worker session for the active run has reached a terminal state
 * (`completed` or `zombie`), and if so, nudge the coordinator. Fire-and-forget:
 * never throws.
 *
 * Zombie counts as terminal (agentplate-e130): a watchdog-killed worker is not
 * coming back, so excluding it would strand the coordinator on a run that mixes
 * clean exits with kills.
 *
 * Deduplication: uses a marker file (run-complete-notified.txt) to prevent
 * repeated nudges for the same run ID.
 *
 * @param ctx.store - Session lookup; only `getByRun` is used.
 * @param ctx.runId - Run whose workers are inspected.
 * @param ctx.agentplateDir - Directory that holds the dedup marker file.
 * @param ctx.root - Project root, forwarded to `nudge`.
 * @param ctx.nudge - Delivery function used to notify the "coordinator" agent.
 * @param ctx.eventStore - Optional store that receives a `run_complete` event.
 */
async function checkRunCompletion(ctx: {
	store: { getByRun: (runId: string) => AgentSession[] };
	runId: string;
	agentplateDir: string;
	root: string;
	nudge: (
		projectRoot: string,
		agentName: string,
		message: string,
		force: boolean,
	) => Promise<{ delivered: boolean; reason?: string }>;
	eventStore: EventStore | null;
}): Promise<void> {
	const { store, runId, agentplateDir, root, nudge, eventStore } = ctx;

	// Honor the "never throws" contract: a failed session lookup simply skips
	// the check for this tick. Nothing has been persisted yet, so the next tick
	// retries from scratch.
	let runSessions: AgentSession[];
	try {
		runSessions = store.getByRun(runId);
	} catch {
		return;
	}
	const workerSessions = runSessions.filter((s) => !isPersistentCapability(s.capability));

	if (workerSessions.length === 0) {
		return;
	}

	// `completed` = clean exit, `zombie` = watchdog-killed. Both are terminal
	// for run-completion: a zombie is not coming back, so blocking on it would
	// strand the coordinator forever (agentplate-e130).
	const allTerminal = workerSessions.every((s) => s.state === "completed" || s.state === "zombie");
	if (!allTerminal) {
		return;
	}

	// Dedup: check marker file
	const markerPath = join(agentplateDir, "run-complete-notified.txt");
	try {
		const file = Bun.file(markerPath);
		if (await file.exists()) {
			const existing = await file.text();
			if (existing.trim() === runId) {
				return; // Already notified
			}
		}
	} catch {
		// Read failure is non-fatal — proceed with nudge
	}

	// Nudge the coordinator
	const message = buildCompletionMessage(workerSessions, runId);
	try {
		await nudge(root, "coordinator", message, true);
	} catch {
		// Nudge delivery failure is non-fatal
		// NOTE(review): the dedup marker below is still written when the nudge
		// throws or reports `delivered: false`, so a failed nudge is not retried
		// on later ticks — confirm this is intended.
	}

	// Record the event
	const capabilitiesArr = Array.from(new Set(workerSessions.map((s) => s.capability))).sort();
	const phase = capabilitiesArr.length === 1 ? capabilitiesArr[0] : "mixed";
	const completedAgents = workerSessions
		.filter((s) => s.state === "completed")
		.map((s) => s.agentName);
	const zombieAgents = workerSessions.filter((s) => s.state === "zombie").map((s) => s.agentName);
	recordEvent(eventStore, {
		runId,
		agentName: "watchdog",
		eventType: "custom",
		// A run that ended with at least one killed worker is logged at "warn".
		level: zombieAgents.length > 0 ? "warn" : "info",
		data: {
			type: "run_complete",
			workerCount: workerSessions.length,
			completedAgents,
			zombieAgents,
			capabilities: capabilitiesArr,
			phase,
		},
	});

	// Write dedup marker
	try {
		await Bun.write(markerPath, runId);
	} catch {
		// Marker write failure is non-fatal
	}
}
360
+
361
/**
 * Configuration accepted by both `startDaemon` and `runDaemonTick`.
 *
 * Members prefixed with `_` are dependency-injection seams intended for tests;
 * when one is omitted the real implementation is used.
 */
export interface DaemonOptions {
	/** Project root; the daemon works inside `<root>/.agentplate`. */
	root: string;
	/** Stale threshold in milliseconds, passed to health evaluation. */
	staleThresholdMs: number;
	/** Zombie threshold in milliseconds, passed to health evaluation. */
	zombieThresholdMs: number;
	/** Milliseconds between escalation-level advances. Defaults to 60_000. */
	nudgeIntervalMs?: number;
	/** Enables Tier 1 AI triage during escalation. Defaults to false. */
	tier1Enabled?: boolean;
	/**
	 * Defaults to true. When enabled, the first transition of a session to
	 * `zombie` triggers a synthetic `worker_died` mail addressed to
	 * `session.parentAgent` (agentplate-c111). Otherwise a parent — typically a
	 * lead waiting for `worker_done` — would wait on mail that never arrives.
	 */
	notifyParentOnDeath?: boolean;
	/** Invoked with the health-check result of every monitored session. */
	onHealthCheck?: (check: HealthCheck) => void;

	/** Test seam: tmux liveness/kill operations. */
	_tmux?: {
		isSessionAlive: (name: string) => Promise<boolean>;
		killSession: (name: string) => Promise<void>;
	};
	/** Test seam: process liveness/kill operations (real isProcessAlive/killProcessTree otherwise). */
	_process?: {
		isAlive: (pid: number) => boolean;
		killTree: (pid: number) => Promise<void>;
	};
	/** Test seam: replaces the real triageAgent. */
	_triage?: (options: {
		agentName: string;
		root: string;
		lastActivity: string;
	}) => Promise<TriageResult | "retry" | "terminate" | "extend">;
	/** Upper bound on triage calls within one tick, to cap AI usage. Defaults to 3. */
	_maxTriagePerTick?: number;
	/** Test seam: replaces the real nudgeAgent. */
	_nudge?: (
		projectRoot: string,
		agentName: string,
		message: string,
		force: boolean,
	) => Promise<{ delivered: boolean; reason?: string }>;
	/** Test seam: replaces the real recordFailure. */
	_recordFailure?: (
		root: string,
		session: AgentSession,
		reason: string,
		tier: 0 | 1,
		triageSuggestion?: string,
	) => Promise<void>;
	/** Test seam: replaces the real getConnection. */
	_getConnection?: (name: string) => RuntimeConnection | undefined;
	/** Test seam: replaces the real removeConnection. */
	_removeConnection?: (name: string) => void;
	/** Test seam: tailer registry to use instead of _defaultTailerRegistry. */
	_tailerRegistry?: Map<string, TailerHandle>;
	/** Test seam: tailer constructor to use instead of startEventTailer. */
	_tailerFactory?: (opts: TailerOptions) => TailerHandle;
	/** Test seam: stdout-log discovery to use instead of findLatestStdoutLog. */
	_findLatestStdoutLog?: (agentplateDir: string, agentName: string) => Promise<string | null>;

	/** Test seam: supplies the EventStore (or `null`) instead of opening one. */
	_eventStore?: EventStore | null;
	/** Test seam: supplies the MailStore (or `null`) used for decision-gate detection. */
	_mailStore?: MailStore | null;
	/**
	 * Test seam: supplies the RunStore. An explicit `null` skips run-id
	 * validation (file presence still gates the warning). When omitted, a real
	 * RunStore is opened against `.agentplate/sessions.db`.
	 */
	_runStore?: RunStore | null;
	/**
	 * Test seam: replaces the module-level run-id warning state so every test
	 * starts with a clean dedup slate (agentplate-87bf).
	 */
	_runIdWarnState?: RunIdWarnState;
}
436
+
437
/**
 * Launch the watchdog daemon: one tick right away, then one every
 * `intervalMs` milliseconds.
 *
 * Each tick is delegated to `runDaemonTick`; a rejected tick is swallowed so
 * the daemon keeps running. See `runDaemonTick` for what a tick does.
 *
 * NOTE(review): ticks are scheduled with `setInterval` and are not awaited, so
 * a tick that outlasts `intervalMs` may overlap with the next one — confirm
 * this is acceptable for the stores and the tailer registry.
 *
 * @param options - Daemon options plus the polling interval in milliseconds.
 * @returns A handle whose `stop()` cancels the interval, stops every tailer in
 *   the registry (injected or default) and removes it from the registry.
 */
export function startDaemon(options: DaemonOptions & { intervalMs: number }): { stop: () => void } {
	const { intervalMs, _tailerRegistry } = options;
	const registry = _tailerRegistry ?? _defaultTailerRegistry;

	const tick = (): void => {
		runDaemonTick(options).catch(() => {
			// Swallow tick errors — daemon must not crash
		});
	};

	// First tick fires immediately; later ones follow the interval.
	tick();
	const timer = setInterval(() => {
		tick();
	}, intervalMs);

	const stop = (): void => {
		clearInterval(timer);
		for (const [agentName, tailer] of registry) {
			tailer.stop();
			registry.delete(agentName);
		}
	};

	return { stop };
}
483
+
484
/**
 * Terminate an agent, choosing the mechanism from what is known about it.
 *
 * 1. If a RuntimeConnection is registered for the agent, `abort()` is tried
 *    first and the connection is unregistered afterwards regardless of the
 *    outcome. A successful abort ends the procedure.
 * 2. Otherwise (no connection, or abort threw):
 *    - Headless agent (`tmuxSession === ""`): tmux is never touched, because
 *      an empty `-t` target prefix-matches every session and would kill the
 *      whole swarm (agentplate-74ce). The process tree is killed when a pid is
 *      recorded; with `pid === null` (spawn-per-turn agent between turns)
 *      nothing is done.
 *    - TUI agent: the named tmux session is killed, but only when `tmuxAlive`
 *      so a dead session does not raise "session not found".
 *
 * Errors from `killTree` / `killSession` are ignored — the target may already
 * be gone.
 */
async function killAgent(ctx: {
	session: AgentSession;
	tmuxAlive: boolean;
	tmux: { killSession: (name: string) => Promise<void> };
	process: { killTree: (pid: number) => Promise<void> };
	getConnection: (name: string) => RuntimeConnection | undefined;
	removeConnection: (name: string) => void;
}): Promise<void> {
	const { session } = ctx;

	// Step 1: runtime-agnostic abort through the registered connection.
	const connection = ctx.getConnection(session.agentName);
	if (connection) {
		let abortFailed = false;
		try {
			await connection.abort();
		} catch {
			abortFailed = true;
		}
		ctx.removeConnection(session.agentName);
		if (!abortFailed) {
			return;
		}
		// abort() threw — continue with the PID/tmux path as defense-in-depth.
	}

	// Step 2a: TUI agents own a named tmux session.
	if (session.tmuxSession !== "") {
		if (!ctx.tmuxAlive) {
			return;
		}
		try {
			await ctx.tmux.killSession(session.tmuxSession);
		} catch {
			// Session may have died between check and kill — not an error
		}
		return;
	}

	// Step 2b: headless agents must never reach tmux.killSession
	// (agentplate-74ce). Without a pid there is nothing to kill: the next
	// dispatch spawns a fresh process.
	if (session.pid === null) {
		return;
	}
	try {
		await ctx.process.killTree(session.pid);
	} catch {
		// Already exited — not an error
	}
}
554
+
555
/**
 * Tell a worker's parent that the watchdog terminated the worker
 * (agentplate-c111) by inserting a high-priority `worker_died` mail, then
 * record a `mail_sent` event. Fire-and-forget: never throws.
 *
 * Nothing happens when no MailStore is available or the session has no
 * parent. If inserting the mail fails, the event is not recorded either.
 *
 * Callers are expected to invoke this only on the first transition of a
 * session to `zombie`; this function performs no deduplication of its own.
 *
 * @param ctx.session - The terminated worker's session.
 * @param ctx.mailStore - Mail store, or `null` to skip the notification.
 * @param ctx.reason - Human-readable termination reason.
 * @param ctx.tier - Watchdog tier that terminated the worker (0 or 1).
 * @param ctx.eventStore - Optional store for the `mail_sent` event.
 * @param ctx.runId - Run id attached to the recorded event.
 */
function notifyParentOfDeath(ctx: {
	session: AgentSession;
	mailStore: MailStore | null;
	reason: string;
	tier: 0 | 1;
	eventStore: EventStore | null;
	runId: string | null;
}): void {
	const { session, mailStore, reason, tier } = ctx;
	if (mailStore === null || session.parentAgent === null) {
		return;
	}
	const parent = session.parentAgent;

	// Structured payload carried alongside the human-readable body.
	const payload: WorkerDiedPayload = {
		agentName: session.agentName,
		capability: session.capability,
		taskId: session.taskId,
		reason,
		lastActivity: session.lastActivity,
		terminatedBy: tier === 0 ? "tier0" : "tier1",
	};

	const body = [
		`Worker "${session.agentName}" (${session.capability}) on task ${session.taskId} `,
		`was terminated by the watchdog. Reason: ${reason}. `,
		`Last activity: ${session.lastActivity}. `,
		`Decide whether to retry the work, escalate, or report the failure upstream.`,
	].join("");

	try {
		mailStore.insert({
			id: "",
			from: session.agentName,
			to: parent,
			subject: `[WATCHDOG] worker_died: ${session.agentName}`,
			body,
			type: "worker_died",
			priority: "high",
			threadId: null,
			payload: JSON.stringify(payload),
		});
	} catch {
		// Mail-send failure must never crash the watchdog.
		return;
	}

	recordEvent(ctx.eventStore, {
		runId: ctx.runId,
		agentName: session.agentName,
		eventType: "mail_sent",
		level: "warn",
		data: {
			type: "worker_died",
			parent: session.parentAgent,
			reason,
			tier,
		},
	});
}
618
+
619
/**
 * Run a single daemon tick. Exported for testing — allows direct invocation
 * of the monitoring logic without starting the interval-based daemon loop.
 *
 * A tick:
 *  1. Opens the session store and — unless injected — the run, mail and event
 *     stores (failure to open any of the optional stores is non-fatal).
 *  2. For every non-completed session: manages the stdout tailer for headless
 *     agents, determines liveness, evaluates health, advances the session
 *     state through the matrix-guarded `tryTransitionState`, and acts on the
 *     resulting action (terminate / investigate / complete / escalate / none).
 *  3. Stops tailers whose agent is no longer an active headless agent.
 *  4. Runs the run-level completion check when a validated run id is available.
 *  5. Closes the session store and every store this tick opened itself.
 *
 * @param options - Same options as startDaemon (minus intervalMs)
 */
export async function runDaemonTick(options: DaemonOptions): Promise<void> {
	const {
		root,
		staleThresholdMs,
		zombieThresholdMs,
		nudgeIntervalMs = 60_000,
		tier1Enabled = false,
		notifyParentOnDeath = true,
		onHealthCheck,
	} = options;
	const tmux = options._tmux ?? { isSessionAlive, killSession };
	const proc = options._process ?? { isAlive: isProcessAlive, killTree: killProcessTree };
	const triage = options._triage ?? triageAgent;
	const nudge = options._nudge ?? nudgeAgent;
	const recordFailureFn = options._recordFailure ?? recordFailure;
	const getConn = options._getConnection ?? getConnection;
	const removeConn = options._removeConnection ?? removeConnection;
	const tailerRegistry = options._tailerRegistry ?? _defaultTailerRegistry;
	const tailerFactory = options._tailerFactory ?? startEventTailer;
	const findStdoutLog = options._findLatestStdoutLog ?? findLatestStdoutLog;
	const maxTriagePerTick = options._maxTriagePerTick ?? 3;
	// Shared mutable counter so every escalation in this tick sees the same budget.
	const triageCount = { value: 0 };
	const runIdWarnState = options._runIdWarnState ?? _defaultRunIdWarnState;

	// Upper bound for a single RuntimeConnection.getState() call.
	const getStateTimeoutMs = 5000;

	const agentplateDir = join(root, ".agentplate");
	const { store } = openSessionStore(agentplateDir);

	// Open RunStore for run-id validation (agentplate-87bf). Sharing sessions.db
	// is intentional — same file, WAL mode covers concurrent reads.
	let runStore: RunStore | null = null;
	let ownRunStore = false;
	if (options._runStore !== undefined) {
		runStore = options._runStore;
	} else {
		try {
			runStore = createRunStore(join(agentplateDir, "sessions.db"));
			ownRunStore = true;
		} catch {
			// RunStore creation failure is non-fatal — id validation is then skipped.
		}
	}

	// Open MailStore for decision gate detection (fire-and-forget: non-fatal if unavailable)
	let mailStore: MailStore | null = null;
	let ownMailStore = false;
	if (options._mailStore !== undefined) {
		mailStore = options._mailStore;
	} else {
		try {
			mailStore = createMailStore(join(agentplateDir, "mail.db"));
			ownMailStore = true;
		} catch {
			// MailStore creation failure is non-fatal — decision gate detection will be skipped
		}
	}

	// Open EventStore for recording daemon events (fire-and-forget)
	let eventStore: EventStore | null = null;
	let runId: string | null = null;
	const useInjectedEventStore = options._eventStore !== undefined;
	if (useInjectedEventStore) {
		eventStore = options._eventStore ?? null;
	} else {
		try {
			const eventsDbPath = join(agentplateDir, "events.db");
			eventStore = createEventStore(eventsDbPath);
		} catch {
			// EventStore creation failure is non-fatal for the daemon
		}
	}
	try {
		runId = await readCurrentRunId(agentplateDir);
	} catch {
		// Reading run ID failure is non-fatal
	}

	try {
		const thresholds = {
			staleMs: staleThresholdMs,
			zombieMs: zombieThresholdMs,
		};

		const sessions = store.getAll();

		// Track active headless agents to clean up stale tailers after the loop.
		const activeHeadlessAgents = new Set<string>();
		const eventsDbPath = join(agentplateDir, "events.db");
		const sessionsDbPath = join(agentplateDir, "sessions.db");

		for (const session of sessions) {
			// Skip completed sessions — they are terminal and don't need monitoring
			if (session.state === "completed") {
				continue;
			}

			// ZFC: Don't skip zombies. Re-check tmux liveness on every tick.
			// A zombie with a live tmux session needs investigation, not silence.

			// Event tailer management: start a background NDJSON tailer for each
			// active headless agent that doesn't already have one running.
			// Tailers persist between ticks (module-level registry) so events are
			// continuously written to events.db while the agent is working.
			//
			// Both long-lived headless (pid !== null) and spawn-per-turn workers
			// (pid === null, agentplate-7a34) emit stream-json to stdout.log, so
			// either pattern needs a tailer.
			if (session.tmuxSession === "") {
				activeHeadlessAgents.add(session.agentName);
				if (!tailerRegistry.has(session.agentName)) {
					// Discover the latest stdout.log for this agent and start tailing.
					const logPath = await findStdoutLog(agentplateDir, session.agentName);
					if (logPath) {
						const handle = tailerFactory({
							stdoutLogPath: logPath,
							agentName: session.agentName,
							runId,
							eventsDbPath,
							sessionsDbPath,
						});
						tailerRegistry.set(session.agentName, handle);
					}
				}
			}

			// === Liveness check ===
			// Prefer RuntimeConnection.getState() when a connection is registered. Fall
			// back to tmux liveness when no connection exists. For headless agents without
			// a connection, use event-based activity detection to refresh lastActivity.
			const conn = getConn(session.agentName);
			let tmuxAlive: boolean;

			if (conn) {
				// Handle of the timeout timer, cleared as soon as the race settles so
				// a fast getState() does not leave a pending timer behind on every tick.
				let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
				try {
					const state = await Promise.race([
						conn.getState(),
						new Promise<never>((_, reject) => {
							timeoutHandle = setTimeout(
								() => reject(new Error("getState timed out")),
								getStateTimeoutMs,
							);
						}),
					]).finally(() => clearTimeout(timeoutHandle));
					// Map ConnectionState → liveness:
					//   idle | working → alive (running)
					//   error          → not alive (exited)
					if (state.status === "idle" || state.status === "working") {
						tmuxAlive = true;
						store.updateLastActivity(session.agentName);
						session.lastActivity = new Date().toISOString();
					} else {
						tmuxAlive = false;
					}
				} catch {
					// getState() failed/timed out — drop stale connection, fall back to tmux
					removeConn(session.agentName);
					tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
				}
			} else {
				tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);

				// Headless agents without a registered connection: event-based
				// activity detection to avoid false-positive stale. Covers both
				// long-lived headless (e.g. after a process restart) and
				// spawn-per-turn workers between turns where lastActivity is
				// the only liveness signal (agentplate-7a34).
				if (session.tmuxSession === "" && eventStore) {
					try {
						const recentEvents = eventStore.getByAgent(session.agentName, {
							since: new Date(Date.now() - staleThresholdMs).toISOString(),
							limit: 1,
						});
						if (recentEvents.length > 0) {
							store.updateLastActivity(session.agentName);
							session.lastActivity = new Date().toISOString();
						}
					} catch {
						// Non-fatal: event store query failure should not affect monitoring
					}
				}
			}
			const check = evaluateHealth(session, tmuxAlive, thresholds);

			// Snapshot the pre-tick state so the worker_died notify path can
			// dedupe across re-ticks (agentplate-c111). Subsequent `tryTransitionState`
			// calls below mutate session.state, and the matrix allows the idempotent
			// `zombie → zombie` self-transition — both would erase the dedup signal.
			const stateBeforeTick = session.state;

			// Transition state forward only (investigate action holds state).
			// `transitionState` computes the watchdog's preferred target;
			// `tryTransitionState` is the matrix-guarded CAS — `completed → *`
			// is rejected here so a properly-completed agent cannot be
			// reclassified as zombie by a late watchdog tick (agentplate-a993).
			const newState = transitionState(session.state, check);
			if (newState !== session.state) {
				const outcome = store.tryTransitionState(session.agentName, newState);
				if (outcome.ok) {
					session.state = newState;
				} else if (outcome.reason === "illegal_transition") {
					// Resync local mirror — another writer settled state durably.
					session.state = outcome.prev;
				}
			}

			if (onHealthCheck) {
				onHealthCheck(check);
			}

			if (check.action === "terminate") {
				// Record the failure via loam (Tier 0 detection)
				const reason = check.reconciliationNote ?? "Process terminated";
				await recordFailureFn(root, session, reason, 0);

				// Kill the agent: prefer conn.abort(), fall back to PID/tmux
				await killAgent({
					session,
					tmuxAlive,
					tmux,
					process: proc,
					getConnection: getConn,
					removeConnection: removeConn,
				});
				// Matrix-guarded: rejected when state is `completed` so a clean
				// `ap stop` cannot be silently downgraded to zombie by a late
				// watchdog termination (agentplate-a993).
				const outcome = store.tryTransitionState(session.agentName, "zombie");
				// Reset escalation tracking on terminal state
				store.updateEscalation(session.agentName, 0, null);
				if (outcome.ok) {
					session.state = "zombie";
					// First-time zombify: notify parent so it doesn't block on
					// missing `worker_done` mail (agentplate-c111). Dedup uses the
					// pre-tick snapshot because the matrix allows the idempotent
					// zombie → zombie transition (both `outcome.ok` and the earlier
					// transitionState call would otherwise mask re-ticks).
					if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
						notifyParentOfDeath({
							session,
							mailStore,
							reason,
							tier: 0,
							eventStore,
							runId,
						});
					}
				} else if (outcome.reason === "illegal_transition") {
					session.state = outcome.prev;
				}
				session.escalationLevel = 0;
				session.stalledSince = null;
			} else if (check.action === "investigate") {
				// ZFC: tmux alive but SessionStore says zombie.
				// Log the conflict but do NOT auto-kill.
				// The onHealthCheck callback surfaces this to the operator.
				// No state change — keep zombie until a human or higher-tier agent decides.
			} else if (check.action === "complete") {
				// ZFC fallback: tmux/pid is gone AND lastActivity is stale —
				// the agent looks like it finished naturally and only the
				// session-end hook missed (agentplate-e74b). Mark completed
				// without killing (process is already gone) and without
				// notifying parents of death (this is not a crash).
				const outcome = store.tryTransitionState(session.agentName, "completed");
				if (outcome.ok) {
					session.state = "completed";
				} else if (outcome.reason === "illegal_transition") {
					session.state = outcome.prev;
				}
				store.updateEscalation(session.agentName, 0, null);
				session.escalationLevel = 0;
				session.stalledSince = null;
			} else if (check.action === "escalate") {
				// Decision gate check: if the agent sent a decision_gate message, it is
				// intentionally paused waiting for a human decision — not a stall.
				// Skip watchdog escalation and clear any accumulated stall state.
				if (mailStore !== null) {
					let hasPendingDecisionGate = false;
					try {
						const recentMail = mailStore.getAll({ from: session.agentName, limit: 20 });
						hasPendingDecisionGate = recentMail.some((m) => m.type === "decision_gate");
					} catch {
						// Non-fatal: a mail query failure must not abort the tick and
						// leave the remaining sessions unmonitored. Proceed as if no
						// gate was found — the same path taken when no MailStore is
						// available at all.
					}
					if (hasPendingDecisionGate) {
						if (session.stalledSince !== null) {
							store.updateEscalation(session.agentName, 0, null);
							session.stalledSince = null;
							session.escalationLevel = 0;
						}
						continue;
					}
				}

				// Progressive nudging: increment escalation level based on elapsed time
				// instead of immediately delegating to AI triage.

				// Initialize stalledSince on first escalation detection
				if (session.stalledSince === null) {
					session.stalledSince = new Date().toISOString();
					session.escalationLevel = 0;
					store.updateEscalation(session.agentName, 0, session.stalledSince);
				}

				// Check if enough time has passed to advance to the next escalation level
				const stalledMs = Date.now() - new Date(session.stalledSince).getTime();
				const expectedLevel = Math.min(
					Math.floor(stalledMs / nudgeIntervalMs),
					MAX_ESCALATION_LEVEL,
				);

				// Levels only ever move up here; they are reset on recovery or termination.
				if (expectedLevel > session.escalationLevel) {
					session.escalationLevel = expectedLevel;
					store.updateEscalation(session.agentName, expectedLevel, session.stalledSince);
				}

				// Execute the action for the current escalation level
				const actionResult = await executeEscalationAction({
					session,
					root,
					tmuxAlive,
					tier1Enabled,
					tmux,
					process: proc,
					triage,
					nudge,
					eventStore,
					runId,
					recordFailure: recordFailureFn,
					triageCount,
					maxTriagePerTick,
					getConnection: getConn,
					removeConnection: removeConn,
				});

				if (actionResult.terminated) {
					// Matrix-guarded: completed → zombie is rejected (agentplate-a993).
					const outcome = store.tryTransitionState(session.agentName, "zombie");
					store.updateEscalation(session.agentName, 0, null);
					if (outcome.ok) {
						session.state = "zombie";
						// First-time zombify: notify parent so it doesn't block on
						// missing `worker_done` mail (agentplate-c111). Dedup via
						// the pre-tick snapshot — see the terminate branch above.
						if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
							notifyParentOfDeath({
								session,
								mailStore,
								reason: actionResult.deathReason ?? "Watchdog escalation terminated agent",
								tier: actionResult.deathTier ?? 0,
								eventStore,
								runId,
							});
						}
					} else if (outcome.reason === "illegal_transition") {
						session.state = outcome.prev;
					}
					session.escalationLevel = 0;
					session.stalledSince = null;
				}
			} else if (check.action === "none" && session.stalledSince !== null) {
				// Agent recovered — reset escalation tracking
				store.updateEscalation(session.agentName, 0, null);
				session.stalledSince = null;
				session.escalationLevel = 0;
			}
		}

		// === Tailer cleanup ===
		// Stop tailers for any headless agent that is no longer in the active set
		// (i.e. completed, removed from store, or was never a headless agent).
		for (const [name, handle] of tailerRegistry) {
			if (!activeHeadlessAgents.has(name)) {
				handle.stop();
				tailerRegistry.delete(name);
			}
		}

		// === Run-level completion detection ===
		// After monitoring individual sessions, check if the entire run is done.
		// Re-resolve the run id defensively (agentplate-87bf): a missing
		// current-run.txt or a stale id (no row in runs table) skips the check
		// and emits one warning per cause for the lifetime of this watchdog.
		const validatedRunId = await resolveRunIdForCompletionCheck(
			agentplateDir,
			runStore,
			runIdWarnState,
		);
		if (validatedRunId) {
			await checkRunCompletion({
				store,
				runId: validatedRunId,
				agentplateDir,
				root,
				nudge,
				eventStore,
			});
		}
	} finally {
		store.close();
		// Close MailStore only if we created it (not injected)
		if (mailStore && ownMailStore) {
			try {
				mailStore.close();
			} catch {
				// Non-fatal
			}
		}
		// Close EventStore only if we created it (not injected)
		if (eventStore && !useInjectedEventStore) {
			try {
				eventStore.close();
			} catch {
				// Non-fatal
			}
		}
		// Close RunStore only if we created it (not injected)
		if (runStore && ownRunStore) {
			try {
				runStore.close();
			} catch {
				// Non-fatal
			}
		}
	}
}
1041
+
1042
/**
 * Carry out the watchdog response that matches the escalation level currently
 * stored on the session.
 *
 * - Level 0 (warn): only records a warn event; the onHealthCheck callback has
 *   already fired by the time this runs.
 * - Level 1 (nudge): sends a forced nudge and records whether it was delivered.
 * - Level 2 (escalate): runs Tier 1 AI triage when `tier1Enabled` is set and the
 *   per-tick triage budget is not exhausted. A "terminate" verdict kills the
 *   agent, a "retry" verdict sends a recovery nudge, and any other verdict
 *   leaves the agent untouched.
 * - Level 3 and above (terminate): records the failure and kills the agent.
 *
 * @param ctx - Session under inspection plus the injected collaborators
 *   (tmux/process controls, triage, nudge, event store, failure recorder,
 *   runtime-connection accessors) and the shared per-tick triage counter.
 * @returns Flags telling the caller whether the agent was terminated and
 *   whether state changed; `deathReason` / `deathTier` are present only when
 *   the agent was terminated.
 */
async function executeEscalationAction(ctx: {
	session: AgentSession;
	root: string;
	tmuxAlive: boolean;
	tier1Enabled: boolean;
	tmux: {
		isSessionAlive: (name: string) => Promise<boolean>;
		killSession: (name: string) => Promise<void>;
	};
	process: {
		killTree: (pid: number) => Promise<void>;
	};
	triage: (options: {
		agentName: string;
		root: string;
		lastActivity: string;
	}) => Promise<TriageResult | "retry" | "terminate" | "extend">;
	/** Counter shared by every escalation call within one tick; used to enforce maxTriagePerTick. */
	triageCount: { value: number };
	/** Upper bound on triage invocations during a single daemon tick. Default: 3. */
	maxTriagePerTick: number;
	nudge: (
		projectRoot: string,
		agentName: string,
		message: string,
		force: boolean,
	) => Promise<{ delivered: boolean; reason?: string }>;
	eventStore: EventStore | null;
	runId: string | null;
	recordFailure: (
		root: string,
		session: AgentSession,
		reason: string,
		tier: 0 | 1,
		triageSuggestion?: string,
	) => Promise<void>;
	getConnection: (name: string) => RuntimeConnection | undefined;
	removeConnection: (name: string) => void;
}): Promise<{
	terminated: boolean;
	stateChanged: boolean;
	/** Why and at which tier the agent was killed (present only when `terminated` is true). */
	deathReason?: string;
	deathTier?: 0 | 1;
}> {
	const { session, root, eventStore, runId, triageCount } = ctx;
	const { triage, nudge, recordFailure } = ctx;
	const agentName = session.agentName;

	// Fresh "nothing happened" result for every non-terminating path.
	const stillRunning = () => ({ terminated: false, stateChanged: false });

	// Emit a custom watchdog event for this agent at the given severity.
	const emit = (level: "warn" | "error", data: Record<string, unknown>): void => {
		recordEvent(eventStore, {
			runId,
			agentName,
			eventType: "custom",
			level,
			data,
		});
	};

	// Forced nudge (bypasses debounce). Delivery errors are swallowed because a
	// failed nudge must not break the watchdog; they are reported as "not delivered".
	const tryNudge = async (message: string): Promise<boolean> => {
		try {
			const outcome = await nudge(root, agentName, message, true);
			return outcome.delivered;
		} catch {
			return false;
		}
	};

	// Kill the agent and build the "terminated" result handed back to the caller.
	const killAndReport = async (deathReason: string, deathTier: 0 | 1) => {
		await killAgent({
			session,
			tmuxAlive: ctx.tmuxAlive,
			tmux: ctx.tmux,
			process: ctx.process,
			getConnection: ctx.getConnection,
			removeConnection: ctx.removeConnection,
		});
		return { terminated: true, stateChanged: true, deathReason, deathTier };
	};

	const level = session.escalationLevel;

	// Level 0: warn — nothing to do beyond logging the escalation.
	if (level === 0) {
		emit("warn", { type: "escalation", escalationLevel: 0, action: "warn" });
		return stillRunning();
	}

	// Level 1: nudge — poke the agent and log the delivery outcome.
	if (level === 1) {
		const delivered = await tryNudge(
			`[WATCHDOG] Agent "${session.agentName}" appears stalled. Please check your current task and report status.`,
		);
		emit("warn", { type: "nudge", escalationLevel: 1, delivered });
		return stillRunning();
	}

	// Level 2: escalate — ask Tier 1 AI triage for a verdict.
	if (level === 2) {
		// Skip silently when Tier 1 is disabled (escalation proceeds to level 3 later)
		// or when this tick has already used up its triage budget.
		if (!ctx.tier1Enabled || triageCount.value >= ctx.maxTriagePerTick) {
			return stillRunning();
		}
		triageCount.value++;

		const raw = await triage({
			agentName: session.agentName,
			root,
			lastActivity: session.lastActivity,
		});
		// A bare verdict string is still accepted for backward compatibility.
		const result: TriageResult =
			typeof raw === "string" ? { verdict: raw, fallback: false } : raw;

		emit("warn", {
			type: "triage",
			escalationLevel: 2,
			verdict: result.verdict,
			triageFailed: result.fallback,
		});

		if (result.verdict === "terminate") {
			// Tier 1 failure record goes to loam before the agent is killed.
			const triageReason = "AI triage classified as terminal failure";
			await recordFailure(root, session, triageReason, 1, result.verdict);
			return killAndReport(triageReason, 1);
		}

		if (result.verdict === "retry") {
			await tryNudge(
				"[WATCHDOG] Triage suggests recovery is possible. " +
					"Please retry your current operation or check for errors.",
			);
		}

		// Both "retry" (after its nudge) and "extend" keep the session alive.
		return stillRunning();
	}

	// Level 3+: terminate — log, record the Tier 0 failure via loam, then kill.
	emit("error", { type: "escalation", escalationLevel: 3, action: "terminate" });

	const escalationReason = "Progressive escalation reached terminal level";
	await recordFailure(root, session, escalationReason, 0);
	return killAndReport(escalationReason, 0);
}