@ag-eco/agentplate-cli 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +462 -0
- package/agents/ap-co-creation.md +90 -0
- package/agents/builder.md +144 -0
- package/agents/coordinator.md +377 -0
- package/agents/lead.md +435 -0
- package/agents/merger.md +164 -0
- package/agents/monitor.md +214 -0
- package/agents/orchestrator.md +239 -0
- package/agents/reviewer.md +140 -0
- package/agents/scout.md +125 -0
- package/agents/supervisor.md +427 -0
- package/package.json +66 -0
- package/src/agents/capabilities.test.ts +85 -0
- package/src/agents/capabilities.ts +125 -0
- package/src/agents/checkpoint.test.ts +88 -0
- package/src/agents/checkpoint.ts +101 -0
- package/src/agents/copilot-hooks-deployer.test.ts +162 -0
- package/src/agents/copilot-hooks-deployer.ts +93 -0
- package/src/agents/guard-rules.test.ts +372 -0
- package/src/agents/guard-rules.ts +97 -0
- package/src/agents/headless-mail-injector.test.ts +709 -0
- package/src/agents/headless-mail-injector.ts +377 -0
- package/src/agents/headless-prompt.test.ts +102 -0
- package/src/agents/headless-prompt.ts +68 -0
- package/src/agents/hooks-deployer.test.ts +3119 -0
- package/src/agents/hooks-deployer.ts +804 -0
- package/src/agents/identity.test.ts +604 -0
- package/src/agents/identity.ts +384 -0
- package/src/agents/lifecycle.test.ts +196 -0
- package/src/agents/lifecycle.ts +183 -0
- package/src/agents/mail-poll-detect.test.ts +153 -0
- package/src/agents/mail-poll-detect.ts +73 -0
- package/src/agents/manifest.test.ts +1026 -0
- package/src/agents/manifest.ts +376 -0
- package/src/agents/overlay.test.ts +1058 -0
- package/src/agents/overlay.ts +490 -0
- package/src/agents/scope-detect.test.ts +190 -0
- package/src/agents/scope-detect.ts +146 -0
- package/src/agents/turn-lock.test.ts +181 -0
- package/src/agents/turn-lock.ts +235 -0
- package/src/agents/turn-runner-dispatch.test.ts +182 -0
- package/src/agents/turn-runner-dispatch.ts +105 -0
- package/src/agents/turn-runner.test.ts +2312 -0
- package/src/agents/turn-runner.ts +1383 -0
- package/src/beads/client.test.ts +217 -0
- package/src/beads/client.ts +230 -0
- package/src/beads/molecules.test.ts +338 -0
- package/src/beads/molecules.ts +198 -0
- package/src/commands/agents.test.ts +328 -0
- package/src/commands/agents.ts +299 -0
- package/src/commands/clean.test.ts +797 -0
- package/src/commands/clean.ts +791 -0
- package/src/commands/completions.test.ts +348 -0
- package/src/commands/completions.ts +981 -0
- package/src/commands/coordinator.test.ts +2975 -0
- package/src/commands/coordinator.ts +1841 -0
- package/src/commands/costs.test.ts +1183 -0
- package/src/commands/costs.ts +599 -0
- package/src/commands/dashboard.test.ts +954 -0
- package/src/commands/dashboard.ts +1212 -0
- package/src/commands/discover.test.ts +288 -0
- package/src/commands/discover.ts +202 -0
- package/src/commands/doctor.test.ts +303 -0
- package/src/commands/doctor.ts +311 -0
- package/src/commands/ecosystem.test.ts +226 -0
- package/src/commands/ecosystem.ts +248 -0
- package/src/commands/errors.test.ts +654 -0
- package/src/commands/errors.ts +197 -0
- package/src/commands/feed.test.ts +709 -0
- package/src/commands/feed.ts +260 -0
- package/src/commands/group.test.ts +475 -0
- package/src/commands/group.ts +546 -0
- package/src/commands/hooks.test.ts +458 -0
- package/src/commands/hooks.ts +263 -0
- package/src/commands/init.test.ts +1011 -0
- package/src/commands/init.ts +967 -0
- package/src/commands/inspect.test.ts +1239 -0
- package/src/commands/inspect.ts +648 -0
- package/src/commands/log.test.ts +1913 -0
- package/src/commands/log.ts +958 -0
- package/src/commands/logs.test.ts +801 -0
- package/src/commands/logs.ts +483 -0
- package/src/commands/mail.test.ts +1501 -0
- package/src/commands/mail.ts +848 -0
- package/src/commands/merge.test.ts +864 -0
- package/src/commands/merge.ts +381 -0
- package/src/commands/metrics.test.ts +458 -0
- package/src/commands/metrics.ts +129 -0
- package/src/commands/monitor.test.ts +191 -0
- package/src/commands/monitor.ts +409 -0
- package/src/commands/nudge.test.ts +579 -0
- package/src/commands/nudge.ts +646 -0
- package/src/commands/orchestrator.ts +42 -0
- package/src/commands/prime.test.ts +612 -0
- package/src/commands/prime.ts +359 -0
- package/src/commands/replay.test.ts +757 -0
- package/src/commands/replay.ts +231 -0
- package/src/commands/run.test.ts +469 -0
- package/src/commands/run.ts +353 -0
- package/src/commands/serve/agent-actions.test.ts +210 -0
- package/src/commands/serve/agent-actions.ts +192 -0
- package/src/commands/serve/build.test.ts +202 -0
- package/src/commands/serve/build.ts +206 -0
- package/src/commands/serve/coordinator-actions.test.ts +339 -0
- package/src/commands/serve/coordinator-actions.ts +410 -0
- package/src/commands/serve/dev.test.ts +168 -0
- package/src/commands/serve/dev.ts +117 -0
- package/src/commands/serve/mail-actions.test.ts +312 -0
- package/src/commands/serve/mail-actions.ts +167 -0
- package/src/commands/serve/rest.test.ts +1680 -0
- package/src/commands/serve/rest.ts +1130 -0
- package/src/commands/serve/static.ts +51 -0
- package/src/commands/serve/ws.test.ts +361 -0
- package/src/commands/serve/ws.ts +332 -0
- package/src/commands/serve.test.ts +459 -0
- package/src/commands/serve.ts +654 -0
- package/src/commands/sling.test.ts +1583 -0
- package/src/commands/sling.ts +1351 -0
- package/src/commands/spec.test.ts +179 -0
- package/src/commands/spec.ts +105 -0
- package/src/commands/status.test.ts +614 -0
- package/src/commands/status.ts +403 -0
- package/src/commands/stop.test.ts +964 -0
- package/src/commands/stop.ts +319 -0
- package/src/commands/supervisor.test.ts +185 -0
- package/src/commands/supervisor.ts +537 -0
- package/src/commands/trace.test.ts +762 -0
- package/src/commands/trace.ts +205 -0
- package/src/commands/update.test.ts +466 -0
- package/src/commands/update.ts +263 -0
- package/src/commands/upgrade.test.ts +48 -0
- package/src/commands/upgrade.ts +240 -0
- package/src/commands/watch.test.ts +257 -0
- package/src/commands/watch.ts +308 -0
- package/src/commands/worktree.test.ts +1297 -0
- package/src/commands/worktree.ts +451 -0
- package/src/config.test.ts +1535 -0
- package/src/config.ts +1064 -0
- package/src/doctor/agents.test.ts +523 -0
- package/src/doctor/agents.ts +399 -0
- package/src/doctor/config-check.test.ts +191 -0
- package/src/doctor/config-check.ts +183 -0
- package/src/doctor/consistency.test.ts +807 -0
- package/src/doctor/consistency.ts +347 -0
- package/src/doctor/databases.test.ts +350 -0
- package/src/doctor/databases.ts +243 -0
- package/src/doctor/dependencies.test.ts +296 -0
- package/src/doctor/dependencies.ts +272 -0
- package/src/doctor/ecosystem.test.ts +308 -0
- package/src/doctor/ecosystem.ts +156 -0
- package/src/doctor/logs.test.ts +253 -0
- package/src/doctor/logs.ts +295 -0
- package/src/doctor/merge-queue.test.ts +315 -0
- package/src/doctor/merge-queue.ts +167 -0
- package/src/doctor/providers.test.ts +409 -0
- package/src/doctor/providers.ts +250 -0
- package/src/doctor/serve.test.ts +95 -0
- package/src/doctor/serve.ts +86 -0
- package/src/doctor/structure.test.ts +423 -0
- package/src/doctor/structure.ts +285 -0
- package/src/doctor/types.ts +43 -0
- package/src/doctor/version.test.ts +241 -0
- package/src/doctor/version.ts +132 -0
- package/src/doctor/watchdog.test.ts +167 -0
- package/src/doctor/watchdog.ts +214 -0
- package/src/e2e/init-sling-lifecycle.test.ts +283 -0
- package/src/errors.test.ts +350 -0
- package/src/errors.ts +217 -0
- package/src/events/store.test.ts +660 -0
- package/src/events/store.ts +369 -0
- package/src/events/tailer.test.ts +719 -0
- package/src/events/tailer.ts +332 -0
- package/src/events/tool-filter.test.ts +330 -0
- package/src/events/tool-filter.ts +126 -0
- package/src/index.ts +533 -0
- package/src/insights/analyzer.test.ts +466 -0
- package/src/insights/analyzer.ts +203 -0
- package/src/insights/quality-gates.test.ts +141 -0
- package/src/insights/quality-gates.ts +156 -0
- package/src/json.test.ts +72 -0
- package/src/json.ts +53 -0
- package/src/loam/client.test.ts +752 -0
- package/src/loam/client.ts +664 -0
- package/src/logging/color.test.ts +252 -0
- package/src/logging/color.ts +105 -0
- package/src/logging/format.test.ts +110 -0
- package/src/logging/format.ts +255 -0
- package/src/logging/logger.test.ts +814 -0
- package/src/logging/logger.ts +266 -0
- package/src/logging/reporter.test.ts +259 -0
- package/src/logging/reporter.ts +110 -0
- package/src/logging/sanitizer.test.ts +190 -0
- package/src/logging/sanitizer.ts +57 -0
- package/src/logging/theme.ts +140 -0
- package/src/mail/broadcast.test.ts +204 -0
- package/src/mail/broadcast.ts +92 -0
- package/src/mail/client.test.ts +774 -0
- package/src/mail/client.ts +236 -0
- package/src/mail/store.test.ts +898 -0
- package/src/mail/store.ts +425 -0
- package/src/merge/lock.test.ts +149 -0
- package/src/merge/lock.ts +140 -0
- package/src/merge/predict.test.ts +387 -0
- package/src/merge/predict.ts +249 -0
- package/src/merge/queue.test.ts +426 -0
- package/src/merge/queue.ts +246 -0
- package/src/merge/resolver.test.ts +1993 -0
- package/src/merge/resolver.ts +926 -0
- package/src/metrics/pricing.test.ts +258 -0
- package/src/metrics/pricing.ts +135 -0
- package/src/metrics/store.test.ts +978 -0
- package/src/metrics/store.ts +501 -0
- package/src/metrics/summary.test.ts +398 -0
- package/src/metrics/summary.ts +178 -0
- package/src/metrics/transcript.test.ts +483 -0
- package/src/metrics/transcript.ts +114 -0
- package/src/runtimes/__fixtures__/claude-stream-fixture.ts +22 -0
- package/src/runtimes/aider.test.ts +124 -0
- package/src/runtimes/aider.ts +147 -0
- package/src/runtimes/amp.test.ts +164 -0
- package/src/runtimes/amp.ts +154 -0
- package/src/runtimes/claude.test.ts +1474 -0
- package/src/runtimes/claude.ts +579 -0
- package/src/runtimes/codex.test.ts +805 -0
- package/src/runtimes/codex.ts +273 -0
- package/src/runtimes/connections.test.ts +214 -0
- package/src/runtimes/connections.ts +103 -0
- package/src/runtimes/copilot.test.ts +707 -0
- package/src/runtimes/copilot.ts +316 -0
- package/src/runtimes/cursor.test.ts +497 -0
- package/src/runtimes/cursor.ts +205 -0
- package/src/runtimes/gemini.test.ts +537 -0
- package/src/runtimes/gemini.ts +243 -0
- package/src/runtimes/goose.test.ts +133 -0
- package/src/runtimes/goose.ts +157 -0
- package/src/runtimes/headless-connection.test.ts +264 -0
- package/src/runtimes/headless-connection.ts +158 -0
- package/src/runtimes/opencode.test.ts +325 -0
- package/src/runtimes/opencode.ts +188 -0
- package/src/runtimes/pi-guards.test.ts +486 -0
- package/src/runtimes/pi-guards.ts +367 -0
- package/src/runtimes/pi.test.ts +789 -0
- package/src/runtimes/pi.ts +305 -0
- package/src/runtimes/registry.test.ts +196 -0
- package/src/runtimes/registry.ts +99 -0
- package/src/runtimes/sapling.test.ts +1267 -0
- package/src/runtimes/sapling.ts +710 -0
- package/src/runtimes/types.ts +266 -0
- package/src/schema-consistency.test.ts +246 -0
- package/src/sessions/compat.test.ts +281 -0
- package/src/sessions/compat.ts +105 -0
- package/src/sessions/store.test.ts +1748 -0
- package/src/sessions/store.ts +858 -0
- package/src/test-helpers.test.ts +124 -0
- package/src/test-helpers.ts +145 -0
- package/src/test-setup.test.ts +31 -0
- package/src/test-setup.ts +28 -0
- package/src/tools/loam/api.ts +368 -0
- package/src/tools/loam/cli.ts +278 -0
- package/src/tools/loam/commands/add.ts +52 -0
- package/src/tools/loam/commands/archive.ts +214 -0
- package/src/tools/loam/commands/audit.ts +276 -0
- package/src/tools/loam/commands/compact.ts +1062 -0
- package/src/tools/loam/commands/completions.ts +79 -0
- package/src/tools/loam/commands/config.ts +381 -0
- package/src/tools/loam/commands/delete-domain.ts +121 -0
- package/src/tools/loam/commands/delete.ts +316 -0
- package/src/tools/loam/commands/diff.ts +200 -0
- package/src/tools/loam/commands/doctor.ts +1113 -0
- package/src/tools/loam/commands/edit.ts +226 -0
- package/src/tools/loam/commands/init.ts +31 -0
- package/src/tools/loam/commands/learn.ts +179 -0
- package/src/tools/loam/commands/move.ts +323 -0
- package/src/tools/loam/commands/onboard.ts +374 -0
- package/src/tools/loam/commands/outcome.ts +185 -0
- package/src/tools/loam/commands/prime.ts +688 -0
- package/src/tools/loam/commands/prune.ts +614 -0
- package/src/tools/loam/commands/query.ts +218 -0
- package/src/tools/loam/commands/rank.ts +180 -0
- package/src/tools/loam/commands/ready.ts +189 -0
- package/src/tools/loam/commands/record.ts +1210 -0
- package/src/tools/loam/commands/restore.ts +166 -0
- package/src/tools/loam/commands/search.ts +327 -0
- package/src/tools/loam/commands/setup.ts +887 -0
- package/src/tools/loam/commands/status.ts +103 -0
- package/src/tools/loam/commands/sync.ts +298 -0
- package/src/tools/loam/commands/update.ts +19 -0
- package/src/tools/loam/commands/upgrade.ts +93 -0
- package/src/tools/loam/commands/validate.ts +190 -0
- package/src/tools/loam/index.ts +62 -0
- package/src/tools/loam/log.ts +127 -0
- package/src/tools/loam/registry/builtins.ts +409 -0
- package/src/tools/loam/registry/custom.ts +431 -0
- package/src/tools/loam/registry/init.ts +55 -0
- package/src/tools/loam/registry/template.ts +40 -0
- package/src/tools/loam/registry/type-registry.ts +113 -0
- package/src/tools/loam/schemas/config-schema.ts +489 -0
- package/src/tools/loam/schemas/config.ts +245 -0
- package/src/tools/loam/schemas/index.ts +18 -0
- package/src/tools/loam/schemas/record-schema.ts +191 -0
- package/src/tools/loam/schemas/record.ts +115 -0
- package/src/tools/loam/utils/active-work.ts +205 -0
- package/src/tools/loam/utils/anchor-validity.ts +80 -0
- package/src/tools/loam/utils/archive.ts +146 -0
- package/src/tools/loam/utils/audit.ts +667 -0
- package/src/tools/loam/utils/bm25.ts +238 -0
- package/src/tools/loam/utils/budget.ts +142 -0
- package/src/tools/loam/utils/config.ts +344 -0
- package/src/tools/loam/utils/dir-anchors.ts +62 -0
- package/src/tools/loam/utils/domain-rules.ts +114 -0
- package/src/tools/loam/utils/expertise.ts +393 -0
- package/src/tools/loam/utils/format-helpers.ts +96 -0
- package/src/tools/loam/utils/format.ts +1234 -0
- package/src/tools/loam/utils/git-context.ts +50 -0
- package/src/tools/loam/utils/git.ts +183 -0
- package/src/tools/loam/utils/hooks.ts +299 -0
- package/src/tools/loam/utils/index.ts +52 -0
- package/src/tools/loam/utils/json-output.ts +13 -0
- package/src/tools/loam/utils/lock.ts +76 -0
- package/src/tools/loam/utils/markers.ts +48 -0
- package/src/tools/loam/utils/numeric-flags.ts +20 -0
- package/src/tools/loam/utils/palette.ts +44 -0
- package/src/tools/loam/utils/prime-ranking.ts +135 -0
- package/src/tools/loam/utils/recipe-discovery.ts +195 -0
- package/src/tools/loam/utils/runtime-flags.ts +28 -0
- package/src/tools/loam/utils/scoring.ts +94 -0
- package/src/tools/loam/utils/version.ts +116 -0
- package/src/tools/sprout/commands/block.ts +64 -0
- package/src/tools/sprout/commands/blocked.ts +86 -0
- package/src/tools/sprout/commands/close.ts +129 -0
- package/src/tools/sprout/commands/completions.ts +198 -0
- package/src/tools/sprout/commands/config.ts +238 -0
- package/src/tools/sprout/commands/create.ts +164 -0
- package/src/tools/sprout/commands/dep.ts +148 -0
- package/src/tools/sprout/commands/doctor.ts +979 -0
- package/src/tools/sprout/commands/init.ts +83 -0
- package/src/tools/sprout/commands/label.ts +178 -0
- package/src/tools/sprout/commands/list.ts +210 -0
- package/src/tools/sprout/commands/migrate.ts +133 -0
- package/src/tools/sprout/commands/onboard.ts +207 -0
- package/src/tools/sprout/commands/plan-show.ts +278 -0
- package/src/tools/sprout/commands/plan.ts +2526 -0
- package/src/tools/sprout/commands/prime.ts +399 -0
- package/src/tools/sprout/commands/ready.ts +245 -0
- package/src/tools/sprout/commands/search.ts +221 -0
- package/src/tools/sprout/commands/show.ts +277 -0
- package/src/tools/sprout/commands/stats.ts +146 -0
- package/src/tools/sprout/commands/sync.ts +134 -0
- package/src/tools/sprout/commands/tpl.ts +364 -0
- package/src/tools/sprout/commands/unblock.ts +115 -0
- package/src/tools/sprout/commands/update.ts +257 -0
- package/src/tools/sprout/commands/upgrade.ts +91 -0
- package/src/tools/sprout/config-schema.ts +152 -0
- package/src/tools/sprout/config.ts +355 -0
- package/src/tools/sprout/filter.ts +107 -0
- package/src/tools/sprout/format.ts +43 -0
- package/src/tools/sprout/id.ts +22 -0
- package/src/tools/sprout/index.ts +204 -0
- package/src/tools/sprout/log.ts +76 -0
- package/src/tools/sprout/markers.ts +22 -0
- package/src/tools/sprout/output.ts +121 -0
- package/src/tools/sprout/plan-backref.ts +93 -0
- package/src/tools/sprout/plan-context.ts +81 -0
- package/src/tools/sprout/plan-domain.ts +139 -0
- package/src/tools/sprout/plan-lifecycle.ts +65 -0
- package/src/tools/sprout/plan-loam.ts +207 -0
- package/src/tools/sprout/plan-schema.ts +209 -0
- package/src/tools/sprout/sort.ts +31 -0
- package/src/tools/sprout/store.ts +172 -0
- package/src/tools/sprout/types.ts +118 -0
- package/src/tools/sprout/validation.ts +119 -0
- package/src/tools/sprout/version.ts +1 -0
- package/src/tools/sprout/yaml.ts +387 -0
- package/src/tools/trellis/commands/archive.ts +87 -0
- package/src/tools/trellis/commands/completions.ts +610 -0
- package/src/tools/trellis/commands/config.ts +382 -0
- package/src/tools/trellis/commands/create.ts +252 -0
- package/src/tools/trellis/commands/diff.ts +150 -0
- package/src/tools/trellis/commands/doctor.ts +771 -0
- package/src/tools/trellis/commands/emit.ts +365 -0
- package/src/tools/trellis/commands/history.ts +83 -0
- package/src/tools/trellis/commands/import.ts +198 -0
- package/src/tools/trellis/commands/init.ts +81 -0
- package/src/tools/trellis/commands/list.ts +103 -0
- package/src/tools/trellis/commands/onboard.ts +156 -0
- package/src/tools/trellis/commands/pin.ts +172 -0
- package/src/tools/trellis/commands/prime.ts +193 -0
- package/src/tools/trellis/commands/render.ts +122 -0
- package/src/tools/trellis/commands/schema.ts +353 -0
- package/src/tools/trellis/commands/show.ts +115 -0
- package/src/tools/trellis/commands/stats.ts +65 -0
- package/src/tools/trellis/commands/sync.ts +112 -0
- package/src/tools/trellis/commands/tree.ts +123 -0
- package/src/tools/trellis/commands/update.ts +330 -0
- package/src/tools/trellis/commands/upgrade.ts +95 -0
- package/src/tools/trellis/commands/validate.ts +166 -0
- package/src/tools/trellis/config-schema.ts +81 -0
- package/src/tools/trellis/config.ts +108 -0
- package/src/tools/trellis/frontmatter.ts +348 -0
- package/src/tools/trellis/id.ts +24 -0
- package/src/tools/trellis/index.ts +209 -0
- package/src/tools/trellis/markers.ts +28 -0
- package/src/tools/trellis/output.ts +84 -0
- package/src/tools/trellis/render.ts +212 -0
- package/src/tools/trellis/store.ts +144 -0
- package/src/tools/trellis/types.ts +82 -0
- package/src/tools/trellis/validate.ts +199 -0
- package/src/tools/trellis/yaml.ts +309 -0
- package/src/tracker/beads.test.ts +454 -0
- package/src/tracker/beads.ts +56 -0
- package/src/tracker/factory.test.ts +90 -0
- package/src/tracker/factory.ts +65 -0
- package/src/tracker/sprout.test.ts +461 -0
- package/src/tracker/sprout.ts +182 -0
- package/src/tracker/types.ts +52 -0
- package/src/trellis/client.test.ts +107 -0
- package/src/trellis/client.ts +179 -0
- package/src/types.ts +970 -0
- package/src/utils/bin.test.ts +10 -0
- package/src/utils/bin.ts +37 -0
- package/src/utils/browser.test.ts +49 -0
- package/src/utils/browser.ts +48 -0
- package/src/utils/fs.test.ts +119 -0
- package/src/utils/fs.ts +62 -0
- package/src/utils/pid.test.ts +152 -0
- package/src/utils/pid.ts +130 -0
- package/src/utils/process-scan.test.ts +53 -0
- package/src/utils/process-scan.ts +76 -0
- package/src/utils/time.test.ts +43 -0
- package/src/utils/time.ts +37 -0
- package/src/utils/version.test.ts +33 -0
- package/src/utils/version.ts +70 -0
- package/src/version.ts +5 -0
- package/src/watchdog/daemon.test.ts +3721 -0
- package/src/watchdog/daemon.ts +1257 -0
- package/src/watchdog/health.test.ts +830 -0
- package/src/watchdog/health.ts +434 -0
- package/src/watchdog/triage.test.ts +205 -0
- package/src/watchdog/triage.ts +205 -0
- package/src/worktree/manager.test.ts +720 -0
- package/src/worktree/manager.ts +405 -0
- package/src/worktree/process.test.ts +172 -0
- package/src/worktree/process.ts +131 -0
- package/src/worktree/tmux.test.ts +1616 -0
- package/src/worktree/tmux.ts +721 -0
- package/templates/CLAUDE.md.tmpl +100 -0
- package/templates/copilot-hooks.json.tmpl +13 -0
- package/templates/hooks.json.tmpl +109 -0
- package/templates/overlay.md.tmpl +88 -0
- package/ui/dist/apple-touch-icon-bdy6teep.png +0 -0
- package/ui/dist/chunk-8s31f05k.css +1 -0
- package/ui/dist/chunk-vm5rz679.js +300 -0
- package/ui/dist/favicon-nzb39vza.svg +4 -0
- package/ui/dist/index.html +17 -0
|
@@ -0,0 +1,1257 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tier 0 mechanical process monitoring daemon.
|
|
3
|
+
*
|
|
4
|
+
* Runs on a configurable interval, checking the health of all active agent
|
|
5
|
+
* sessions. Implements progressive nudging for stalled agents instead of
|
|
6
|
+
* immediately escalating to AI triage:
|
|
7
|
+
*
|
|
8
|
+
* Level 0 (warn): Log warning via onHealthCheck callback, no direct action
|
|
9
|
+
* Level 1 (nudge): Send tmux nudge via nudgeAgent()
|
|
10
|
+
* Level 2 (escalate): Invoke Tier 1 AI triage (if tier1Enabled), else skip
|
|
11
|
+
* Level 3 (terminate): Kill tmux session
|
|
12
|
+
*
|
|
13
|
+
* Phase 4 tier numbering:
|
|
14
|
+
* Tier 0 = Mechanical daemon (this file)
|
|
15
|
+
* Tier 1 = Triage agent (triage.ts)
|
|
16
|
+
* Tier 2 = Monitor agent (not yet implemented)
|
|
17
|
+
* Tier 3 = Supervisor monitors (per-project)
|
|
18
|
+
*
|
|
19
|
+
* ZFC Principle: Observable state (tmux alive, pid alive) is the source of
|
|
20
|
+
* truth. See health.ts for the full ZFC documentation.
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import { join } from "node:path";
|
|
24
|
+
import { isPersistentCapability } from "../agents/capabilities.ts";
|
|
25
|
+
import { nudgeAgent } from "../commands/nudge.ts";
|
|
26
|
+
import { createEventStore } from "../events/store.ts";
|
|
27
|
+
import {
|
|
28
|
+
findLatestStdoutLog,
|
|
29
|
+
startEventTailer,
|
|
30
|
+
type TailerHandle,
|
|
31
|
+
type TailerOptions,
|
|
32
|
+
} from "../events/tailer.ts";
|
|
33
|
+
import { createLoamClient } from "../loam/client.ts";
|
|
34
|
+
import { createMailStore, type MailStore } from "../mail/store.ts";
|
|
35
|
+
import { getConnection, removeConnection } from "../runtimes/connections.ts";
|
|
36
|
+
import type { RuntimeConnection } from "../runtimes/types.ts";
|
|
37
|
+
import { openSessionStore } from "../sessions/compat.ts";
|
|
38
|
+
import { createRunStore } from "../sessions/store.ts";
|
|
39
|
+
import type {
|
|
40
|
+
AgentSession,
|
|
41
|
+
EventStore,
|
|
42
|
+
HealthCheck,
|
|
43
|
+
RunStore,
|
|
44
|
+
WorkerDiedPayload,
|
|
45
|
+
} from "../types.ts";
|
|
46
|
+
import { isProcessAlive, isSessionAlive, killProcessTree, killSession } from "../worktree/tmux.ts";
|
|
47
|
+
import { evaluateHealth, transitionState } from "./health.ts";
|
|
48
|
+
import { type TriageResult, triageAgent } from "./triage.ts";
|
|
49
|
+
|
|
50
|
+
/** Maximum escalation level (terminate). */
|
|
51
|
+
const MAX_ESCALATION_LEVEL = 3;
|
|
52
|
+
|
|
53
|
+
/**
|
|
54
|
+
* Module-level registry of active event tailers for headless agents.
|
|
55
|
+
* Maps agentName → TailerHandle. Persists across daemon ticks so tailers
|
|
56
|
+
* survive between tick invocations. Overridable via DaemonOptions._tailerRegistry.
|
|
57
|
+
*/
|
|
58
|
+
const _defaultTailerRegistry: Map<string, TailerHandle> = new Map();
|
|
59
|
+
|
|
60
|
+
/**
|
|
61
|
+
* Per-cause dedup state for `current-run.txt` defensive-read warnings
|
|
62
|
+
* (agentplate-87bf). The watchdog reads `.agentplate/current-run.txt` once per
|
|
63
|
+
* tick to gate run-completion checks; if the file is missing/empty/unreadable
|
|
64
|
+
* or points to an id with no row in the runs table, the check would silently
|
|
65
|
+
* skip every tick. We log one warning per cause and then continue skipping
|
|
66
|
+
* silently, so an operator can see the run-completion path is wedged without
|
|
67
|
+
* drowning in repeated lines.
|
|
68
|
+
*
|
|
69
|
+
* Module-level by design: warnings should dedupe across ticks within one
|
|
70
|
+
* watchdog process. Overridable via DaemonOptions._runIdWarnState in tests.
|
|
71
|
+
*/
|
|
72
|
+
export interface RunIdWarnState { // mutable "already warned" flags used to dedupe run-id warnings
|
|
73
|
+
missingFileWarned: boolean; // set to true when the "current-run.txt missing" warning is written
|
|
74
|
+
unknownIds: Set<string>; // run ids for which the "unknown run" warning has already been written
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
const _defaultRunIdWarnState: RunIdWarnState = {
|
|
78
|
+
missingFileWarned: false,
|
|
79
|
+
unknownIds: new Set(),
|
|
80
|
+
};
|
|
81
|
+
|
|
82
|
+
/**
 * Persist a watchdog-detected agent failure as a loam record in the "agents" domain.
 *
 * Best-effort by contract: client creation, description assembly and the
 * record call all run inside a single try/catch, and any error is swallowed
 * silently (nothing is logged here) so a loam problem cannot break the daemon.
 *
 * @param root - Project root directory handed to the loam client
 * @param session - Session of the agent that failed
 * @param reason - Human-readable failure reason
 * @param tier - Watchdog tier that detected the failure (0 or 1)
 * @param triageSuggestion - Optional Tier 1 triage verdict; left out of the
 *   description when absent or empty
 * @returns Resolves once the record attempt has finished, successful or not
 */
async function recordFailure(
	root: string,
	session: AgentSession,
	reason: string,
	tier: 0 | 1,
	triageSuggestion?: string,
): Promise<void> {
	try {
		const client = createLoamClient(root);

		// Assemble the description line by line; the triage line is optional.
		const lines: string[] = [
			`Agent: ${session.agentName}`,
			`Capability: ${session.capability}`,
			`Failure reason: ${reason}`,
		];
		if (triageSuggestion) {
			lines.push(`Triage suggestion: ${triageSuggestion}`);
		}
		const detectedBy = tier === 0 ? "Tier 0 (process death)" : "Tier 1 (AI triage)";
		lines.push(`Detected by: ${detectedBy}`);

		await client.record("agents", {
			type: "failure",
			description: lines.join("\n"),
			tags: ["watchdog", "auto-recorded"],
			// An empty/falsy task id is recorded as "no evidence bead".
			evidenceBead: session.taskId || undefined,
		});
	} catch {
		// Intentionally ignored: failure bookkeeping must never break the watchdog.
	}
}
|
|
122
|
+
|
|
123
|
+
/**
 * Load the run id stored in `current-run.txt` inside the given directory.
 *
 * Async because the file is accessed through Bun.file().
 *
 * @param agentplateDir - Directory that contains `current-run.txt`
 * @returns The trimmed file contents, or null when the file does not exist,
 *   cannot be read, or contains only whitespace
 */
async function readCurrentRunId(agentplateDir: string): Promise<string | null> {
	const runFile = Bun.file(join(agentplateDir, "current-run.txt"));

	const present = await runFile.exists();
	if (!present) {
		return null;
	}

	let contents: string;
	try {
		contents = await runFile.text();
	} catch {
		// The file may vanish or be unreadable between exists() and text().
		return null;
	}

	const runId = contents.trim();
	return runId === "" ? null : runId;
}
|
|
141
|
+
|
|
142
|
+
/**
 * Defensively resolve the run id used to gate run-completion checks
 * (agentplate-87bf).
 *
 * The id is returned only when `readCurrentRunId` yields a value AND, if a
 * RunStore is available, `getRun` finds a row for it. Each failure cause is
 * reported on stderr at most once per `warnState` (one line for "no usable
 * id", one line per distinct unknown id); later calls return null silently.
 *
 * Although the warning text says checks are "disabled until restart", this
 * function re-reads the file on every call — only the warning is latched.
 *
 * `readCurrentRunId` itself is deliberately left permissive for
 * event-recording paths, where a stale id is acceptable as a label.
 *
 * @param agentplateDir - Directory that contains `current-run.txt`
 * @param runStore - Run lookup, or null when the store could not be opened
 * @param warnState - Mutable dedup state; updated when a warning is emitted
 * @returns The validated run id, or null when the check should be skipped
 */
async function resolveRunIdForCompletionCheck(
	agentplateDir: string,
	runStore: RunStore | null,
	warnState: RunIdWarnState,
): Promise<string | null> {
	const candidate = await readCurrentRunId(agentplateDir);

	if (candidate !== null) {
		if (runStore === null) {
			// No RunStore (rare — sessions.db open failed): trust the file and let
			// the downstream nudge path proceed, matching the pre-87bf behavior.
			return candidate;
		}

		// A throwing lookup is treated exactly like a missing row.
		let rowFound: boolean;
		try {
			rowFound = runStore.getRun(candidate) !== null;
		} catch {
			rowFound = false;
		}
		if (rowFound) {
			return candidate;
		}

		if (!warnState.unknownIds.has(candidate)) {
			warnState.unknownIds.add(candidate);
			process.stderr.write(
				`[WATCHDOG] current-run.txt points to unknown run "${candidate}" — run-completion checks disabled until restart\n`,
			);
		}
		return null;
	}

	// No usable id: file missing, unreadable, or empty.
	if (!warnState.missingFileWarned) {
		warnState.missingFileWarned = true;
		process.stderr.write(
			"[WATCHDOG] current-run.txt missing — run-completion checks disabled until restart\n",
		);
	}
	return null;
}
|
|
191
|
+
|
|
192
|
+
/**
 * Best-effort insert of a daemon event into the EventStore.
 *
 * Does nothing when no store is supplied. The session id and all tool fields
 * are stored as null, and `data` is serialized with JSON.stringify. Any error
 * raised while building or inserting the row is swallowed.
 *
 * @param eventStore - Destination store, or null to skip recording
 * @param event - Run id, agent name, event type, level and payload to record
 */
function recordEvent(
	eventStore: EventStore | null,
	event: {
		runId: string | null;
		agentName: string;
		eventType: "custom" | "mail_sent";
		level: "debug" | "info" | "warn" | "error";
		data: Record<string, unknown>;
	},
): void {
	if (eventStore) {
		try {
			const { runId, agentName, eventType, level, data } = event;
			eventStore.insert({
				runId,
				agentName,
				sessionId: null,
				eventType,
				toolName: null,
				toolArgs: null,
				toolDurationMs: null,
				level,
				data: JSON.stringify(data),
			});
		} catch {
			// Intentionally ignored: event recording must never break the daemon.
		}
	}
}
|
|
222
|
+
|
|
223
|
+
/**
 * Compose the "[WATCHDOG] All N …" notification for a batch of terminal workers.
 *
 * A worker counts as terminal whether it is `completed` (clean exit) or
 * `zombie` (watchdog-killed) — see agentplate-e130. If at least one worker is a
 * zombie, the message says "have terminated" and carries a
 * "(N completed, M zombie)" qualifier, so a partial failure is not mistaken
 * for a clean batch; otherwise it says "have completed".
 *
 * When every worker shares one of the capabilities scout, builder, reviewer,
 * lead or merger, the message names that capability and ends with a
 * phase-specific follow-up. Any other case (mixed batch, or a single
 * capability not in that list) yields a generic "worker(s)" message with a
 * sorted, comma-separated capability breakdown.
 *
 * @param workerSessions - Terminal worker sessions belonging to the run
 * @param runId - Identifier of the run, embedded verbatim in the message
 * @returns The formatted notification text
 */
export function buildCompletionMessage(
	workerSessions: readonly AgentSession[],
	runId: string,
): string {
	// Capability-specific sentence appended when the batch is homogeneous.
	const followUps: ReadonlyArray<readonly [string, string]> = [
		["scout", "Ready for next phase."],
		["builder", "Awaiting lead verification."],
		["reviewer", "Reviews done."],
		["lead", "Ready for merge/cleanup."],
		["merger", "Merges done."],
	];

	// Single pass: collect distinct capabilities and count zombies.
	const total = workerSessions.length;
	const kinds = new Set<string>();
	let zombies = 0;
	for (const worker of workerSessions) {
		kinds.add(worker.capability);
		if (worker.state === "zombie") {
			zombies += 1;
		}
	}

	const outcome =
		zombies > 0
			? `have terminated (${total - zombies} completed, ${zombies} zombie)`
			: "have completed";
	const opener = (noun: string): string =>
		`[WATCHDOG] All ${total} ${noun}(s) in run ${runId} ${outcome}`;

	if (kinds.size === 1) {
		for (const [capability, followUp] of followUps) {
			if (kinds.has(capability)) {
				return `${opener(capability)}. ${followUp}`;
			}
		}
	}

	const breakdown = [...kinds].sort().join(", ");
	return `${opener("worker")} (${breakdown}). Ready for next steps.`;
}
|
|
266
|
+
|
|
267
|
+
/**
 * Nudge the coordinator once every worker session of the active run is terminal.
 *
 * Flow:
 * 1. Load the run's sessions and drop persistent capabilities; only workers count.
 * 2. Bail out unless there is at least one worker and all of them are
 *    `completed` or `zombie`. Zombie is terminal (agentplate-e130): a
 *    watchdog-killed worker is not coming back, so waiting on it would strand
 *    the coordinator on a run that mixes clean exits with kills.
 * 3. Bail out if the marker file `run-complete-notified.txt` already holds this
 *    run ID (dedup — one nudge per run).
 * 4. Send the nudge, record a `run_complete` event, then write the marker.
 *
 * Fire-and-forget: marker read/write and nudge failures are swallowed.
 * Note that the event is recorded and the marker written regardless of
 * whether the nudge was actually delivered.
 *
 * @param ctx.store - Session lookup by run ID
 * @param ctx.runId - Run to check
 * @param ctx.agentplateDir - Directory holding the dedup marker file
 * @param ctx.root - Project root forwarded to `nudge`
 * @param ctx.nudge - Nudge delivery function
 * @param ctx.eventStore - Event sink, or `null` to skip event recording
 */
async function checkRunCompletion(ctx: {
	store: { getByRun: (runId: string) => AgentSession[] };
	runId: string;
	agentplateDir: string;
	root: string;
	nudge: (
		projectRoot: string,
		agentName: string,
		message: string,
		force: boolean,
	) => Promise<{ delivered: boolean; reason?: string }>;
	eventStore: EventStore | null;
}): Promise<void> {
	const { store, runId, agentplateDir, root, nudge, eventStore } = ctx;

	const workers = store
		.getByRun(runId)
		.filter((session) => !isPersistentCapability(session.capability));

	// Nothing to report while the run has no workers or any of them is still live.
	const someStillLive = workers.some(
		(session) => session.state !== "completed" && session.state !== "zombie",
	);
	if (workers.length === 0 || someStillLive) {
		return;
	}

	// Dedup: the marker file holds the ID of the last run we announced.
	const markerPath = join(agentplateDir, "run-complete-notified.txt");
	try {
		const marker = Bun.file(markerPath);
		if ((await marker.exists()) && (await marker.text()).trim() === runId) {
			return; // Already notified
		}
	} catch {
		// Unreadable marker is non-fatal — fall through and nudge anyway.
	}

	const message = buildCompletionMessage(workers, runId);
	try {
		await nudge(root, "coordinator", message, true);
	} catch {
		// Nudge delivery failure is non-fatal
	}

	// Split agent names by outcome in one pass; every worker is one or the other here.
	const completedAgents: string[] = [];
	const zombieAgents: string[] = [];
	for (const session of workers) {
		if (session.state === "completed") {
			completedAgents.push(session.agentName);
		} else if (session.state === "zombie") {
			zombieAgents.push(session.agentName);
		}
	}

	const capabilitiesArr = [...new Set(workers.map((session) => session.capability))].sort();
	const phase = capabilitiesArr.length === 1 ? capabilitiesArr[0] : "mixed";

	recordEvent(eventStore, {
		runId,
		agentName: "watchdog",
		eventType: "custom",
		// A run that lost workers to the watchdog is worth a warning.
		level: zombieAgents.length > 0 ? "warn" : "info",
		data: {
			type: "run_complete",
			workerCount: workers.length,
			completedAgents,
			zombieAgents,
			capabilities: capabilitiesArr,
			phase,
		},
	});

	try {
		await Bun.write(markerPath, runId);
	} catch {
		// Marker write failure is non-fatal
	}
}
|
|
360
|
+
|
|
361
|
+
/**
 * Options shared between startDaemon and runDaemonTick.
 *
 * Members prefixed with `_` are dependency-injection seams intended for tests;
 * the real implementation is used whenever one is omitted.
 */
export interface DaemonOptions {
	/** Project root directory (contains `.agentplate/`). */
	root: string;
	/** Time in milliseconds after which an agent is considered stale. */
	staleThresholdMs: number;
	/** Time in milliseconds after which an agent is considered a zombie. */
	zombieThresholdMs: number;
	/** Time between progressive nudge stage transitions. Default: 60000. */
	nudgeIntervalMs?: number;
	/** Whether Tier 1 AI triage is enabled. Default: false. */
	tier1Enabled?: boolean;
	/**
	 * When true (default), the watchdog sends a synthetic `worker_died` mail to
	 * `session.parentAgent` the first time it transitions a session to `zombie`
	 * (agentplate-c111). Without this, the parent — typically a lead waiting for
	 * `worker_done` — blocks indefinitely on mail that will never arrive.
	 */
	notifyParentOnDeath?: boolean;
	/** Optional callback invoked with each session's health check result. */
	onHealthCheck?: (check: HealthCheck) => void;
	/** Dependency injection for testing. Uses real implementations when omitted. */
	_tmux?: {
		isSessionAlive: (name: string) => Promise<boolean>;
		killSession: (name: string) => Promise<void>;
	};
	/** Dependency injection for testing. Uses real triageAgent when omitted. */
	_triage?: (options: {
		agentName: string;
		root: string;
		lastActivity: string;
	}) => Promise<TriageResult | "retry" | "terminate" | "extend">;
	/** Max triage calls per daemon tick (prevents runaway AI usage). Default: 3. */
	_maxTriagePerTick?: number;
	/** Dependency injection for testing. Uses real nudgeAgent when omitted. */
	_nudge?: (
		projectRoot: string,
		agentName: string,
		message: string,
		force: boolean,
	) => Promise<{ delivered: boolean; reason?: string }>;
	/** Dependency injection for testing. Uses real isProcessAlive/killProcessTree when omitted. */
	_process?: {
		isAlive: (pid: number) => boolean;
		killTree: (pid: number) => Promise<void>;
	};
	/**
	 * Dependency injection for testing. Overrides EventStore creation. Passing
	 * `null` explicitly runs the tick without an event store; when omitted, a
	 * real store is opened against `.agentplate/events.db`.
	 */
	_eventStore?: EventStore | null;
	/** Dependency injection for testing. Uses real recordFailure when omitted. */
	_recordFailure?: (
		root: string,
		session: AgentSession,
		reason: string,
		tier: 0 | 1,
		triageSuggestion?: string,
	) => Promise<void>;
	/** Dependency injection for testing. Uses real getConnection when omitted. */
	_getConnection?: (name: string) => RuntimeConnection | undefined;
	/** Dependency injection for testing. Uses real removeConnection when omitted. */
	_removeConnection?: (name: string) => void;
	/** Dependency injection for testing. Uses _defaultTailerRegistry when omitted. */
	_tailerRegistry?: Map<string, TailerHandle>;
	/** Dependency injection for testing. Uses startEventTailer when omitted. */
	_tailerFactory?: (opts: TailerOptions) => TailerHandle;
	/** Dependency injection for testing. Uses findLatestStdoutLog when omitted. */
	_findLatestStdoutLog?: (agentplateDir: string, agentName: string) => Promise<string | null>;
	/**
	 * Dependency injection for testing. Overrides MailStore creation for decision
	 * gate detection. When omitted, a real store is opened against
	 * `.agentplate/mail.db`.
	 */
	_mailStore?: MailStore | null;
	/**
	 * Dependency injection for testing. Overrides the module-level run-id warning
	 * state so each test starts with a clean dedup slate (agentplate-87bf).
	 */
	_runIdWarnState?: RunIdWarnState;
	/**
	 * Dependency injection for testing. Overrides RunStore creation. When `null`
	 * is passed explicitly, run-id validation is skipped (file presence still
	 * gates the warning). When omitted, a real RunStore is opened against
	 * `.agentplate/sessions.db`.
	 */
	_runStore?: RunStore | null;
}
|
|
436
|
+
|
|
437
|
+
/**
 * Launch the watchdog daemon: one health-monitoring tick right away, then one
 * every `intervalMs` milliseconds until `stop()` is called.
 *
 * Each tick is a `runDaemonTick(options)` call, which:
 *   1. Loads sessions from SessionStore (sessions.db)
 *   2. Checks liveness and evaluates health for every non-completed session
 *      (zombies included — ZFC requires re-checking observable state)
 *   3. "terminate" → kills the agent immediately
 *   4. "investigate" → surfaces via onHealthCheck, no auto-kill
 *   5. "escalate" → progressive nudging driven by escalationLevel
 *   6. Persists updated session states back to SessionStore
 *
 * A rejected tick is swallowed so the daemon never crashes.
 *
 * NOTE(review): ticks are scheduled with `setInterval` and not awaited, so a
 * tick that runs longer than `intervalMs` will overlap with the next one —
 * confirm that is acceptable for the configured interval.
 *
 * @param options.root - Project root directory (contains .agentplate/)
 * @param options.intervalMs - Polling interval in milliseconds
 * @param options.staleThresholdMs - Time after which an agent is considered stale
 * @param options.zombieThresholdMs - Time after which an agent is considered a zombie
 * @param options.nudgeIntervalMs - Time between progressive nudge stage transitions (default 60000)
 * @param options.tier1Enabled - Whether Tier 1 AI triage is enabled (default false)
 * @param options.notifyParentOnDeath - Whether to mail the parent on first zombify (default true)
 * @param options.onHealthCheck - Optional callback for each health check result
 * @returns A handle whose `stop` clears the interval and stops and unregisters
 *   every event tailer in the registry
 */
export function startDaemon(options: DaemonOptions & { intervalMs: number }): { stop: () => void } {
	const period = options.intervalMs;
	const registry = options._tailerRegistry ?? _defaultTailerRegistry;

	const tick = (): void => {
		runDaemonTick(options).catch(() => {
			// Swallow tick errors — daemon must not crash
		});
	};

	// First tick fires immediately; the timer covers every subsequent one.
	tick();
	const timer = setInterval(tick, period);

	return {
		stop(): void {
			clearInterval(timer);
			for (const [agentName, tailer] of registry) {
				tailer.stop();
				registry.delete(agentName);
			}
		},
	};
}
|
|
483
|
+
|
|
484
|
+
/**
 * Terminate an agent with the method that fits how it is hosted.
 *
 * Step 1 — runtime connection. When a RuntimeConnection is registered for the
 * agent, `conn.abort()` is tried first and the connection is unregistered
 * whatever the outcome. A successful abort ends the function; a throwing abort
 * (e.g. process already exited) continues to step 2 as defense-in-depth.
 *
 * Step 2 — host-specific kill:
 *   - Headless (`tmuxSession === ""`): tmux.killSession is never called — an
 *     empty `-t` prefix-matches every session in the tmux server and would
 *     wildcard-kill the entire agentplate swarm (agentplate-74ce).
 *       - `pid !== null` → kill the process tree (long-lived headless capability).
 *       - `pid === null` → no-op (spawn-per-turn agent between turns; any
 *         in-flight process was already handled by the connection path).
 *   - TUI (`tmuxSession !== ""`): kill the named tmux session, but only when
 *     `tmuxAlive`, to avoid spurious "session not found" errors.
 *
 * Errors from the kill calls themselves are swallowed.
 *
 * @param ctx.session - Session of the agent to kill
 * @param ctx.tmuxAlive - Liveness result computed by the caller for this tick
 * @param ctx.tmux - tmux kill primitive
 * @param ctx.process - Process-tree kill primitive
 * @param ctx.getConnection - Lookup for a registered RuntimeConnection
 * @param ctx.removeConnection - Unregisters a RuntimeConnection
 */
async function killAgent(ctx: {
	session: AgentSession;
	tmuxAlive: boolean;
	tmux: { killSession: (name: string) => Promise<void> };
	process: { killTree: (pid: number) => Promise<void> };
	getConnection: (name: string) => RuntimeConnection | undefined;
	removeConnection: (name: string) => void;
}): Promise<void> {
	const { session, tmuxAlive, tmux, process: proc, getConnection, removeConnection } = ctx;
	const { agentName } = session;

	const conn = getConnection(agentName);
	if (conn) {
		let abortFailed = false;
		try {
			await conn.abort();
		} catch {
			abortFailed = true;
		}
		// The connection is dropped whether or not abort() worked.
		removeConnection(agentName);
		if (!abortFailed) {
			return;
		}
		// abort() threw — continue to the PID/tmux kill below.
	}

	if (session.tmuxSession === "") {
		// Headless: must never reach tmux.killSession (agentplate-74ce).
		if (session.pid === null) {
			// Spawn-per-turn agent between turns — next dispatch will spawn fresh.
			return;
		}
		try {
			await proc.killTree(session.pid);
		} catch {
			// Already exited — not an error
		}
		return;
	}

	// TUI agent: only touch tmux when the session was seen alive.
	if (!tmuxAlive) {
		return;
	}
	try {
		await tmux.killSession(session.tmuxSession);
	} catch {
		// Session may have died between check and kill — not an error
	}
}
|
|
554
|
+
|
|
555
|
+
/**
 * Mail a synthetic `worker_died` message to the parent of a session the
 * watchdog has terminated (agentplate-c111), then record a `mail_sent` event.
 * Fire-and-forget: never throws.
 *
 * Does nothing when no mail store is available or the session has no parent.
 * If inserting the mail fails, the event is not recorded either.
 *
 * This function performs no deduplication of its own. `runDaemonTick` calls it
 * only after a successful transition to `zombie` and only when the session was
 * not already a zombie at the start of the tick, because the transition matrix
 * allows the idempotent `zombie → zombie` transition.
 *
 * @param ctx.session - The terminated worker's session
 * @param ctx.mailStore - Mail sink, or `null` to skip notification entirely
 * @param ctx.reason - Human-readable reason for the termination
 * @param ctx.tier - Watchdog tier that terminated the worker (0 or 1)
 * @param ctx.eventStore - Event sink, or `null` to skip event recording
 * @param ctx.runId - Current run ID, if known
 */
function notifyParentOfDeath(ctx: {
	session: AgentSession;
	mailStore: MailStore | null;
	reason: string;
	tier: 0 | 1;
	eventStore: EventStore | null;
	runId: string | null;
}): void {
	const { session, mailStore, reason, tier, eventStore, runId } = ctx;
	const parent = session.parentAgent;

	if (mailStore === null || parent === null) {
		return;
	}

	const payload: WorkerDiedPayload = {
		agentName: session.agentName,
		capability: session.capability,
		taskId: session.taskId,
		reason,
		lastActivity: session.lastActivity,
		terminatedBy: tier === 1 ? "tier1" : "tier0",
	};

	// Human-readable counterpart of the structured payload.
	const body = [
		`Worker "${session.agentName}" (${session.capability}) on task ${session.taskId}`,
		`was terminated by the watchdog. Reason: ${reason}.`,
		`Last activity: ${session.lastActivity}.`,
		`Decide whether to retry the work, escalate, or report the failure upstream.`,
	].join(" ");

	try {
		mailStore.insert({
			id: "",
			from: session.agentName,
			to: parent,
			subject: `[WATCHDOG] worker_died: ${session.agentName}`,
			body,
			type: "worker_died",
			priority: "high",
			threadId: null,
			payload: JSON.stringify(payload),
		});
	} catch {
		// Mail-send failure must never crash the watchdog.
		return;
	}

	recordEvent(eventStore, {
		runId,
		agentName: session.agentName,
		eventType: "mail_sent",
		level: "warn",
		data: {
			type: "worker_died",
			parent,
			reason,
			tier,
		},
	});
}
|
|
618
|
+
|
|
619
|
+
/**
|
|
620
|
+
* Run a single daemon tick. Exported for testing — allows direct invocation
|
|
621
|
+
* of the monitoring logic without starting the interval-based daemon loop.
|
|
622
|
+
*
|
|
623
|
+
* @param options - Same options as startDaemon (minus intervalMs)
|
|
624
|
+
*/
|
|
625
|
+
export async function runDaemonTick(options: DaemonOptions): Promise<void> {
|
|
626
|
+
const {
|
|
627
|
+
root,
|
|
628
|
+
staleThresholdMs,
|
|
629
|
+
zombieThresholdMs,
|
|
630
|
+
nudgeIntervalMs = 60_000,
|
|
631
|
+
tier1Enabled = false,
|
|
632
|
+
notifyParentOnDeath = true,
|
|
633
|
+
onHealthCheck,
|
|
634
|
+
} = options;
|
|
635
|
+
const tmux = options._tmux ?? { isSessionAlive, killSession };
|
|
636
|
+
const proc = options._process ?? { isAlive: isProcessAlive, killTree: killProcessTree };
|
|
637
|
+
const triage = options._triage ?? triageAgent;
|
|
638
|
+
const nudge = options._nudge ?? nudgeAgent;
|
|
639
|
+
const recordFailureFn = options._recordFailure ?? recordFailure;
|
|
640
|
+
const getConn = options._getConnection ?? getConnection;
|
|
641
|
+
const removeConn = options._removeConnection ?? removeConnection;
|
|
642
|
+
const tailerRegistry = options._tailerRegistry ?? _defaultTailerRegistry;
|
|
643
|
+
const tailerFactory = options._tailerFactory ?? startEventTailer;
|
|
644
|
+
const findStdoutLog = options._findLatestStdoutLog ?? findLatestStdoutLog;
|
|
645
|
+
const maxTriagePerTick = options._maxTriagePerTick ?? 3;
|
|
646
|
+
const triageCount = { value: 0 };
|
|
647
|
+
const runIdWarnState = options._runIdWarnState ?? _defaultRunIdWarnState;
|
|
648
|
+
|
|
649
|
+
const agentplateDir = join(root, ".agentplate");
|
|
650
|
+
const { store } = openSessionStore(agentplateDir);
|
|
651
|
+
|
|
652
|
+
// Open RunStore for run-id validation (agentplate-87bf). Sharing sessions.db
|
|
653
|
+
// is intentional — same file, WAL mode covers concurrent reads.
|
|
654
|
+
let runStore: RunStore | null = null;
|
|
655
|
+
let ownRunStore = false;
|
|
656
|
+
if (options._runStore !== undefined) {
|
|
657
|
+
runStore = options._runStore;
|
|
658
|
+
} else {
|
|
659
|
+
try {
|
|
660
|
+
runStore = createRunStore(join(agentplateDir, "sessions.db"));
|
|
661
|
+
ownRunStore = true;
|
|
662
|
+
} catch {
|
|
663
|
+
// RunStore creation failure is non-fatal — id validation is then skipped.
|
|
664
|
+
}
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
// Open MailStore for decision gate detection (fire-and-forget: non-fatal if unavailable)
|
|
668
|
+
let mailStore: MailStore | null = null;
|
|
669
|
+
let ownMailStore = false;
|
|
670
|
+
if (options._mailStore !== undefined) {
|
|
671
|
+
mailStore = options._mailStore;
|
|
672
|
+
} else {
|
|
673
|
+
try {
|
|
674
|
+
mailStore = createMailStore(join(agentplateDir, "mail.db"));
|
|
675
|
+
ownMailStore = true;
|
|
676
|
+
} catch {
|
|
677
|
+
// MailStore creation failure is non-fatal — decision gate detection will be skipped
|
|
678
|
+
}
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
// Open EventStore for recording daemon events (fire-and-forget)
|
|
682
|
+
let eventStore: EventStore | null = null;
|
|
683
|
+
let runId: string | null = null;
|
|
684
|
+
const useInjectedEventStore = options._eventStore !== undefined;
|
|
685
|
+
if (useInjectedEventStore) {
|
|
686
|
+
eventStore = options._eventStore ?? null;
|
|
687
|
+
} else {
|
|
688
|
+
try {
|
|
689
|
+
const eventsDbPath = join(agentplateDir, "events.db");
|
|
690
|
+
eventStore = createEventStore(eventsDbPath);
|
|
691
|
+
} catch {
|
|
692
|
+
// EventStore creation failure is non-fatal for the daemon
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
try {
|
|
696
|
+
runId = await readCurrentRunId(agentplateDir);
|
|
697
|
+
} catch {
|
|
698
|
+
// Reading run ID failure is non-fatal
|
|
699
|
+
}
|
|
700
|
+
|
|
701
|
+
try {
|
|
702
|
+
const thresholds = {
|
|
703
|
+
staleMs: staleThresholdMs,
|
|
704
|
+
zombieMs: zombieThresholdMs,
|
|
705
|
+
};
|
|
706
|
+
|
|
707
|
+
const sessions = store.getAll();
|
|
708
|
+
|
|
709
|
+
// Track active headless agents to clean up stale tailers after the loop.
|
|
710
|
+
const activeHeadlessAgents = new Set<string>();
|
|
711
|
+
const eventsDbPath = join(agentplateDir, "events.db");
|
|
712
|
+
const sessionsDbPath = join(agentplateDir, "sessions.db");
|
|
713
|
+
|
|
714
|
+
for (const session of sessions) {
|
|
715
|
+
// Skip completed sessions — they are terminal and don't need monitoring
|
|
716
|
+
if (session.state === "completed") {
|
|
717
|
+
continue;
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
// ZFC: Don't skip zombies. Re-check tmux liveness on every tick.
|
|
721
|
+
// A zombie with a live tmux session needs investigation, not silence.
|
|
722
|
+
|
|
723
|
+
// Event tailer management: start a background NDJSON tailer for each
|
|
724
|
+
// active headless agent that doesn't already have one running.
|
|
725
|
+
// Tailers persist between ticks (module-level registry) so events are
|
|
726
|
+
// continuously written to events.db while the agent is working.
|
|
727
|
+
//
|
|
728
|
+
// Both long-lived headless (pid !== null) and spawn-per-turn workers
|
|
729
|
+
// (pid === null, agentplate-7a34) emit stream-json to stdout.log, so
|
|
730
|
+
// either pattern needs a tailer.
|
|
731
|
+
if (session.tmuxSession === "") {
|
|
732
|
+
activeHeadlessAgents.add(session.agentName);
|
|
733
|
+
if (!tailerRegistry.has(session.agentName)) {
|
|
734
|
+
// Discover the latest stdout.log for this agent and start tailing.
|
|
735
|
+
const logPath = await findStdoutLog(agentplateDir, session.agentName);
|
|
736
|
+
if (logPath) {
|
|
737
|
+
const handle = tailerFactory({
|
|
738
|
+
stdoutLogPath: logPath,
|
|
739
|
+
agentName: session.agentName,
|
|
740
|
+
runId,
|
|
741
|
+
eventsDbPath,
|
|
742
|
+
sessionsDbPath,
|
|
743
|
+
});
|
|
744
|
+
tailerRegistry.set(session.agentName, handle);
|
|
745
|
+
}
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
// === Liveness check ===
|
|
750
|
+
// Prefer RuntimeConnection.getState() when a connection is registered. Fall
|
|
751
|
+
// back to tmux liveness when no connection exists. For headless agents without
|
|
752
|
+
// a connection, use event-based activity detection to refresh lastActivity.
|
|
753
|
+
const conn = getConn(session.agentName);
|
|
754
|
+
let tmuxAlive: boolean;
|
|
755
|
+
|
|
756
|
+
if (conn) {
|
|
757
|
+
try {
|
|
758
|
+
const state = await Promise.race([
|
|
759
|
+
conn.getState(),
|
|
760
|
+
new Promise<never>((_, reject) =>
|
|
761
|
+
setTimeout(() => reject(new Error("getState timed out")), 5000),
|
|
762
|
+
),
|
|
763
|
+
]);
|
|
764
|
+
// Map ConnectionState → liveness:
|
|
765
|
+
// idle | working → alive (running)
|
|
766
|
+
// error → not alive (exited)
|
|
767
|
+
if (state.status === "idle" || state.status === "working") {
|
|
768
|
+
tmuxAlive = true;
|
|
769
|
+
store.updateLastActivity(session.agentName);
|
|
770
|
+
session.lastActivity = new Date().toISOString();
|
|
771
|
+
} else {
|
|
772
|
+
tmuxAlive = false;
|
|
773
|
+
}
|
|
774
|
+
} catch {
|
|
775
|
+
// getState() failed/timed out — drop stale connection, fall back to tmux
|
|
776
|
+
removeConn(session.agentName);
|
|
777
|
+
tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
|
|
778
|
+
}
|
|
779
|
+
} else {
|
|
780
|
+
tmuxAlive = await tmux.isSessionAlive(session.tmuxSession);
|
|
781
|
+
|
|
782
|
+
// Headless agents without a registered connection: event-based
|
|
783
|
+
// activity detection to avoid false-positive stale. Covers both
|
|
784
|
+
// long-lived headless (e.g. after a process restart) and
|
|
785
|
+
// spawn-per-turn workers between turns where lastActivity is
|
|
786
|
+
// the only liveness signal (agentplate-7a34).
|
|
787
|
+
if (session.tmuxSession === "" && eventStore) {
|
|
788
|
+
try {
|
|
789
|
+
const recentEvents = eventStore.getByAgent(session.agentName, {
|
|
790
|
+
since: new Date(Date.now() - staleThresholdMs).toISOString(),
|
|
791
|
+
limit: 1,
|
|
792
|
+
});
|
|
793
|
+
if (recentEvents.length > 0) {
|
|
794
|
+
store.updateLastActivity(session.agentName);
|
|
795
|
+
session.lastActivity = new Date().toISOString();
|
|
796
|
+
}
|
|
797
|
+
} catch {
|
|
798
|
+
// Non-fatal: event store query failure should not affect monitoring
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
}
|
|
802
|
+
const check = evaluateHealth(session, tmuxAlive, thresholds);
|
|
803
|
+
|
|
804
|
+
// Snapshot the pre-tick state so the worker_died notify path can
|
|
805
|
+
// dedupe across re-ticks (agentplate-c111). Subsequent `tryTransitionState`
|
|
806
|
+
// calls below mutate session.state, and the matrix allows the idempotent
|
|
807
|
+
// `zombie → zombie` self-transition — both would erase the dedup signal.
|
|
808
|
+
const stateBeforeTick = session.state;
|
|
809
|
+
|
|
810
|
+
// Transition state forward only (investigate action holds state).
|
|
811
|
+
// `transitionState` computes the watchdog's preferred target;
|
|
812
|
+
// `tryTransitionState` is the matrix-guarded CAS — `completed → *`
|
|
813
|
+
// is rejected here so a properly-completed agent cannot be
|
|
814
|
+
// reclassified as zombie by a late watchdog tick (agentplate-a993).
|
|
815
|
+
const newState = transitionState(session.state, check);
|
|
816
|
+
if (newState !== session.state) {
|
|
817
|
+
const outcome = store.tryTransitionState(session.agentName, newState);
|
|
818
|
+
if (outcome.ok) {
|
|
819
|
+
session.state = newState;
|
|
820
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
821
|
+
// Resync local mirror — another writer settled state durably.
|
|
822
|
+
session.state = outcome.prev;
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
|
|
826
|
+
if (onHealthCheck) {
|
|
827
|
+
onHealthCheck(check);
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
if (check.action === "terminate") {
|
|
831
|
+
// Record the failure via loam (Tier 0 detection)
|
|
832
|
+
const reason = check.reconciliationNote ?? "Process terminated";
|
|
833
|
+
await recordFailureFn(root, session, reason, 0);
|
|
834
|
+
|
|
835
|
+
// Kill the agent: prefer conn.abort(), fall back to PID/tmux
|
|
836
|
+
await killAgent({
|
|
837
|
+
session,
|
|
838
|
+
tmuxAlive,
|
|
839
|
+
tmux,
|
|
840
|
+
process: proc,
|
|
841
|
+
getConnection: getConn,
|
|
842
|
+
removeConnection: removeConn,
|
|
843
|
+
});
|
|
844
|
+
// Matrix-guarded: rejected when state is `completed` so a clean
|
|
845
|
+
// `ap stop` cannot be silently downgraded to zombie by a late
|
|
846
|
+
// watchdog termination (agentplate-a993).
|
|
847
|
+
const outcome = store.tryTransitionState(session.agentName, "zombie");
|
|
848
|
+
// Reset escalation tracking on terminal state
|
|
849
|
+
store.updateEscalation(session.agentName, 0, null);
|
|
850
|
+
if (outcome.ok) {
|
|
851
|
+
session.state = "zombie";
|
|
852
|
+
// First-time zombify: notify parent so it doesn't block on
|
|
853
|
+
// missing `worker_done` mail (agentplate-c111). Dedup uses the
|
|
854
|
+
// pre-tick snapshot because the matrix allows the idempotent
|
|
855
|
+
// zombie → zombie transition (both `outcome.ok` and the earlier
|
|
856
|
+
// transitionState call would otherwise mask re-ticks).
|
|
857
|
+
if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
|
|
858
|
+
notifyParentOfDeath({
|
|
859
|
+
session,
|
|
860
|
+
mailStore,
|
|
861
|
+
reason,
|
|
862
|
+
tier: 0,
|
|
863
|
+
eventStore,
|
|
864
|
+
runId,
|
|
865
|
+
});
|
|
866
|
+
}
|
|
867
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
868
|
+
session.state = outcome.prev;
|
|
869
|
+
}
|
|
870
|
+
session.escalationLevel = 0;
|
|
871
|
+
session.stalledSince = null;
|
|
872
|
+
} else if (check.action === "investigate") {
|
|
873
|
+
// ZFC: tmux alive but SessionStore says zombie.
|
|
874
|
+
// Log the conflict but do NOT auto-kill.
|
|
875
|
+
// The onHealthCheck callback surfaces this to the operator.
|
|
876
|
+
// No state change — keep zombie until a human or higher-tier agent decides.
|
|
877
|
+
} else if (check.action === "complete") {
|
|
878
|
+
// ZFC fallback: tmux/pid is gone AND lastActivity is stale —
|
|
879
|
+
// the agent looks like it finished naturally and only the
|
|
880
|
+
// session-end hook missed (agentplate-e74b). Mark completed
|
|
881
|
+
// without killing (process is already gone) and without
|
|
882
|
+
// notifying parents of death (this is not a crash).
|
|
883
|
+
const outcome = store.tryTransitionState(session.agentName, "completed");
|
|
884
|
+
if (outcome.ok) {
|
|
885
|
+
session.state = "completed";
|
|
886
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
887
|
+
session.state = outcome.prev;
|
|
888
|
+
}
|
|
889
|
+
store.updateEscalation(session.agentName, 0, null);
|
|
890
|
+
session.escalationLevel = 0;
|
|
891
|
+
session.stalledSince = null;
|
|
892
|
+
} else if (check.action === "escalate") {
|
|
893
|
+
// Decision gate check: if the agent sent a decision_gate message, it is
|
|
894
|
+
// intentionally paused waiting for a human decision — not a stall.
|
|
895
|
+
// Skip watchdog escalation and clear any accumulated stall state.
|
|
896
|
+
if (mailStore !== null) {
|
|
897
|
+
const recentMail = mailStore.getAll({ from: session.agentName, limit: 20 });
|
|
898
|
+
const hasPendingDecisionGate = recentMail.some((m) => m.type === "decision_gate");
|
|
899
|
+
if (hasPendingDecisionGate) {
|
|
900
|
+
if (session.stalledSince !== null) {
|
|
901
|
+
store.updateEscalation(session.agentName, 0, null);
|
|
902
|
+
session.stalledSince = null;
|
|
903
|
+
session.escalationLevel = 0;
|
|
904
|
+
}
|
|
905
|
+
continue;
|
|
906
|
+
}
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
// Progressive nudging: increment escalation level based on elapsed time
|
|
910
|
+
// instead of immediately delegating to AI triage.
|
|
911
|
+
|
|
912
|
+
// Initialize stalledSince on first escalation detection
|
|
913
|
+
if (session.stalledSince === null) {
|
|
914
|
+
session.stalledSince = new Date().toISOString();
|
|
915
|
+
session.escalationLevel = 0;
|
|
916
|
+
store.updateEscalation(session.agentName, 0, session.stalledSince);
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
// Check if enough time has passed to advance to the next escalation level
|
|
920
|
+
const stalledMs = Date.now() - new Date(session.stalledSince).getTime();
|
|
921
|
+
const expectedLevel = Math.min(
|
|
922
|
+
Math.floor(stalledMs / nudgeIntervalMs),
|
|
923
|
+
MAX_ESCALATION_LEVEL,
|
|
924
|
+
);
|
|
925
|
+
|
|
926
|
+
if (expectedLevel > session.escalationLevel) {
|
|
927
|
+
session.escalationLevel = expectedLevel;
|
|
928
|
+
store.updateEscalation(session.agentName, expectedLevel, session.stalledSince);
|
|
929
|
+
}
|
|
930
|
+
|
|
931
|
+
// Execute the action for the current escalation level
|
|
932
|
+
const actionResult = await executeEscalationAction({
|
|
933
|
+
session,
|
|
934
|
+
root,
|
|
935
|
+
tmuxAlive,
|
|
936
|
+
tier1Enabled,
|
|
937
|
+
tmux,
|
|
938
|
+
process: proc,
|
|
939
|
+
triage,
|
|
940
|
+
nudge,
|
|
941
|
+
eventStore,
|
|
942
|
+
runId,
|
|
943
|
+
recordFailure: recordFailureFn,
|
|
944
|
+
triageCount,
|
|
945
|
+
maxTriagePerTick,
|
|
946
|
+
getConnection: getConn,
|
|
947
|
+
removeConnection: removeConn,
|
|
948
|
+
});
|
|
949
|
+
|
|
950
|
+
if (actionResult.terminated) {
|
|
951
|
+
// Matrix-guarded: completed → zombie is rejected (agentplate-a993).
|
|
952
|
+
const outcome = store.tryTransitionState(session.agentName, "zombie");
|
|
953
|
+
store.updateEscalation(session.agentName, 0, null);
|
|
954
|
+
if (outcome.ok) {
|
|
955
|
+
session.state = "zombie";
|
|
956
|
+
// First-time zombify: notify parent so it doesn't block on
|
|
957
|
+
// missing `worker_done` mail (agentplate-c111). Dedup via
|
|
958
|
+
// the pre-tick snapshot — see the terminate branch above.
|
|
959
|
+
if (notifyParentOnDeath && stateBeforeTick !== "zombie") {
|
|
960
|
+
notifyParentOfDeath({
|
|
961
|
+
session,
|
|
962
|
+
mailStore,
|
|
963
|
+
reason: actionResult.deathReason ?? "Watchdog escalation terminated agent",
|
|
964
|
+
tier: actionResult.deathTier ?? 0,
|
|
965
|
+
eventStore,
|
|
966
|
+
runId,
|
|
967
|
+
});
|
|
968
|
+
}
|
|
969
|
+
} else if (outcome.reason === "illegal_transition") {
|
|
970
|
+
session.state = outcome.prev;
|
|
971
|
+
}
|
|
972
|
+
session.escalationLevel = 0;
|
|
973
|
+
session.stalledSince = null;
|
|
974
|
+
}
|
|
975
|
+
} else if (check.action === "none" && session.stalledSince !== null) {
|
|
976
|
+
// Agent recovered — reset escalation tracking
|
|
977
|
+
store.updateEscalation(session.agentName, 0, null);
|
|
978
|
+
session.stalledSince = null;
|
|
979
|
+
session.escalationLevel = 0;
|
|
980
|
+
}
|
|
981
|
+
}
|
|
982
|
+
|
|
983
|
+
// === Tailer cleanup ===
|
|
984
|
+
// Stop tailers for any headless agent that is no longer in the active set
|
|
985
|
+
// (i.e. completed, removed from store, or was never a headless agent).
|
|
986
|
+
for (const [name, handle] of tailerRegistry) {
|
|
987
|
+
if (!activeHeadlessAgents.has(name)) {
|
|
988
|
+
handle.stop();
|
|
989
|
+
tailerRegistry.delete(name);
|
|
990
|
+
}
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
// === Run-level completion detection ===
|
|
994
|
+
// After monitoring individual sessions, check if the entire run is done.
|
|
995
|
+
// Re-resolve the run id defensively (agentplate-87bf): a missing
|
|
996
|
+
// current-run.txt or a stale id (no row in runs table) skips the check
|
|
997
|
+
// and emits one warning per cause for the lifetime of this watchdog.
|
|
998
|
+
const validatedRunId = await resolveRunIdForCompletionCheck(
|
|
999
|
+
agentplateDir,
|
|
1000
|
+
runStore,
|
|
1001
|
+
runIdWarnState,
|
|
1002
|
+
);
|
|
1003
|
+
if (validatedRunId) {
|
|
1004
|
+
await checkRunCompletion({
|
|
1005
|
+
store,
|
|
1006
|
+
runId: validatedRunId,
|
|
1007
|
+
agentplateDir,
|
|
1008
|
+
root,
|
|
1009
|
+
nudge,
|
|
1010
|
+
eventStore,
|
|
1011
|
+
});
|
|
1012
|
+
}
|
|
1013
|
+
} finally {
|
|
1014
|
+
store.close();
|
|
1015
|
+
// Close MailStore only if we created it (not injected)
|
|
1016
|
+
if (mailStore && ownMailStore) {
|
|
1017
|
+
try {
|
|
1018
|
+
mailStore.close();
|
|
1019
|
+
} catch {
|
|
1020
|
+
// Non-fatal
|
|
1021
|
+
}
|
|
1022
|
+
}
|
|
1023
|
+
// Close EventStore only if we created it (not injected)
|
|
1024
|
+
if (eventStore && !useInjectedEventStore) {
|
|
1025
|
+
try {
|
|
1026
|
+
eventStore.close();
|
|
1027
|
+
} catch {
|
|
1028
|
+
// Non-fatal
|
|
1029
|
+
}
|
|
1030
|
+
}
|
|
1031
|
+
// Close RunStore only if we created it (not injected)
|
|
1032
|
+
if (runStore && ownRunStore) {
|
|
1033
|
+
try {
|
|
1034
|
+
runStore.close();
|
|
1035
|
+
} catch {
|
|
1036
|
+
// Non-fatal
|
|
1037
|
+
}
|
|
1038
|
+
}
|
|
1039
|
+
}
|
|
1040
|
+
}
|
|
1041
|
+
|
|
1042
|
+
/**
 * Run the watchdog step that matches `session.escalationLevel`.
 *
 * - Level 0 (warn): records a "warn" escalation event; nothing else happens here.
 * - Level 1 (nudge): sends a forced nudge to the agent and records whether it
 *   was delivered.
 * - Level 2 (escalate): when `tier1Enabled` is set and the per-tick triage
 *   budget is not exhausted, calls `triage` and acts on the verdict:
 *   "terminate" records a tier-1 failure and kills the agent, "retry" sends a
 *   recovery nudge, and any other verdict leaves the agent untouched.
 * - Any other level (3+, terminate): records an "error" event and a tier-0
 *   failure, then kills the agent.
 *
 * @param ctx - The stalled session plus the injected collaborators (tmux and
 *   process control, triage, nudge, event store, failure recorder, connection
 *   registry accessors) used to carry out the step.
 * @returns `terminated` and `stateChanged` are both true only when the agent
 *   was killed, in which case `deathReason` and `deathTier` are also set;
 *   otherwise both flags are false.
 */
async function executeEscalationAction(ctx: {
  session: AgentSession;
  root: string;
  tmuxAlive: boolean;
  tier1Enabled: boolean;
  tmux: {
    isSessionAlive: (name: string) => Promise<boolean>;
    killSession: (name: string) => Promise<void>;
  };
  process: {
    killTree: (pid: number) => Promise<void>;
  };
  triage: (options: {
    agentName: string;
    root: string;
    lastActivity: string;
  }) => Promise<TriageResult | "retry" | "terminate" | "extend">;
  /** Mutable counter bumped by one for every triage call this function makes. */
  triageCount: { value: number };
  /** Triage budget: once `triageCount.value` reaches it, level 2 does nothing. */
  maxTriagePerTick: number;
  nudge: (
    projectRoot: string,
    agentName: string,
    message: string,
    force: boolean,
  ) => Promise<{ delivered: boolean; reason?: string }>;
  eventStore: EventStore | null;
  runId: string | null;
  recordFailure: (
    root: string,
    session: AgentSession,
    reason: string,
    tier: 0 | 1,
    triageSuggestion?: string,
  ) => Promise<void>;
  getConnection: (name: string) => RuntimeConnection | undefined;
  removeConnection: (name: string) => void;
}): Promise<{
  terminated: boolean;
  stateChanged: boolean;
  /** Why and at which tier the agent was killed (present only when `terminated` is true). */
  deathReason?: string;
  deathTier?: 0 | 1;
}> {
  const {
    session,
    root,
    tmuxAlive,
    tier1Enabled,
    tmux,
    process: processControl,
    triage,
    nudge,
    eventStore,
    runId,
    recordFailure,
    triageCount,
    maxTriagePerTick,
    getConnection: lookupConnection,
    removeConnection: dropConnection,
  } = ctx;

  // Result for every path that leaves the agent alive.
  const stillRunning = () => ({ terminated: false, stateChanged: false });

  // Forced nudge (debounce skipped). Delivery problems never abort the
  // watchdog: any error is swallowed and reported as "not delivered".
  const forceNudge = async (message: string): Promise<boolean> => {
    try {
      const outcome = await nudge(root, session.agentName, message, true);
      return outcome.delivered;
    } catch {
      return false;
    }
  };

  // Kill the agent and build the "terminated" result shared by both kill paths.
  const killAndReport = async (deathReason: string, deathTier: 0 | 1) => {
    await killAgent({
      session,
      tmuxAlive,
      tmux,
      process: processControl,
      getConnection: lookupConnection,
      removeConnection: dropConnection,
    });
    return { terminated: true, stateChanged: true, deathReason, deathTier };
  };

  const level = session.escalationLevel;

  // Level 0: warn only — log the escalation and leave the agent alone.
  if (level === 0) {
    recordEvent(eventStore, {
      runId,
      agentName: session.agentName,
      eventType: "custom",
      level: "warn",
      data: { type: "escalation", escalationLevel: 0, action: "warn" },
    });
    return stillRunning();
  }

  // Level 1: nudge the agent, then log whether the nudge got through.
  if (level === 1) {
    const delivered = await forceNudge(
      `[WATCHDOG] Agent "${session.agentName}" appears stalled. Please check your current task and report status.`,
    );
    recordEvent(eventStore, {
      runId,
      agentName: session.agentName,
      eventType: "custom",
      level: "warn",
      data: { type: "nudge", escalationLevel: 1, delivered },
    });
    return stillRunning();
  }

  // Level 2: Tier 1 triage, gated by the feature flag and the per-tick budget.
  if (level === 2) {
    if (!tier1Enabled || triageCount.value >= maxTriagePerTick) {
      return stillRunning();
    }
    triageCount.value += 1;

    const rawVerdict = await triage({
      agentName: session.agentName,
      root,
      lastActivity: session.lastActivity,
    });

    // A bare string verdict is wrapped into the TriageResult shape.
    let triageOutcome: TriageResult;
    if (typeof rawVerdict === "string") {
      triageOutcome = { verdict: rawVerdict, fallback: false };
    } else {
      triageOutcome = rawVerdict;
    }

    recordEvent(eventStore, {
      runId,
      agentName: session.agentName,
      eventType: "custom",
      level: "warn",
      data: {
        type: "triage",
        escalationLevel: 2,
        verdict: triageOutcome.verdict,
        triageFailed: triageOutcome.fallback,
      },
    });

    if (triageOutcome.verdict === "terminate") {
      // Record the tier-1 failure before the agent is killed.
      const triageReason = "AI triage classified as terminal failure";
      await recordFailure(root, session, triageReason, 1, triageOutcome.verdict);
      return killAndReport(triageReason, 1);
    }

    if (triageOutcome.verdict === "retry") {
      await forceNudge(
        "[WATCHDOG] Triage suggests recovery is possible. " +
          "Please retry your current operation or check for errors.",
      );
    }

    // "retry" (after its nudge) and every other verdict leave the session running.
    return stillRunning();
  }

  // Any other level (3+): log, record the tier-0 failure, then kill the agent.
  recordEvent(eventStore, {
    runId,
    agentName: session.agentName,
    eventType: "custom",
    level: "error",
    data: { type: "escalation", escalationLevel: 3, action: "terminate" },
  });

  const escalationReason = "Progressive escalation reached terminal level";
  await recordFailure(root, session, escalationReason, 0);
  return killAndReport(escalationReason, 0);
}
|