@deftai/directive-content 0.55.2 → 0.56.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217) hide show
  1. package/.githooks/pre-commit +143 -0
  2. package/.githooks/pre-push +121 -0
  3. package/QUICK-START.md +2 -2
  4. package/Taskfile.yml +934 -0
  5. package/UPGRADING.md +47 -1
  6. package/events/README.md +3 -3
  7. package/package.json +5 -4
  8. package/scripts/_agents_md.py +494 -0
  9. package/scripts/_cache_fetch.py +635 -0
  10. package/scripts/_cache_quota.py +529 -0
  11. package/scripts/_cache_refresh.py +163 -0
  12. package/scripts/_cache_validate.py +209 -0
  13. package/scripts/_content_root.py +42 -0
  14. package/scripts/_doctor_state.py +277 -0
  15. package/scripts/_event_detect.py +305 -0
  16. package/scripts/_events.py +514 -0
  17. package/scripts/_lifecycle_hygiene.py +568 -0
  18. package/scripts/_pathspec.py +91 -0
  19. package/scripts/_policy_show_cli.py +266 -0
  20. package/scripts/_precutover.py +92 -0
  21. package/scripts/_project_context.py +224 -0
  22. package/scripts/_project_definition_io.py +164 -0
  23. package/scripts/_relocate_snapshot.py +209 -0
  24. package/scripts/_relocate_states.py +343 -0
  25. package/scripts/_resolve_preflight_path.py +152 -0
  26. package/scripts/_safe_subprocess.py +167 -0
  27. package/scripts/_session_start_hook.py +205 -0
  28. package/scripts/_sor_gate_diff.py +365 -0
  29. package/scripts/_stdio_utf8.py +59 -0
  30. package/scripts/_triage_bootstrap_gitignore.py +904 -0
  31. package/scripts/_triage_classify_cli.py +122 -0
  32. package/scripts/_triage_queue_cli.py +625 -0
  33. package/scripts/_triage_scope_cli.py +343 -0
  34. package/scripts/_triage_scope_drift_cli.py +121 -0
  35. package/scripts/_triage_scope_ignores.py +286 -0
  36. package/scripts/_triage_scope_milestone.py +432 -0
  37. package/scripts/_triage_scope_mutations.py +337 -0
  38. package/scripts/_triage_scope_renderers.py +207 -0
  39. package/scripts/_triage_smoketest_stages.py +674 -0
  40. package/scripts/_triage_subscribe_cli.py +140 -0
  41. package/scripts/_triage_welcome_cli.py +421 -0
  42. package/scripts/_vbrief_build.py +239 -0
  43. package/scripts/_vbrief_fidelity.py +479 -0
  44. package/scripts/_vbrief_legacy.py +589 -0
  45. package/scripts/_vbrief_reconciliation.py +883 -0
  46. package/scripts/_vbrief_routing.py +277 -0
  47. package/scripts/_vbrief_safety.py +778 -0
  48. package/scripts/_vbrief_sources.py +312 -0
  49. package/scripts/_vbrief_speckit.py +262 -0
  50. package/scripts/_vbrief_story_quality.py +353 -0
  51. package/scripts/_vbrief_validation.py +299 -0
  52. package/scripts/build_dist.py +412 -0
  53. package/scripts/cache.py +1078 -0
  54. package/scripts/cache_scanner.py +745 -0
  55. package/scripts/candidates_log.py +432 -0
  56. package/scripts/capacity_backfill.py +680 -0
  57. package/scripts/capacity_show.py +653 -0
  58. package/scripts/ci_local.py +689 -0
  59. package/scripts/code_structure_validate.py +765 -0
  60. package/scripts/codebase_default_extractor.py +495 -0
  61. package/scripts/codebase_map.py +304 -0
  62. package/scripts/codebase_map_fresh.py +104 -0
  63. package/scripts/codebase_projection_registry.py +94 -0
  64. package/scripts/codebase_provider.py +582 -0
  65. package/scripts/doctor.py +2257 -0
  66. package/scripts/framework_commands.py +505 -0
  67. package/scripts/gh_rest.py +882 -0
  68. package/scripts/github_auth_modes.py +437 -0
  69. package/scripts/github_body.py +292 -0
  70. package/scripts/ip_risk.py +531 -0
  71. package/scripts/issue_emit.py +670 -0
  72. package/scripts/issue_ingest.py +1064 -0
  73. package/scripts/migrate_preflight.py +418 -0
  74. package/scripts/migrate_vbrief.py +2677 -0
  75. package/scripts/monitor_pr.py +401 -0
  76. package/scripts/pack_migrate_lessons.py +336 -0
  77. package/scripts/pack_migrate_patterns.py +254 -0
  78. package/scripts/pack_migrate_rules.py +350 -0
  79. package/scripts/pack_migrate_skills.py +423 -0
  80. package/scripts/pack_migrate_strategies.py +311 -0
  81. package/scripts/pack_migrate_swarm_spec.py +250 -0
  82. package/scripts/pack_render.py +434 -0
  83. package/scripts/packs_slice.py +712 -0
  84. package/scripts/platform_capabilities.py +336 -0
  85. package/scripts/policy.py +2826 -0
  86. package/scripts/policy_set.py +324 -0
  87. package/scripts/pr_check_closing_keywords.py +524 -0
  88. package/scripts/pr_check_protected_issues.py +267 -0
  89. package/scripts/pr_merge_readiness.py +1004 -0
  90. package/scripts/pr_wait_mergeable.py +669 -0
  91. package/scripts/prd_render.py +159 -0
  92. package/scripts/preflight_architecture_sor.py +974 -0
  93. package/scripts/preflight_branch.py +289 -0
  94. package/scripts/preflight_cache.py +974 -0
  95. package/scripts/preflight_gh.py +721 -0
  96. package/scripts/preflight_implementation.py +272 -0
  97. package/scripts/preflight_story_start.py +838 -0
  98. package/scripts/preflight_wip_cap.py +149 -0
  99. package/scripts/probe_session.py +545 -0
  100. package/scripts/project_render.py +293 -0
  101. package/scripts/quarantine_ext.py +237 -0
  102. package/scripts/reconcile_issues.py +1442 -0
  103. package/scripts/refresh-path.ps1 +107 -0
  104. package/scripts/release.py +2030 -0
  105. package/scripts/release_e2e.py +1011 -0
  106. package/scripts/release_publish.py +486 -0
  107. package/scripts/release_rollback.py +980 -0
  108. package/scripts/relocate.py +1034 -0
  109. package/scripts/resolve_changelog_unreleased.py +667 -0
  110. package/scripts/resolve_version.py +490 -0
  111. package/scripts/resume_conditions.py +706 -0
  112. package/scripts/ritual_sentinel.py +609 -0
  113. package/scripts/roadmap_render.py +635 -0
  114. package/scripts/rule_ownership_lint.py +325 -0
  115. package/scripts/scm.py +591 -0
  116. package/scripts/scope_audit_log.py +387 -0
  117. package/scripts/scope_decompose.py +654 -0
  118. package/scripts/scope_demote.py +509 -0
  119. package/scripts/scope_lifecycle.py +1126 -0
  120. package/scripts/scope_undo.py +772 -0
  121. package/scripts/session_start.py +406 -0
  122. package/scripts/setup_ghx.py +339 -0
  123. package/scripts/setup_windows.ps1 +220 -0
  124. package/scripts/slice_audit.py +585 -0
  125. package/scripts/slice_record.py +530 -0
  126. package/scripts/slice_record_existing.py +692 -0
  127. package/scripts/slug_normalize.py +178 -0
  128. package/scripts/spec_render.py +477 -0
  129. package/scripts/spec_validate.py +238 -0
  130. package/scripts/subagent_monitor.py +658 -0
  131. package/scripts/swarm_complete_cohort.py +644 -0
  132. package/scripts/swarm_launch.py +1206 -0
  133. package/scripts/swarm_readiness.py +554 -0
  134. package/scripts/swarm_verify_review_clean.py +438 -0
  135. package/scripts/swarm_worktrees.py +497 -0
  136. package/scripts/toolchain-check.py +52 -0
  137. package/scripts/triage_actions.py +871 -0
  138. package/scripts/triage_bootstrap.py +1153 -0
  139. package/scripts/triage_bulk.py +630 -0
  140. package/scripts/triage_classify.py +932 -0
  141. package/scripts/triage_help.py +1685 -0
  142. package/scripts/triage_queue.py +1944 -0
  143. package/scripts/triage_reconcile.py +581 -0
  144. package/scripts/triage_refresh.py +643 -0
  145. package/scripts/triage_scope.py +999 -0
  146. package/scripts/triage_scope_drift.py +575 -0
  147. package/scripts/triage_smoketest.py +396 -0
  148. package/scripts/triage_subscribe.py +399 -0
  149. package/scripts/triage_summary.py +1011 -0
  150. package/scripts/triage_welcome.py +1178 -0
  151. package/scripts/ts_check_lane.py +86 -0
  152. package/scripts/validate-links.py +64 -0
  153. package/scripts/validate_strategy_output.py +212 -0
  154. package/scripts/vbrief_activate.py +228 -0
  155. package/scripts/vbrief_migrate_conformance.py +368 -0
  156. package/scripts/vbrief_reconcile_graph.py +306 -0
  157. package/scripts/vbrief_reconcile_labels.py +460 -0
  158. package/scripts/vbrief_reconcile_umbrellas.py +741 -0
  159. package/scripts/vbrief_validate.py +1195 -0
  160. package/scripts/verify-stubs.py +61 -0
  161. package/scripts/verify_capacity.py +160 -0
  162. package/scripts/verify_encoding.py +699 -0
  163. package/scripts/verify_hooks_installed.py +206 -0
  164. package/scripts/verify_investigation.py +360 -0
  165. package/scripts/verify_judgment_gates.py +827 -0
  166. package/scripts/verify_no_task_runtime.py +171 -0
  167. package/scripts/verify_scm_boundary.py +509 -0
  168. package/scripts/verify_session_ritual.py +389 -0
  169. package/scripts/verify_tools.py +426 -0
  170. package/scripts/verify_vbrief_conformance.py +478 -0
  171. package/tasks/architecture.yml +13 -0
  172. package/tasks/cache.yml +69 -0
  173. package/tasks/capacity.yml +38 -0
  174. package/tasks/change.yml +46 -0
  175. package/tasks/changelog.yml +24 -0
  176. package/tasks/ci.yml +49 -0
  177. package/tasks/codebase.yml +47 -0
  178. package/tasks/commit.yml +30 -0
  179. package/tasks/core.yml +126 -0
  180. package/tasks/deployments.yml +54 -0
  181. package/tasks/framework.yml +74 -0
  182. package/tasks/install.yml +60 -0
  183. package/tasks/issue.yml +50 -0
  184. package/tasks/migrate.yml +73 -0
  185. package/tasks/packs.yml +92 -0
  186. package/tasks/policy.yml +75 -0
  187. package/tasks/pr.yml +89 -0
  188. package/tasks/prd.yml +39 -0
  189. package/tasks/project.yml +27 -0
  190. package/tasks/reconcile.yml +32 -0
  191. package/tasks/relocate.yml +56 -0
  192. package/tasks/roadmap.yml +28 -0
  193. package/tasks/scm.yml +126 -0
  194. package/tasks/scope-undo.yml +36 -0
  195. package/tasks/scope.yml +141 -0
  196. package/tasks/session.yml +19 -0
  197. package/tasks/setup.yml +37 -0
  198. package/tasks/slice.yml +69 -0
  199. package/tasks/spec.yml +41 -0
  200. package/tasks/swarm.yml +85 -0
  201. package/tasks/toolchain.yml +13 -0
  202. package/tasks/triage-actions.yml +94 -0
  203. package/tasks/triage-bootstrap.yml +43 -0
  204. package/tasks/triage-bulk.yml +75 -0
  205. package/tasks/triage-classify.yml +30 -0
  206. package/tasks/triage-queue.yml +50 -0
  207. package/tasks/triage-reconcile.yml +29 -0
  208. package/tasks/triage-scope-drift.yml +29 -0
  209. package/tasks/triage-scope.yml +31 -0
  210. package/tasks/triage-smoketest.yml +33 -0
  211. package/tasks/triage-subscribe.yml +36 -0
  212. package/tasks/triage-summary.yml +29 -0
  213. package/tasks/triage-welcome.yml +32 -0
  214. package/tasks/ts.yml +328 -0
  215. package/tasks/vbrief.yml +206 -0
  216. package/tasks/verify.yml +292 -0
  217. package/templates/agents-entry.md +1 -1
@@ -0,0 +1,658 @@
1
+ #!/usr/bin/env python3
2
+ """subagent_monitor.py -- Sub-agent heartbeat watcher (#1365).
3
+
4
+ Walks one or more ``.deft-scratch/subagent-status/`` directories and reports
5
+ the liveness of every heartbeat record found there. The contract those
6
+ records implement is documented at ``docs/subagent-heartbeat.md``; this
7
+ script is the canonical reader.
8
+
9
+ Background (#1365)
10
+ ------------------
11
+ The Grok Build hybrid swarm path dispatches review-cycle sub-agents via
12
+ ``spawn_subagent``. Those agents run in isolated worktrees and have no
13
+ built-in lifecycle channel back to the monitor -- the only signals
14
+ available are side effects (commits, PR comments). The #1166 swarm
15
+ session demonstrated the failure mode: three review-cycle sub-agents
16
+ launched, one reported back, two went completely dark with **zero**
17
+ observable signals. The monitor could not distinguish "still working"
18
+ from "stalled" from "dead".
19
+
20
+ The heartbeat contract closes that gap: every long-running sub-agent
21
+ writes a small JSON record under ``.deft-scratch/subagent-status/`` with
22
+ its agent_id / parent_id / last_heartbeat_at / last_message / phase /
23
+ optional terminal_state. The monitor reads those records and flags
24
+ anything older than the staleness threshold (default 30 minutes).
25
+
26
+ This script intentionally does NOT shell out to ``gh`` or any other
27
+ external CLI -- the heartbeat surface is on-disk by design so a network
28
+ partition or rate-limit ceiling cannot mask agent liveness. The
29
+ ``scripts/_safe_subprocess.py`` UTF-8 helper is imported in case a
30
+ future caller wants to surface gh-derived context alongside the
31
+ heartbeat report (per the AGENTS.md ``## Safe subprocess capture
32
+ (#1366)`` rule that mandates routing every gh capture through the
33
+ helper), but the core liveness path is filesystem-only.
34
+
35
+ Usage
36
+ -----
37
+ # Scan the default project-root scratch dir
38
+ uv --project . run python scripts/subagent_monitor.py
39
+
40
+ # Scan one or more explicit scratch dirs (one per agent worktree)
41
+ uv --project . run python scripts/subagent_monitor.py \\
42
+ --scratch-dir C:/Repos/deft-agent3-1365/.deft-scratch/subagent-status \\
43
+ --scratch-dir C:/Repos/deft-agent4-1368/.deft-scratch/subagent-status
44
+
45
+ # Tighter threshold for impatient monitors
46
+ uv --project . run python scripts/subagent_monitor.py --threshold-minutes 5
47
+
48
+ # Machine-readable output for parent monitor agents
49
+ uv --project . run python scripts/subagent_monitor.py --json
50
+
51
+ Exit codes (three-state, mirrors task verify:cache-fresh / task
52
+ pr:merge-ready / task swarm:verify-review-clean):
53
+
54
+ 0 -- every record is fresher than threshold AND parses cleanly
55
+ 1 -- one or more records is stale OR malformed
56
+ 2 -- config error (no scratch dirs given AND no default found, or
57
+ invalid --threshold-minutes)
58
+
59
+ Pure stdlib; no third-party deps. Re-uses ``scripts/_safe_subprocess.py``
60
+ solely so any future gh capture inside this script routes through the
61
+ canonical UTF-8-safe helper (per AGENTS.md ``## Safe subprocess capture
62
+ (#1366)``).
63
+ """
64
+
65
+ from __future__ import annotations
66
+
67
+ import argparse
68
+ import json
69
+ import sys
70
+ from dataclasses import dataclass, field
71
+ from datetime import UTC, datetime, timedelta
72
+ from pathlib import Path
73
+
74
+ # Make sibling scripts importable both when run as __main__ and when imported
75
+ # by tests (mirrors scripts/swarm_verify_review_clean.py + pr_merge_readiness.py
76
+ # layout so the import seam is consistent across the swarm-verb cluster).
77
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
78
+
79
+ try:
80
+ from _stdio_utf8 import reconfigure_stdio # noqa: E402
81
+ reconfigure_stdio()
82
+ except ImportError:
83
+ # _stdio_utf8 is optional; some test contexts load this module directly.
84
+ pass
85
+
86
+ # UTF-8-safe subprocess capture (#1366). The monitor itself does not shell
87
+ # out today -- the heartbeat surface is on-disk by design -- but the
88
+ # AGENTS.md ``## Safe subprocess capture (#1366)`` rule mandates that any
89
+ # script that MIGHT shell out for parsable output (and the monitor is one
90
+ # adjacent edit away from inspecting a Greptile body on behalf of a dark
91
+ # sub-agent) imports the helper from day one. Importing here keeps the
92
+ # contract visible at the module level so the next maintainer reaches for
93
+ # ``run_text`` without thinking.
94
+ from _safe_subprocess import run_text # noqa: E402, F401
95
+
96
# Process exit codes (three-state contract described in the module docstring):
# 0 = every record fresh and well-formed, 1 = at least one stale or malformed
# record, 2 = configuration error.
EXIT_OK = 0
EXIT_STALE = 1
EXIT_EXTERNAL_ERROR = 2

# Staleness threshold, in minutes, used when the operator does not pass
# ``--threshold-minutes``. Calibrated for the review-cycle poller cadence
# (90s polls, 30-minute caps).
DEFAULT_THRESHOLD_MINUTES = 30

# Phase names permitted by docs/subagent-heartbeat.md. The enum is a hard
# contract: a record whose phase is not listed here is reported as MALFORMED
# (exit 1) rather than silently accepted, so a writer-side typo is surfaced
# to the operator. Extending the taxonomy means adding a value here AND in
# the docs file -- the tests pin the two as the same enumeration.
CANONICAL_PHASES = frozenset(
    (
        "starting",
        "implementing",
        "validating",
        "committing",
        "pushing",
        "polling",
        "fixing",
        "terminal",
    )
)

# Keys every heartbeat record must carry per docs/subagent-heartbeat.md; a
# record missing any of them is malformed (exit 1). The optional keys
# (terminal_state, pr_number, extra) are not enforced.
REQUIRED_FIELDS = (
    "agent_id",
    "parent_id",
    "last_heartbeat_at",
    "last_message",
    "phase",
)

# Zero UTC offset, built once at import time.
_UTC_ZERO_OFFSET = timedelta(0)
132
+
133
+
134
+ # ---------------------------------------------------------------------------
135
+ # Heartbeat record parsing
136
+ # ---------------------------------------------------------------------------
137
+
138
+
139
@dataclass
class HeartbeatRecord:
    """One heartbeat file as seen by the monitor.

    Any entry in ``failures`` marks the record as malformed. The optional
    fields hold ``None`` when the value was not populated by the parser.
    """

    path: str
    agent_id: str | None
    parent_id: str | None
    # Timestamp text exactly as it appeared in the payload.
    last_heartbeat_at_iso: str | None
    # Parsed form of the timestamp; not included in ``to_dict`` output.
    last_heartbeat_at: datetime | None
    last_message: str | None
    phase: str | None
    terminal_state: str | None
    pr_number: int | None
    age_seconds: float | None
    is_terminal: bool
    is_stale: bool
    failures: list[str] = field(default_factory=list)

    @property
    def ok(self) -> bool:
        """True only when the record is neither malformed nor stale."""
        return not (self.failures or self.is_stale)

    def to_dict(self) -> dict:
        """Return a plain-dict view of the record.

        The ``last_heartbeat_at`` key carries the original timestamp text
        (``last_heartbeat_at_iso``), and ``failures`` is a copy so callers
        cannot mutate the record through the returned dict.
        """
        view: dict = {
            "path": self.path,
            "agent_id": self.agent_id,
            "parent_id": self.parent_id,
            "last_heartbeat_at": self.last_heartbeat_at_iso,
        }
        # These keys map one-to-one onto attributes of the same name.
        for name in (
            "last_message",
            "phase",
            "terminal_state",
            "pr_number",
            "age_seconds",
            "is_terminal",
            "is_stale",
        ):
            view[name] = getattr(self, name)
        view["failures"] = [*self.failures]
        view["ok"] = self.ok
        return view
176
+
177
+
178
def _parse_iso8601_utc(value: str) -> datetime | None:
    """Return *value* as a timezone-aware UTC ``datetime``, or ``None``.

    Accepted inputs are ISO-8601 strings whose offset is zero, written either
    with a trailing ``Z`` or as ``+00:00``. ``None`` is returned for anything
    else: a non-string or empty value, text ``datetime.fromisoformat`` cannot
    parse, a naive timestamp, or an offset other than UTC.
    """
    if not (isinstance(value, str) and value):
        return None

    text = value.strip()
    # Spell a trailing ``Z`` as an explicit zero offset before handing the
    # text to fromisoformat.
    if text.endswith("Z"):
        text = f"{text[:-1]}+00:00"

    try:
        moment = datetime.fromisoformat(text)
    except ValueError:
        return None

    # A naive timestamp carries no offset to verify and would otherwise be
    # treated like local time, so it is rejected outright.
    if moment.tzinfo is None:
        return None
    return moment if moment.utcoffset() == _UTC_ZERO_OFFSET else None
205
+
206
+
207
def parse_heartbeat_file(
    path: Path,
    *,
    now: datetime,
    threshold_seconds: float,
) -> HeartbeatRecord:
    """Parse one heartbeat file into a ``HeartbeatRecord``.

    Problems with the file are never raised. Each one is appended to the
    record's ``failures`` list so that a single unreadable or corrupt file
    cannot abort the sweep it is part of.

    Args:
        path: Heartbeat file to read. Its stem is compared against the
            payload's ``agent_id``.
        now: Reference instant for the age computation. The caller passes
            one value for a whole sweep so every record is judged against
            the same instant. It must be timezone-aware, because it is
            subtracted from the (aware) parsed heartbeat timestamp.
        threshold_seconds: Age above which a non-terminal record is stale.

    Returns:
        The record, populated with whatever well-typed fields were present
        even when the payload as a whole is malformed.
    """
    rec = HeartbeatRecord(
        path=str(path),
        agent_id=None,
        parent_id=None,
        last_heartbeat_at_iso=None,
        last_heartbeat_at=None,
        last_message=None,
        phase=None,
        terminal_state=None,
        pr_number=None,
        age_seconds=None,
        is_terminal=False,
        is_stale=False,
    )

    try:
        raw = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError, so it has to be
        # named explicitly or a file with invalid UTF-8 bytes would escape.
        rec.failures.append(f"unreadable: {exc}")
        return rec

    try:
        payload = json.loads(raw)
    except json.JSONDecodeError as exc:
        rec.failures.append(f"malformed JSON: {exc.msg} at line {exc.lineno}")
        return rec
    except RecursionError:
        # Pathologically nested input overflows the decoder's recursion.
        rec.failures.append("malformed JSON: nesting too deep to parse")
        return rec

    if not isinstance(payload, dict):
        rec.failures.append(
            f"top-level must be a JSON object, got {type(payload).__name__}"
        )
        return rec

    # Presence check: report every missing key in one diagnostic so the
    # operator sees the full gap at once.
    missing = [f for f in REQUIRED_FIELDS if f not in payload]
    if missing:
        rec.failures.append(f"missing required field(s): {', '.join(missing)}")

    # Type check: a key that is present but not a string (e.g. ``null`` or a
    # numeric epoch) passes the presence check above and is skipped by the
    # isinstance guards below, so it must be flagged here or the record
    # would be reported as healthy.
    wrong_type = [
        f for f in REQUIRED_FIELDS
        if f in payload and not isinstance(payload[f], str)
    ]
    if wrong_type:
        types = ", ".join(
            f"{f}={type(payload[f]).__name__}" for f in wrong_type
        )
        rec.failures.append(
            f"required field(s) must be string, got: {types}"
        )

    # Populate whatever well-typed fields exist, even on a malformed record,
    # so the report can show the partial state.
    if isinstance(payload.get("agent_id"), str):
        rec.agent_id = payload["agent_id"]
    if isinstance(payload.get("parent_id"), str):
        rec.parent_id = payload["parent_id"]
    if isinstance(payload.get("last_message"), str):
        rec.last_message = payload["last_message"]
    if isinstance(payload.get("phase"), str):
        rec.phase = payload["phase"]
    if isinstance(payload.get("terminal_state"), str):
        rec.terminal_state = payload["terminal_state"]
    pr_num = payload.get("pr_number")
    # bool is a subclass of int; JSON true/false is not a PR number.
    if isinstance(pr_num, int) and not isinstance(pr_num, bool):
        rec.pr_number = pr_num

    # Identity cross-check: the file name (without .json) must equal the
    # payload's agent_id.
    expected_id = path.stem
    if rec.agent_id is not None and rec.agent_id != expected_id:
        rec.failures.append(
            f"agent_id mismatch: file is '{expected_id}.json' but payload has "
            f"agent_id={rec.agent_id!r}"
        )

    # Timestamp parse and age computation.
    ts_value = payload.get("last_heartbeat_at")
    if isinstance(ts_value, str):
        rec.last_heartbeat_at_iso = ts_value
        parsed_ts = _parse_iso8601_utc(ts_value)
        if parsed_ts is None:
            rec.failures.append(
                "last_heartbeat_at not ISO-8601 UTC (must end in 'Z' or "
                f"'+00:00'): {ts_value!r}"
            )
        else:
            rec.last_heartbeat_at = parsed_ts
            rec.age_seconds = (now - parsed_ts).total_seconds()

    # An unrecognized phase is a malformed record, not a forward-compat
    # signal (see the comment on CANONICAL_PHASES).
    if rec.phase is not None and rec.phase not in CANONICAL_PHASES:
        rec.failures.append(
            f"unknown phase {rec.phase!r}; expected one of "
            f"{sorted(CANONICAL_PHASES)}"
        )

    # phase == 'terminal' requires a terminal_state. The reverse is allowed:
    # terminal_state may be set while the phase is still a mid-flight one.
    if rec.phase == "terminal" and not rec.terminal_state:
        rec.failures.append(
            "phase='terminal' requires a non-empty terminal_state field"
        )
    rec.is_terminal = bool(rec.terminal_state)

    # A record with a terminal_state is never stale; any other record is
    # stale once its age exceeds the threshold.
    if (
        rec.age_seconds is not None
        and not rec.is_terminal
        and rec.age_seconds > threshold_seconds
    ):
        rec.is_stale = True

    return rec
360
+
361
+
362
+ # ---------------------------------------------------------------------------
363
+ # Sweep + report
364
+ # ---------------------------------------------------------------------------
365
+
366
+
367
@dataclass
class SweepResult:
    """Outcome of scanning one or more scratch directories."""

    scratch_dirs: list[str]
    threshold_minutes: float
    now_iso: str
    records: list[HeartbeatRecord] = field(default_factory=list)
    sweep_errors: list[str] = field(default_factory=list)

    @property
    def all_ok(self) -> bool:
        """True when there are no directory errors and every record is ok.

        A sweep with no errors and no records is also OK: there is nothing
        stale or malformed to report.
        """
        if self.sweep_errors:
            return False
        return all(rec.ok for rec in self.records)

    def to_dict(self) -> dict:
        """Return a plain-dict summary, including per-record dicts and counts."""
        stale_total = len([rec for rec in self.records if rec.is_stale])
        malformed_total = len([rec for rec in self.records if rec.failures])
        return {
            "scratch_dirs": [*self.scratch_dirs],
            "threshold_minutes": self.threshold_minutes,
            "now": self.now_iso,
            "record_count": len(self.records),
            "stale_count": stale_total,
            "malformed_count": malformed_total,
            "all_ok": self.all_ok,
            "records": [rec.to_dict() for rec in self.records],
            "sweep_errors": [*self.sweep_errors],
        }
399
+
400
+
401
def sweep_scratch_dirs(
    scratch_dirs: list[Path],
    *,
    threshold_minutes: float,
    now: datetime | None = None,
) -> SweepResult:
    """Scan every scratch dir and parse each ``*.json`` record found there.

    Directory-level problems (missing path, not a directory, permission
    denied) are recorded in ``sweep_errors`` and the sweep continues with the
    next directory. Per-record problems are captured on the record itself by
    ``parse_heartbeat_file``.

    Args:
        scratch_dirs: Directories to scan, in report order.
        threshold_minutes: Staleness threshold passed on (as seconds) to the
            per-record parser.
        now: Reference instant shared by every record in the sweep; defaults
            to the current UTC time.

    Returns:
        The aggregate ``SweepResult``.
    """
    # Local import: only this function needs it.
    from fnmatch import fnmatch

    if now is None:
        now = datetime.now(UTC)
    threshold_seconds = threshold_minutes * 60.0

    # The literal "Z" suffix is only truthful for UTC, so an aware timestamp
    # in another zone is converted before formatting. A naive value is
    # formatted as given.
    stamp = now.astimezone(UTC) if now.tzinfo is not None else now
    result = SweepResult(
        scratch_dirs=[str(p) for p in scratch_dirs],
        threshold_minutes=threshold_minutes,
        now_iso=stamp.strftime("%Y-%m-%dT%H:%M:%SZ"),
    )

    for d in scratch_dirs:
        try:
            if not d.exists():
                result.sweep_errors.append(f"scratch dir does not exist: {d}")
                continue
            if not d.is_dir():
                result.sweep_errors.append(
                    f"scratch path is not a directory: {d}"
                )
                continue
            # iterdir() raises on an unreadable directory, whereas
            # Path.glob() swallows the scandir error and yields nothing --
            # which would report a permission-denied scratch dir as empty
            # and healthy. fnmatch keeps the same "*.json" name matching.
            children = sorted(
                entry for entry in d.iterdir() if fnmatch(entry.name, "*.json")
            )
        except OSError as exc:
            result.sweep_errors.append(f"scratch dir unreadable {d}: {exc}")
            continue
        for child in children:
            try:
                if not child.is_file():
                    # Skip directories that happen to end in .json.
                    continue
            except OSError:
                # The entry cannot be stat'ed; fall through so the parser
                # records it as an unreadable heartbeat instead of hiding it.
                pass
            rec = parse_heartbeat_file(
                child, now=now, threshold_seconds=threshold_seconds
            )
            result.records.append(rec)

    return result
446
+
447
+
448
+ # ---------------------------------------------------------------------------
449
+ # Rendering
450
+ # ---------------------------------------------------------------------------
451
+
452
+
453
def _format_age(seconds: float | None) -> str:
    """Format an age in seconds as a short human-readable string.

    ``None`` renders as ``<unknown>``. Values under a minute render as whole
    seconds (``45s``), values under an hour as minutes with one decimal
    (``1.5m``), and anything longer as hours with one decimal (``2.0h``).
    """
    if seconds is None:
        return "<unknown>"
    if seconds < 60:
        return format(seconds, ".0f") + "s"
    minutes = seconds / 60.0
    return f"{minutes:.1f}m" if minutes < 60 else f"{minutes / 60.0:.1f}h"
463
+
464
+
465
def render_text(result: SweepResult) -> str:
    """Render a sweep result as a multi-line report for human readers.

    The report consists of a header line, the scanned scratch directories,
    any directory-level errors, one stanza per record, and a closing
    ``Result:`` verdict line.

    Args:
        result: The sweep to describe.

    Returns:
        The report lines joined with newlines (no trailing newline).
    """
    records = result.records
    dir_errors = result.sweep_errors
    nothing_seen = not records and not dir_errors

    count = len(records)
    plural = "" if count == 1 else "s"
    out: list[str] = [
        f"Sub-agent heartbeat sweep ({count} record{plural}, "
        f"threshold {result.threshold_minutes:g} min, now={result.now_iso})"
    ]
    out.extend(f" Scratch dir: {d}" for d in result.scratch_dirs)
    if dir_errors:
        out.append(" Sweep errors:")
        out.extend(f" [!] {err}" for err in dir_errors)
    if nothing_seen:
        out += ["", " No heartbeat records found (empty scratch dir)."]

    for rec in records:
        # Malformed takes precedence over stale, stale over terminal.
        if rec.failures:
            status = "STALE+MALFORMED" if rec.is_stale else "MALFORMED"
        elif rec.is_stale:
            status = "STALE"
        elif rec.is_terminal:
            status = "TERMINAL"
        else:
            status = "OK"
        # Fall back to the file name when the payload gave no usable id.
        agent = rec.agent_id or Path(rec.path).stem
        out += [
            "",
            f" {agent} -- {status}",
            f" Path: {rec.path}",
            f" Parent: {rec.parent_id or '<unset>'}",
            f" Last heartbeat: {rec.last_heartbeat_at_iso or '<unparsed>'} "
            f"(age {_format_age(rec.age_seconds)})",
            f" Phase: {rec.phase or '<unset>'}",
        ]
        if rec.pr_number is not None:
            out.append(f" PR: #{rec.pr_number}")
        if rec.terminal_state:
            out.append(f" Terminal state: {rec.terminal_state}")
        if rec.last_message:
            out.append(f" Last message: {rec.last_message}")
        out.extend(
            f" [{idx}] {fail}" for idx, fail in enumerate(rec.failures, 1)
        )

    out.append("")
    if nothing_seen:
        out.append(
            "Result: NO AGENTS TO MONITOR -- empty scratch dir (no stale state)"
        )
        return "\n".join(out)
    if result.all_ok:
        out.append("Result: ALL AGENTS ALIVE -- no stale or malformed records")
        return "\n".join(out)

    stale = len([r for r in records if r.is_stale])
    malformed = len([r for r in records if r.failures])
    n_dir_errors = len(dir_errors)
    if n_dir_errors and not stale and not malformed:
        # Only directory loading failed and every record present is healthy:
        # point the operator at the scratch-dir paths rather than at
        # re-dispatching agents.
        healthy = len(records)
        out.append(
            f"Result: ATTENTION -- {n_dir_errors} scratch dir "
            f"error(s); {healthy} record(s) healthy. Verify each "
            "--scratch-dir path; correct the misconfigured or "
            "missing directories surfaced above."
        )
    else:
        dir_tail = (
            f", {n_dir_errors} scratch dir error(s)" if n_dir_errors else ""
        )
        out.append(
            f"Result: ATTENTION -- {stale} stale, {malformed} "
            f"malformed record(s){dir_tail}. Inspect diagnostics "
            "above and either re-dispatch the stalled agent(s) "
            "or take over manually."
        )
    return "\n".join(out)
552
+
553
+
554
+ # ---------------------------------------------------------------------------
555
+ # CLI
556
+ # ---------------------------------------------------------------------------
557
+
558
+
559
def _default_scratch_dir() -> Path:
    """Return the fallback heartbeat directory for the current working dir.

    The value is ``<cwd>/.deft-scratch/subagent-status``, computed from the
    working directory at call time. The monitor is expected to be launched
    from the parent's working directory (typically the swarm root), so this
    points at that root's scratch dir. Multi-worktree setups should pass
    ``--scratch-dir`` explicitly, once per worktree, instead of relying on
    this default.
    """
    return Path.cwd().joinpath(".deft-scratch", "subagent-status")
567
+
568
+
569
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the command-line parser for the heartbeat monitor.

    Options registered on the returned parser:

    * ``--scratch-dir PATH`` -- repeatable; each occurrence is appended to
      ``scratch_dirs`` (an empty list when the flag is never given).
    * ``--threshold-minutes N`` -- parsed as ``float`` into
      ``threshold_minutes``; defaults to ``DEFAULT_THRESHOLD_MINUTES``.
    * ``--json`` -- boolean switch stored as ``emit_json``.
    """
    # The user-facing texts are built first so the registration calls below
    # read as a flat table of options.
    summary = (
        "Sub-agent heartbeat watcher (#1365). Walks one or more "
        ".deft-scratch/subagent-status/ directories and reports the "
        "liveness of every heartbeat record. Three-state exit: 0 ok, "
        "1 stale or malformed, 2 config error."
    )
    scratch_help = (
        "Path to a .deft-scratch/subagent-status/ directory. May be "
        "passed multiple times (one per agent worktree). Defaults to "
        "<cwd>/.deft-scratch/subagent-status when omitted."
    )
    threshold_help = (
        f"Staleness threshold in minutes. Records older than this whose "
        f"terminal_state is empty are flagged STALE. Default: "
        f"{DEFAULT_THRESHOLD_MINUTES}."
    )
    json_help = "Emit the sweep result as a single JSON object on stdout."

    cli = argparse.ArgumentParser(prog="subagent_monitor", description=summary)
    cli.add_argument(
        "--scratch-dir",
        dest="scratch_dirs",
        action="append",
        default=[],
        metavar="PATH",
        help=scratch_help,
    )
    cli.add_argument(
        "--threshold-minutes",
        dest="threshold_minutes",
        type=float,
        default=DEFAULT_THRESHOLD_MINUTES,
        metavar="N",
        help=threshold_help,
    )
    cli.add_argument(
        "--json",
        dest="emit_json",
        action="store_true",
        help=json_help,
    )
    return cli
610
+
611
+
612
def main(argv: list[str] | None = None) -> int:
    """Run one heartbeat sweep and return the process exit status.

    Args:
        argv: Argument vector to parse. ``None`` defers to argparse's
            default of reading the process command line.

    Returns:
        ``EXIT_EXTERNAL_ERROR`` when ``--threshold-minutes`` is not a
        strictly positive number, or when the sweep reported errors and
        yielded no records at all; ``EXIT_OK`` when ``result.all_ok`` is
        truthy; ``EXIT_STALE`` otherwise.
    """
    args = _build_parser().parse_args(argv)

    # ``type=float`` accepts the literal "nan", and every ordered comparison
    # against NaN is False -- so a plain ``<= 0`` check would wave it
    # through. Testing "not strictly positive" rejects zero, negatives and
    # NaN alike.
    if not args.threshold_minutes > 0:
        print(
            f"Error: --threshold-minutes must be positive, got "
            f"{args.threshold_minutes}",
            file=sys.stderr,
        )
        return EXIT_EXTERNAL_ERROR

    # Fall back to the cwd-relative default only when no --scratch-dir was
    # supplied.
    scratch_paths: list[Path] = (
        [Path(p) for p in args.scratch_dirs]
        if args.scratch_dirs
        else [_default_scratch_dir()]
    )

    result = sweep_scratch_dirs(
        scratch_paths,
        threshold_minutes=args.threshold_minutes,
    )

    # Sweep errors combined with zero records are treated as a
    # configuration problem, so a missing or unreadable scratch dir does
    # not silently masquerade as "all agents alive".
    # NOTE(review): an earlier comment here said an existing-but-empty
    # scratch dir also exits with EXIT_EXTERNAL_ERROR. That case has no
    # sweep errors, so in this function it is decided by ``result.all_ok``
    # below -- confirm which behaviour is intended.
    config_error = bool(result.sweep_errors) and not result.records

    # The report is always emitted, even on the config-error path, so the
    # operator sees the individual sweep errors.
    if args.emit_json:
        print(json.dumps(result.to_dict(), indent=2))
    else:
        print(render_text(result))

    if config_error:
        return EXIT_EXTERNAL_ERROR
    if result.all_ok:
        return EXIT_OK
    return EXIT_STALE
655
+
656
+
657
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())