@kontourai/flow-agents 1.4.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/.github/CODEOWNERS +29 -0
  2. package/.github/actions/trust-verify/action.yml +145 -0
  3. package/.github/workflows/ci.yml +11 -4
  4. package/.github/workflows/kit-gates-demo.yml +2 -2
  5. package/.github/workflows/publish-npm.yml +10 -2
  6. package/.github/workflows/release-please.yml +1 -1
  7. package/.github/workflows/runtime-compat.yml +1 -1
  8. package/.github/workflows/trust-reconcile.yml +113 -0
  9. package/AGENTS.md +13 -0
  10. package/CHANGELOG.md +103 -0
  11. package/CONTRIBUTING.md +4 -4
  12. package/README.md +1 -0
  13. package/agents/tool-planner.json +1 -1
  14. package/build/src/cli/init.js +242 -20
  15. package/build/src/cli/validate-workflow-artifacts.js +19 -2
  16. package/build/src/cli/verify.d.ts +1 -0
  17. package/build/src/cli/verify.js +90 -0
  18. package/build/src/cli/workflow-sidecar.d.ts +316 -8
  19. package/build/src/cli/workflow-sidecar.js +1996 -91
  20. package/build/src/cli.js +2 -3
  21. package/build/src/lib/flow-resolver.d.ts +111 -0
  22. package/build/src/lib/flow-resolver.js +308 -0
  23. package/build/src/tools/build-universal-bundles.js +34 -22
  24. package/build/src/tools/generate-context-map.js +3 -16
  25. package/build/src/tools/validate-source-tree.d.ts +1 -1
  26. package/build/src/tools/validate-source-tree.js +42 -162
  27. package/context/contracts/artifact-contract.md +10 -0
  28. package/context/contracts/delivery-contract.md +1 -0
  29. package/context/contracts/review-contract.md +1 -0
  30. package/context/contracts/verification-contract.md +2 -0
  31. package/context/gate-awareness.md +39 -0
  32. package/context/scripts/hooks/stop-goal-fit.js +632 -70
  33. package/docs/adr/0001-flow-agents-consumes-flow.md +1 -1
  34. package/docs/adr/0002-flow-kits-as-extension-unit.md +1 -1
  35. package/docs/adr/0004-gates-expect-surface-claims.md +2 -0
  36. package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +2 -0
  37. package/docs/adr/0007-skill-audit.md +1 -1
  38. package/docs/adr/0009-canonical-hook-core-kit-boundary.md +95 -0
  39. package/docs/adr/0010-workflow-trust-state-as-hachure-bundle.md +139 -0
  40. package/docs/adr/0011-mcp-posture.md +100 -0
  41. package/docs/adr/0012-agent-coordination-as-liveness-claims.md +119 -0
  42. package/docs/adr/0013-context-lifecycle.md +151 -0
  43. package/docs/adr/0014-core-vs-domain-kit-boundary.md +143 -0
  44. package/docs/adr/0015-flow-flow-agents-boundary-reconciliation.md +120 -0
  45. package/docs/adr/0016-three-hard-boundary-model.md +71 -0
  46. package/docs/adr/0017-anti-gaming-trust-security-model.md +155 -0
  47. package/docs/agent-system-guidebook.md +5 -12
  48. package/docs/context-map.md +4 -10
  49. package/docs/index.md +3 -2
  50. package/docs/integrations/framework-adapter.md +19 -6
  51. package/docs/integrations/index.md +2 -2
  52. package/docs/north-star.md +4 -4
  53. package/docs/operating-layers.md +3 -3
  54. package/docs/plans/adr-0010-phase2-gate-recompute.md +55 -0
  55. package/docs/repository-structure.md +2 -2
  56. package/docs/skills-map.md +1 -0
  57. package/docs/spec/runtime-hook-surface.md +62 -9
  58. package/docs/standards-register.md +3 -3
  59. package/docs/survey-utterance-check.md +1 -1
  60. package/docs/trust-anchor-adoption.md +197 -0
  61. package/docs/verifiable-trust.md +95 -0
  62. package/docs/veritas-integration.md +2 -2
  63. package/docs/workflow-usage-guide.md +69 -0
  64. package/evals/acceptance/DEMO-false-completion.md +144 -0
  65. package/evals/acceptance/demo-cast.sh +92 -0
  66. package/evals/acceptance/demo-false-completion.sh +72 -0
  67. package/evals/acceptance/demo-real-evidence.sh +104 -0
  68. package/evals/acceptance/demo.tape +29 -0
  69. package/evals/acceptance/prove-capture-teeth-declared.sh +335 -0
  70. package/evals/acceptance/prove-capture-teeth.sh +114 -0
  71. package/evals/acceptance/prove-teeth.sh +105 -0
  72. package/evals/ci/antigaming-suite.sh +55 -0
  73. package/evals/ci/run-baseline.sh +2 -0
  74. package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/flows/review.flow.json +26 -0
  75. package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/kit.json +20 -0
  76. package/evals/fixtures/flow-kit-repository/valid-unknown-extension/flows/review.flow.json +26 -0
  77. package/evals/fixtures/flow-kit-repository/valid-unknown-extension/kit.json +18 -0
  78. package/evals/integration/test_builder_step_producers.sh +379 -0
  79. package/evals/integration/test_bundle_install.sh +35 -71
  80. package/evals/integration/test_bundle_lifecycle.sh +39 -2
  81. package/evals/integration/test_captured_fail_reconciliation.sh +820 -0
  82. package/evals/integration/test_checkpoint_signing.sh +489 -0
  83. package/evals/integration/test_claim_lookup.sh +352 -0
  84. package/evals/integration/test_command_log_fork_classification.sh +134 -0
  85. package/evals/integration/test_command_log_integrity.sh +275 -0
  86. package/evals/integration/test_context_map.sh +0 -2
  87. package/evals/integration/test_dual_emit_flow_step.sh +278 -0
  88. package/evals/integration/test_enforcer_expects_driven.sh +281 -0
  89. package/evals/integration/test_evidence_capture_hook.sh +185 -0
  90. package/evals/integration/test_flow_kit_repository.sh +2 -0
  91. package/evals/integration/test_flowdef_session_activation.sh +273 -0
  92. package/evals/integration/test_flowdef_session_history_preservation.sh +250 -0
  93. package/evals/integration/test_gate_bypass_chain.sh +448 -0
  94. package/evals/integration/test_gate_lockdown.sh +1137 -0
  95. package/evals/integration/test_gate_review_inquiry_records.sh +399 -0
  96. package/evals/integration/test_goal_fit_escape_hatch.sh +73 -0
  97. package/evals/integration/test_goal_fit_hook.sh +69 -4
  98. package/evals/integration/test_goal_fit_rederive.sh +263 -0
  99. package/evals/integration/test_install_merge.sh +1176 -0
  100. package/evals/integration/test_kit_identity_trust.sh +393 -0
  101. package/evals/integration/test_mint_attestation.sh +373 -0
  102. package/evals/integration/test_phase_map_and_gate_claim.sh +365 -0
  103. package/evals/integration/test_publish_delivery.sh +269 -0
  104. package/evals/integration/test_reconcile_soundness.sh +528 -0
  105. package/evals/integration/test_resolvefirststep_security.sh +208 -0
  106. package/evals/integration/test_session_resume_roundtrip.sh +286 -0
  107. package/evals/integration/test_trust_checkpoint.sh +325 -0
  108. package/evals/integration/test_trust_reconcile.sh +293 -0
  109. package/evals/integration/test_verify_cli.sh +208 -0
  110. package/evals/integration/test_workflow_sidecar_writer.sh +549 -34
  111. package/evals/lib/node.sh +0 -6
  112. package/evals/run.sh +47 -0
  113. package/evals/static/test_workflow_skills.sh +6 -13
  114. package/install.sh +0 -7
  115. package/integrations/strands-ts/README.md +25 -15
  116. package/integrations/veritas/flow-agents.adapter.json +1 -2
  117. package/kits/builder/flows/build.flow.json +59 -12
  118. package/kits/builder/kit.json +85 -15
  119. package/kits/builder/skills/continue-work/SKILL.md +116 -0
  120. package/kits/builder/skills/deliver/SKILL.md +36 -6
  121. package/kits/builder/skills/design-probe/SKILL.md +28 -0
  122. package/kits/builder/skills/execute-plan/SKILL.md +9 -1
  123. package/kits/builder/skills/gate-review/SKILL.md +234 -0
  124. package/kits/builder/skills/learning-review/SKILL.md +30 -0
  125. package/kits/builder/skills/pickup-probe/SKILL.md +29 -0
  126. package/kits/builder/skills/plan-work/SKILL.md +13 -1
  127. package/kits/builder/skills/pull-work/SKILL.md +19 -0
  128. package/kits/knowledge/adapters/default-store/index.js +38 -0
  129. package/kits/knowledge/adapters/flow-runner/index.js +1620 -0
  130. package/kits/knowledge/adapters/obsidian-store/index.js +36 -6
  131. package/kits/knowledge/docs/store-contract.md +314 -0
  132. package/kits/knowledge/evals/audit-freshness/suite.test.js +368 -0
  133. package/kits/knowledge/evals/canonicalize-category/suite.test.js +383 -0
  134. package/kits/knowledge/evals/contract-suite/suite.test.js +111 -0
  135. package/kits/knowledge/evals/detect-contradictions/suite.test.js +324 -0
  136. package/kits/knowledge/evals/entities/suite.test.js +40 -0
  137. package/kits/knowledge/evals/glossary-sync/suite.test.js +416 -0
  138. package/kits/knowledge/evals/hygiene-review/suite.test.js +396 -0
  139. package/kits/knowledge/evals/retirement/suite.test.js +145 -0
  140. package/kits/knowledge/flows/audit-freshness.flow.json +44 -0
  141. package/kits/knowledge/flows/canonicalize-category.flow.json +44 -0
  142. package/kits/knowledge/flows/detect-contradictions.flow.json +44 -0
  143. package/kits/knowledge/flows/glossary-sync.flow.json +61 -0
  144. package/kits/knowledge/flows/hygiene-review.flow.json +43 -0
  145. package/kits/knowledge/kit.json +51 -1
  146. package/package.json +6 -6
  147. package/packaging/conformance/README.md +10 -2
  148. package/packaging/conformance/fixtures/evidence-capture--allow-records-command.json +29 -0
  149. package/packaging/conformance/fixtures/stop-goal-fit--block-bundle-disputed-claim.json +29 -0
  150. package/packaging/conformance/fixtures/stop-goal-fit--block-capture-contradicts-claimed-pass.json +30 -0
  151. package/packaging/conformance/fixtures/stop-goal-fit--block-mode.json +23 -0
  152. package/packaging/conformance/fixtures/stop-goal-fit--off-mode.json +24 -0
  153. package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +5 -2
  154. package/packaging/conformance/fixtures/stop-goal-fit--warn-no-bundle.json +23 -0
  155. package/packaging/conformance/fixtures/workflow-steering--reground-active-prompt.json +30 -0
  156. package/packaging/conformance/fixtures/workflow-steering--reground-session-start.json +30 -0
  157. package/packaging/conformance/run-conformance.js +1 -1
  158. package/scripts/README.md +2 -1
  159. package/scripts/build-universal-bundles.js +0 -1
  160. package/scripts/ci/mint-attestation.js +221 -0
  161. package/scripts/ci/trust-reconcile.js +545 -0
  162. package/scripts/hooks/config-protection.js +423 -1
  163. package/scripts/hooks/evidence-capture.js +348 -0
  164. package/scripts/hooks/lib/liveness-read.js +113 -0
  165. package/scripts/hooks/run-hook.js +6 -1
  166. package/scripts/hooks/stop-goal-fit.js +1524 -79
  167. package/scripts/hooks/workflow-steering.js +135 -5
  168. package/scripts/install-codex-home.sh +39 -0
  169. package/scripts/install-merge.js +330 -0
  170. package/scripts/repair-command-log.js +115 -0
  171. package/src/cli/init.ts +218 -20
  172. package/src/cli/validate-workflow-artifacts.ts +18 -2
  173. package/src/cli/verify.ts +100 -0
  174. package/src/cli/workflow-sidecar.ts +2127 -84
  175. package/src/cli.ts +2 -3
  176. package/src/lib/flow-resolver.ts +369 -0
  177. package/src/tools/build-universal-bundles.ts +34 -21
  178. package/src/tools/generate-context-map.ts +3 -17
  179. package/src/tools/validate-source-tree.ts +44 -104
  180. package/build/src/tools/filter-installed-packs.d.ts +0 -2
  181. package/build/src/tools/filter-installed-packs.js +0 -135
  182. package/packaging/packs.json +0 -49
  183. package/scripts/filter-installed-packs.js +0 -2
  184. package/src/tools/filter-installed-packs.ts +0 -132
@@ -4,9 +4,22 @@
4
4
  *
5
5
  * The hook reads .flow-agents artifacts, looks for the most recent active
6
6
  * delivery/session file, and reports missing Definition Of Done, Goal Fit, or
7
- * Final Acceptance state. It is warning-only by default. Set
8
- * FLOW_AGENTS_GOAL_FIT_STRICT=true to return exit code 2 when local goal fit is
9
- * incomplete.
7
+ * Final Acceptance state.
8
+ *
9
+ * Enforcement is controlled by FLOW_AGENTS_GOAL_FIT_MODE:
10
+ * - block: return exit code 2 (blocks the Stop) when local goal fit is incomplete.
11
+ * - warn: return exit code 0 but still emit the guidance on stderr (default).
12
+ * - off: stay silent.
13
+ * The legacy FLOW_AGENTS_GOAL_FIT_STRICT=true is honored as an alias for block.
14
+ * The canonical engine default is warn; shipped runtime configs (e.g. Claude
15
+ * Code at L2) set block so the installed product enforces while the engine
16
+ * default and conformance contract stay warn.
17
+ *
18
+ * Scope: the gate evaluates the session's current task (.flow-agents/current.json)
19
+ * when set, so an unrelated active workflow elsewhere in the repo does not gate
20
+ * this stop. It also never hard-blocks a pre-execution (not-yet-started) task on
21
+ * mere incompleteness — only genuine false-completion signals (a claimed pass the
22
+ * capture log or evidence.json contradicts) block before execution begins.
10
23
  */
11
24
 
12
25
  'use strict';
@@ -14,6 +27,7 @@
14
27
  const fs = require('fs');
15
28
  const path = require('path');
16
29
  const { spawnSync } = require('child_process');
30
+ const crypto = require('crypto');
17
31
 
18
32
  const MAX_STDIN = 1024 * 1024;
19
33
  const ACTIVE_STATUSES = new Set([
@@ -29,9 +43,23 @@ const ACTIVE_STATUSES = new Set([
29
43
  'blocked',
30
44
  'partial',
31
45
  ]);
32
- const DELIVERY_TYPES = new Set(['deliver', 'delivery', 'fix-bug', 'execute-plan', 'verify-work']);
33
- const SIDECAR_NAMES = new Set(['state.json', 'acceptance.json', 'evidence.json', 'handoff.json']);
34
- const OPTIONAL_SIDECAR_NAMES = new Set(['critique.json']);
46
+ // WORKFLOW_SESSION_TYPES: used for artifact identification only, not for verdict production.
47
+ const WORKFLOW_SESSION_TYPES = new Set(['deliver', 'delivery', 'fix-bug', 'execute-plan', 'verify-work']);
48
+ // Phase 4c: bundle-only. Required set = {state.json, handoff.json, trust.bundle}. Drop evidence.json/acceptance.json/critique.json.
49
+ const SIDECAR_NAMES = new Set(['state.json', 'handoff.json', 'trust.bundle']);
50
+ const OPTIONAL_SIDECAR_NAMES = new Set();
51
+
52
+ // A workflow that has not started execution is EXPECTED to be incomplete, so the
53
+ // Stop gate must not hard-block on its missing DOD / Goal Fit / not-done state.
54
+ // Only genuine false-completion signals block a pre-execution task; execution
55
+ // onward gates fully.
56
+ const PRE_EXECUTION_STATUSES = new Set(['new', 'planning', 'planned', 'backlog']);
57
+ const PRE_EXECUTION_PHASES = new Set(['idea', 'backlog', 'pickup', 'planning']);
58
+
59
+ // Terminal tasks are complete — they must never gate a stop or count as "active".
60
+ // A stale current.json pointing at one, or a graveyard of finished states, must
61
+ // not block an unrelated session.
62
+ const TERMINAL_STATUSES = new Set(['done', 'delivered', 'accepted', 'archived', 'complete', 'completed']);
35
63
 
36
64
  function parseJson(raw) {
37
65
  try { return JSON.parse(raw || '{}'); } catch { return {}; }
@@ -110,7 +138,23 @@ function sidecarValidation(root, artifactDir) {
110
138
  if (requireSidecars || requireCritique) {
111
139
  const present = new Set(sidecarFiles.map(file => path.basename(file)));
112
140
  const requiredNames = new Set(requireSidecars ? SIDECAR_NAMES : []);
113
- if (requireCritique) requiredNames.add('critique.json');
141
+ // Phase 4c: critique.json is no longer written; trust.bundle carries critique claims.
142
+ // FLOW_AGENTS_REQUIRE_CRITIQUE is satisfied by:
143
+ // - critique.json (legacy, may not be in SIDECAR_NAMES but may still be on disk), OR
144
+ // - trust.bundle that contains at least one workflow.critique.review claim.
145
+ if (requireCritique) {
146
+ // Check disk directly (critique.json is no longer in SIDECAR_NAMES so may not be in present)
147
+ const hasCritiqueJson = fs.existsSync(path.join(artifactDir, 'critique.json'));
148
+ const bundleFile = path.join(artifactDir, 'trust.bundle');
149
+ let hasBundleCritique = false;
150
+ if (fs.existsSync(bundleFile)) {
151
+ try {
152
+ const b = JSON.parse(fs.readFileSync(bundleFile, 'utf8'));
153
+ hasBundleCritique = Array.isArray(b.claims) && b.claims.some(c => c && c.claimType === 'workflow.critique.review');
154
+ } catch { /* fall through — no bundle critique */ }
155
+ }
156
+ if (!hasCritiqueJson && !hasBundleCritique) requiredNames.add('critique.json');
157
+ }
114
158
  const missing = [...requiredNames].filter(name => !present.has(name)).sort();
115
159
  if (missing.length > 0) {
116
160
  return missing.map(name => `${relative(root, path.join(artifactDir, name))} sidecar validation: required sidecar is missing`);
@@ -186,7 +230,7 @@ function isWorkflowArtifact(artifact) {
186
230
  if (!artifact) return false;
187
231
  if (artifact.role === 'plan' || artifact.role === 'review') return false;
188
232
  if (artifact.file.endsWith('-plan.md') || artifact.file.endsWith('-review.md')) return false;
189
- if (DELIVERY_TYPES.has(artifact.type)) return true;
233
+ if (WORKFLOW_SESSION_TYPES.has(artifact.type)) return true;
190
234
  return /--(deliver|fix-bug|execute-plan|verify-work)\b/.test(path.basename(artifact.file));
191
235
  }
192
236
 
@@ -219,6 +263,44 @@ function readJsonFile(file) {
219
263
  }
220
264
  }
221
265
 
266
+ // ─── ADR 0010 Phase 2b: re-derive-at-gate via Surface (fail-open) ─────────────
267
+ // Surface (@kontourai/surface) is ESM-only; stop-goal-fit.js is CJS.
268
+ // Load it via a fail-open dynamic import(), cached after the first attempt.
269
+ // If Surface cannot be loaded (package absent, env mismatch), we fall back to
270
+ // the stored claim.status check from #133 — no regression for environments that
271
+ // lack @kontourai/surface. The module is never written to disk.
272
+ let _surfaceModule; // undefined = not tried yet; null = unavailable
273
+ async function tryLoadSurface() {
274
+ if (_surfaceModule !== undefined) return _surfaceModule;
275
+ try {
276
+ const m = await import('@kontourai/surface');
277
+ _surfaceModule = m;
278
+ return _surfaceModule;
279
+ } catch {
280
+ _surfaceModule = null;
281
+ return null;
282
+ }
283
+ }
284
+
285
+ // ─── ADR 0016 Abstraction A P-c: flow-resolver integration ────────────────────
286
+ // Load the compiled flow-resolver (build/src/lib/flow-resolver.js) via CJS
287
+ // require behind the same hasBuild guard used for the validator. Fail-open:
288
+ // returns null when build/ is absent, require throws, or current.json has no
289
+ // active_flow_id / active_step_id. The caller (bundleEnforcement, sidecarGuidance)
290
+ // treats null as "no active FlowDefinition" and falls back to the workflow.* path.
291
+ function loadActiveFlowStep(flowAgentsDir) {
292
+ const packageRoot = path.resolve(__dirname, '..', '..');
293
+ const builtResolver = path.join(packageRoot, 'build', 'src', 'lib', 'flow-resolver.js');
294
+ if (!fs.existsSync(builtResolver)) return null; // hasBuild guard: no build/ yet
295
+ try {
296
+ const resolver = require(builtResolver);
297
+ if (typeof resolver.resolveActiveFlowStep !== 'function') return null;
298
+ return resolver.resolveActiveFlowStep(flowAgentsDir);
299
+ } catch {
300
+ return null; // require failed or resolver threw — fail-open
301
+ }
302
+ }
303
+
222
304
  function safeOneLine(value, maxLength = 220) {
223
305
  const text = String(value || '').replace(/\s+/g, ' ').trim();
224
306
  if (text.length <= maxLength) return text;
@@ -229,19 +311,225 @@ function normalizedStatus(value) {
229
311
  return safeOneLine(value, 80).toLowerCase();
230
312
  }
231
313
 
232
- function sidecarGuidance(root, artifactDir) {
314
+ // ─── ADR 0010 Phase 4b: bundle-first helpers for consumer migration ────────────
315
+ // These helpers extract evidence/critique/acceptance data from the trust.bundle
316
+ // when it is present, falling back to the bespoke sidecar for bundle-less sessions.
317
+ // The sidecar content is IDENTICAL to the bundle projection (Phase 4a guarantee),
318
+ // so consumer reads produce identical verdicts.
319
+
320
+ /**
321
+ * Extract the effective "verdict" from trust.bundle workflow.check.* claims,
322
+ * or from declared claimTypes when a FlowDefinition is active (P-c extension).
323
+ * Priority of non-pass statuses: fail > not_verified > partial > pass.
324
+ * Returns null when the bundle has no matching claims.
325
+ *
326
+ * @param {Array} claims - trust.bundle claims array
327
+ * @param {Set<string>|null} [declaredClaimTypes] - optional set of declared claimTypes from gateExpects[]
328
+ */
329
+ function bundleEvidenceVerdict(claims, declaredClaimTypes) {
330
+ const checkClaims = claims.filter(c => {
331
+ if (!c || typeof c.claimType !== 'string') return false;
332
+ if (c.claimType.startsWith('workflow.check.')) return true;
333
+ return declaredClaimTypes != null && declaredClaimTypes.has(c.claimType);
334
+ });
335
+ if (checkClaims.length === 0) return null;
336
+ let worst = 'pass';
337
+ const PRIORITY = { fail: 4, failed: 4, not_verified: 3, 'not-verified': 3, partial: 2, pass: 1, skip: 0 };
338
+ for (const c of checkClaims) {
339
+ const v = normalizedStatus(c.value || 'pass');
340
+ if ((PRIORITY[v] || 0) > (PRIORITY[worst] || 0)) worst = v;
341
+ }
342
+ return worst;
343
+ }
344
+
345
+ /**
346
+ * Extract the check ID from a claim's subjectId (format: "${slug}/${checkId}").
347
+ * Returns the part after the first slash, or the full subjectId if no slash.
348
+ */
349
+ function claimCheckId(subjectId) {
350
+ const s = String(subjectId || '');
351
+ const slash = s.indexOf('/');
352
+ return slash >= 0 ? s.slice(slash + 1) : s;
353
+ }
354
+
355
+ /**
356
+ * Build the list of blocking check-claims from trust.bundle (equivalent to
357
+ * evidence.json.checks filtered to non-pass status).
358
+ * Returns objects shaped like { id, status, summary } (summary from fieldOrBehavior).
359
+ *
360
+ * @param {Array} claims - trust.bundle claims array
361
+ * @param {Set<string>|null} [declaredClaimTypes] - optional set of declared claimTypes from gateExpects[]
362
+ */
363
+ function bundleBlockingChecks(claims, declaredClaimTypes) {
364
+ return claims.filter(c => {
365
+ if (!c || typeof c.claimType !== 'string') return false;
366
+ const typeMatch = c.claimType.startsWith('workflow.check.')
367
+ || (declaredClaimTypes != null && declaredClaimTypes.has(c.claimType));
368
+ if (!typeMatch) return false;
369
+ const v = normalizedStatus(c.value || '');
370
+ return v === 'fail' || v === 'failed' || v === 'not_verified' || v === 'not-verified';
371
+ }).map(c => ({
372
+ id: claimCheckId(c.subjectId),
373
+ status: c.value || 'unknown',
374
+ summary: c.fieldOrBehavior || '',
375
+ }));
376
+ }
377
+
378
+ /**
379
+ * Determine critique status from trust.bundle workflow.critique.review claims,
380
+ * or from declared claimTypes when a FlowDefinition is active (P-c extension).
381
+ * Returns the "worst" value among critique claims, or null when none present.
382
+ *
383
+ * @param {Array} claims - trust.bundle claims array
384
+ * @param {Set<string>|null} [declaredClaimTypes] - optional set of declared claimTypes from gateExpects[]
385
+ */
386
+ function bundleCritiqueStatus(claims, declaredClaimTypes) {
387
+ const critiqueClaims = claims.filter(c => {
388
+ if (!c || typeof c.claimType !== 'string') return false;
389
+ if (c.claimType === 'workflow.critique.review') return true;
390
+ return declaredClaimTypes != null && declaredClaimTypes.has(c.claimType);
391
+ });
392
+ if (critiqueClaims.length === 0) return null;
393
+ // A disputed or failed critique is blocking
394
+ for (const c of critiqueClaims) {
395
+ const v = normalizedStatus(c.value || '');
396
+ if (v === 'fail' || v === 'failed' || c.status === 'disputed' || c.status === 'rejected') return c.value || 'fail';
397
+ }
398
+ return 'pass';
399
+ }
400
+
401
+ /**
402
+ * Build the list of claimed-pass command checks from the trust.bundle's evidence[]
403
+ * (items with execution.label) and from workflow.check.command claims whose effective
404
+ * value is "pass" (never-captured claimed pass). Falls back to an empty list when
405
+ * the bundle has no evidence items.
406
+ *
407
+ * Returns objects shaped like { id, kind, status, command } — same shape as
408
+ * evidence.json.checks — so captureCrossReference's body logic is unchanged.
409
+ *
410
+ * @param {object} bundle - trust.bundle object
411
+ * @param {Set<string>|null} [declaredClaimTypes] - optional set of declared claimTypes from gateExpects[]
412
+ */
413
+ function bundleClaimedPassCommandChecks(bundle, declaredClaimTypes) {
414
+ const allEvidence = Array.isArray(bundle.evidence) ? bundle.evidence : [];
415
+ const allClaims = Array.isArray(bundle.claims) ? bundle.claims : [];
416
+
417
+ // Build a map from claimId -> claim for fast lookup
418
+ const claimById = new Map();
419
+ for (const c of allClaims) {
420
+ if (c && c.id) claimById.set(c.id, c);
421
+ }
422
+
423
+ const checks = [];
424
+ const seen = new Set();
425
+
426
+ // (A) Evidence items with execution.label (command captures).
427
+ // These represent commands that actually ran — include them regardless of
428
+ // effective status so we can cross-reference against the live log.
429
+ for (const ev of allEvidence) {
430
+ if (!ev || !ev.execution || !ev.execution.label) continue;
431
+ const cmd = String(ev.execution.label || '').replace(/\s+/g, ' ').trim();
432
+ if (!cmd) continue;
433
+ const claim = claimById.get(ev.claimId);
434
+ if (!claim) continue;
435
+ const claimTypeStr = String(claim.claimType || '');
436
+ if (!claimTypeStr.startsWith('workflow.check.') && !(declaredClaimTypes != null && declaredClaimTypes.has(claimTypeStr))) continue;
437
+ // Deduplicate by command
438
+ if (seen.has(cmd)) continue;
439
+ seen.add(cmd);
440
+ const id = claimCheckId(claim.subjectId);
441
+ // Use 'pass' as the nominal claimed status; cross-reference catches contradictions.
442
+ checks.push({ id, kind: 'command', status: 'pass', command: cmd });
443
+ }
444
+
445
+ // (B) Workflow.check.command claims with effective value "pass" but no capture
446
+ // (no evidence item with execution) — these are originally-claimed-pass checks
447
+ // that were never captured.
448
+ for (const c of allClaims) {
449
+ if (!c || typeof c.claimType !== 'string') continue;
450
+ const isCommandType = c.claimType === 'workflow.check.command'
451
+ || (declaredClaimTypes != null && declaredClaimTypes.has(c.claimType));
452
+ if (!isCommandType) continue;
453
+ if (normalizedStatus(c.value || '') !== 'pass') continue;
454
+ // Check if this claim already has a capture evidence item (covered in (A))
455
+ const hasCapture = allEvidence.some(ev => ev && ev.claimId === c.id && ev.execution && ev.execution.label);
456
+ if (hasCapture) continue;
457
+ // No capture — use fieldOrBehavior as command identifier for backstop resolution.
458
+ const evItem = allEvidence.find(ev => ev && ev.claimId === c.id);
459
+ const cmd = evItem
460
+ ? normalizeCommand(evItem.excerptOrSummary || '')
461
+ : normalizeCommand(c.fieldOrBehavior || '');
462
+ const id = claimCheckId(c.subjectId);
463
+ if (!cmd) {
464
+ checks.push({ id, kind: 'command', status: 'pass', command: '' });
465
+ continue;
466
+ }
467
+ if (seen.has(cmd)) continue;
468
+ seen.add(cmd);
469
+ checks.push({ id, kind: 'command', status: 'pass', command: cmd });
470
+ }
471
+
472
+ return checks;
473
+ }
474
+
475
+ /**
476
+ * Extract pending acceptance criteria from trust.bundle workflow.acceptance.criterion claims,
477
+ * or from declared claimTypes when a FlowDefinition is active (P-c extension).
478
+ * Returns the count of claims whose value is pending/not_started/empty/unknown.
479
+ * Returns null when the bundle has no matching claims.
480
+ *
481
+ * @param {Array} claims - trust.bundle claims array
482
+ * @param {Set<string>|null} [declaredClaimTypes] - optional set of declared claimTypes from gateExpects[]
483
+ */
484
+ function bundlePendingCriteriaCount(claims, declaredClaimTypes) {
485
+ const criteriaClaims = claims.filter(c => {
486
+ if (!c || typeof c.claimType !== 'string') return false;
487
+ if (c.claimType === 'workflow.acceptance.criterion') return true;
488
+ return declaredClaimTypes != null && declaredClaimTypes.has(c.claimType);
489
+ });
490
+ if (criteriaClaims.length === 0) return null;
491
+ const pending = criteriaClaims.filter(c => {
492
+ const v = normalizedStatus(c.value || '');
493
+ return v === 'pending' || v === 'not_started' || v === '' || v === 'unknown';
494
+ });
495
+ return pending.length;
496
+ }
497
+
498
+ // ─────────────────────────────────────────────────────────────────────────────
499
+
500
+ /**
501
+ * ADR 0010 Phase 4b: sidecarGuidance — bundle-first evidence/critique reads.
502
+ * state.json reads are UNCHANGED (state.json stays as primary source).
503
+ * evidence.json verdict/checks: read from trust.bundle when present, fall back
504
+ * to evidence.json for bundle-less sessions (no regression).
505
+ * not_verified_gaps: always from evidence.json (no bundle equivalent).
506
+ * critique status: read from trust.bundle when present, fall back to critique.json.
507
+ * Finding details: still from critique.json when present (both bundle and sidecar paths).
508
+ *
509
+ * ADR 0016 P-c: when activeFlowStep is non-null, pass its declared claimTypes to
510
+ * bundle helpers so declared-type claims (e.g. builder.verify.tests) produce the
511
+ * same sidecar guidance signals as workflow.* claims.
512
+ */
513
+ function sidecarGuidance(root, artifactDir, activeFlowStep) {
514
+ // Build the declared claimType set from the FlowDefinition gate expects[] (P-c).
515
+ // Null when no FlowDefinition is active (fallback: helpers use workflow.* prefix only).
516
+ const declaredClaimTypes = activeFlowStep && Array.isArray(activeFlowStep.gateExpects)
517
+ ? new Set(activeFlowStep.gateExpects.map(e => e && e.bundle_claim && e.bundle_claim.claimType).filter(Boolean))
518
+ : null;
233
519
  const warnings = [];
234
520
  const state = readJsonFile(path.join(artifactDir, 'state.json'));
235
- const evidence = readJsonFile(path.join(artifactDir, 'evidence.json'));
236
- const critique = readJsonFile(path.join(artifactDir, 'critique.json'));
237
521
  const base = relative(root, artifactDir);
238
522
 
239
523
  if (state) {
240
524
  const status = normalizedStatus(state.status || 'unknown');
241
525
  const phase = normalizedStatus(state.phase || 'unknown');
242
526
  const next = state.next_action && typeof state.next_action === 'object' ? state.next_action : null;
243
- if (!['done', 'delivered', 'archived', 'accepted', 'complete', 'completed'].includes(status)) {
244
- const nextStatus = next ? normalizedStatus(next.status || 'unknown') : 'unknown';
527
+ const nextStatus = next ? normalizedStatus(next.status || 'unknown') : 'unknown';
528
+ // The agent's work is complete when the recorded next action is done — the
529
+ // gate must not block the agent for a remaining human/CI step (e.g. a verified
530
+ // task whose only next_action is "commit the migration").
531
+ const agentComplete = nextStatus === 'done';
532
+ if (!TERMINAL_STATUSES.has(status) && !agentComplete) {
245
533
  const nextSummary = next && next.summary ? `; next_action:${nextStatus} "${safeOneLine(next.summary)}"` : '';
246
534
  warnings.push(`${base} workflow state: status:${status} phase:${phase}${nextSummary}`);
247
535
  }
@@ -252,54 +540,1074 @@ function sidecarGuidance(root, artifactDir) {
252
540
  warnings.push(`${base} next action: ${safeOneLine(next.summary)}${next.target_phase ? ` (target phase: ${safeOneLine(next.target_phase, 80)})` : ''}`);
253
541
  }
254
542
 
255
- if (evidence && normalizedStatus(evidence.verdict) && normalizedStatus(evidence.verdict) !== 'pass') {
256
- warnings.push(`${base} evidence verdict:${safeOneLine(evidence.verdict, 40)}; do not deliver without accepted gap or new evidence.`);
543
+ // ── Evidence verdict + checks: bundle-first, fallback to evidence.json ────
544
+ const bundle = readJsonFile(path.join(artifactDir, 'trust.bundle'));
545
+ const bundleClaims = bundle && Array.isArray(bundle.claims) ? bundle.claims : null;
546
+
547
+ if (bundleClaims) {
548
+ // Phase 4b: read verdict and per-check signals from trust.bundle claims.
549
+ // P-c: pass declaredClaimTypes so declared-type claims are included alongside workflow.*.
550
+ const verdict = bundleEvidenceVerdict(bundleClaims, declaredClaimTypes);
551
+ if (verdict && verdict !== 'pass' && verdict !== 'skip') {
552
+ warnings.push(`${base} evidence verdict:${safeOneLine(verdict, 40)}; do not deliver without accepted gap or new evidence.`);
553
+ }
554
+ const blockingChecks = bundleBlockingChecks(bundleClaims, declaredClaimTypes);
555
+ for (const check of blockingChecks.slice(0, 4)) {
556
+ const status = safeOneLine(check.status || 'unknown', 40);
557
+ warnings.push(`${base} evidence check ${safeOneLine(check.id || 'unknown', 80)} status:${status}: ${safeOneLine(check.summary)}`);
558
+ }
559
+ } else {
560
+ // Fallback: no bundle — read from evidence.json (existing behavior, no regression).
561
+ const evidence = readJsonFile(path.join(artifactDir, 'evidence.json'));
562
+ if (evidence && normalizedStatus(evidence.verdict) && normalizedStatus(evidence.verdict) !== 'pass') {
563
+ warnings.push(`${base} evidence verdict:${safeOneLine(evidence.verdict, 40)}; do not deliver without accepted gap or new evidence.`);
564
+ }
565
+ if (evidence && Array.isArray(evidence.checks)) {
566
+ const blockingChecks = evidence.checks.filter(check => {
567
+ const status = normalizedStatus(check && check.status);
568
+ return status === 'fail' || status === 'failed' || status === 'not_verified' || status === 'not-verified';
569
+ });
570
+ for (const check of blockingChecks.slice(0, 4)) {
571
+ const status = safeOneLine(check.status || 'unknown', 40);
572
+ warnings.push(`${base} evidence check ${safeOneLine(check.id || 'unknown', 80)} status:${status}: ${safeOneLine(check.summary)}`);
573
+ }
574
+ }
257
575
  }
576
+
577
+ // not_verified_gaps: always from evidence.json (no bundle equivalent).
578
+ const evidence = readJsonFile(path.join(artifactDir, 'evidence.json'));
258
579
  if (evidence && Array.isArray(evidence.not_verified_gaps) && evidence.not_verified_gaps.length > 0) {
259
580
  for (const gap of evidence.not_verified_gaps.slice(0, 3)) {
260
581
  warnings.push(`${base} evidence NOT_VERIFIED gap: ${safeOneLine(gap)}`);
261
582
  }
262
583
  }
263
- if (evidence && Array.isArray(evidence.checks)) {
264
- const blockingChecks = evidence.checks.filter(check => {
265
- const status = normalizedStatus(check && check.status);
266
- return status === 'fail' || status === 'failed' || status === 'not_verified' || status === 'not-verified';
584
+
585
+ // ── Critique: bundle-first status, critique.json for finding details ──────
586
+ const critique = readJsonFile(path.join(artifactDir, 'critique.json'));
587
+
588
+ if (bundleClaims) {
589
+ // Phase 4b: read critique status from trust.bundle claims.
590
+ // P-c: pass declaredClaimTypes so declared-type critique claims are included.
591
+ const critiqueStatusVal = bundleCritiqueStatus(bundleClaims, declaredClaimTypes);
592
+ const critiqueIsBlocking = critiqueStatusVal !== null && normalizedStatus(critiqueStatusVal) !== 'pass';
593
+ if (critiqueIsBlocking) {
594
+ warnings.push(`${base} critique status:${safeOneLine(critiqueStatusVal || 'unknown', 40)}; required critique must pass or findings be accepted.`);
595
+ // Finding details: still from critique.json when present (both paths use the same details source).
596
+ const critiques = critique && Array.isArray(critique.critiques) ? critique.critiques : [];
597
+ let openCount = 0;
598
+ for (const review of critiques) {
599
+ const findings = Array.isArray(review && review.findings) ? review.findings : [];
600
+ for (const finding of findings) {
601
+ if (!finding || normalizedStatus(finding.status) !== 'open') continue;
602
+ warnings.push(`${base} critique open ${safeOneLine(finding.severity || 'unknown', 40)}: ${safeOneLine(finding.description)}`);
603
+ openCount += 1;
604
+ if (openCount >= 3) break;
605
+ }
606
+ if (openCount >= 3) break;
607
+ }
608
+ }
609
+ } else {
610
+ // Fallback: no bundle — read from critique.json (existing behavior, no regression).
611
+ if (critique && critique.required === true && normalizedStatus(critique.status) !== 'pass') {
612
+ warnings.push(`${base} critique status:${safeOneLine(critique.status || 'unknown', 40)}; required critique must pass or findings be accepted.`);
613
+ const critiques = Array.isArray(critique.critiques) ? critique.critiques : [];
614
+ let openCount = 0;
615
+ for (const review of critiques) {
616
+ const findings = Array.isArray(review && review.findings) ? review.findings : [];
617
+ for (const finding of findings) {
618
+ if (!finding || normalizedStatus(finding.status) !== 'open') continue;
619
+ warnings.push(`${base} critique open ${safeOneLine(finding.severity || 'unknown', 40)}: ${safeOneLine(finding.description)}`);
620
+ openCount += 1;
621
+ if (openCount >= 3) break;
622
+ }
623
+ if (openCount >= 3) break;
624
+ }
625
+ }
626
+ }
627
+
628
+ return warnings;
629
+ }
630
+
631
+ // -----------------------------------------------------------------------
632
+ // Capture-first evidence determinism (Part B)
633
+ //
634
+ // The trust.bundle (emitted by workflow-sidecar via @kontourai/surface) carries
635
+ // capture-authoritative evidence items. The capture hook (evidence-capture.js)
636
+ // writes REAL command results to command-log.jsonl at the source. Here at the
637
+ // Stop gate we cross-reference claimed-pass command checks against that captured
638
+ // truth, and only fall back to re-running a TRUSTED command when the log has no
639
+ // execution for a claimed-pass command (i.e. it was never actually run).
640
+ //
641
+ // ADR 0010 Phase 4b: source the claimed-pass command checks from the bundle's
642
+ // evidence[] (execution/command items) instead of evidence.json checks.
643
+ // command-log.jsonl path UNCHANGED — it stays the capture truth source.
644
+ // -----------------------------------------------------------------------
645
+
646
/**
 * Canonicalize a command string so two spellings of the same command compare equal.
 * Falsy input becomes '', every run of whitespace collapses to one space, and
 * leading/trailing whitespace is removed.
 *
 * @param {*} value - Raw command (any value; coerced with String()).
 * @returns {string} The normalized command text.
 */
function normalizeCommand(value) {
  const text = String(value || '');
  const collapsed = text.replace(/\s+/g, ' ');
  return collapsed.trim();
}
649
+
650
/**
 * Load <artifactDir>/command-log.jsonl and aggregate it per normalized command.
 *
 * Failure is sticky: once any run of a command is recorded as failed, the
 * aggregate stays failed even if a later run passed. The exit code kept is the
 * most recent integer exit code seen for that command (null if none).
 *
 * Unreadable files yield an empty map; blank lines, unparseable lines and
 * entries without a string `command` are skipped.
 *
 * @param {string} artifactDir - Directory containing command-log.jsonl.
 * @returns {Map<string, {ran: boolean, failed: boolean, exitCode: (number|null)}>}
 */
function readCommandLog(artifactDir) {
  const outcomes = new Map();
  let contents;
  try {
    contents = fs.readFileSync(path.join(artifactDir, 'command-log.jsonl'), 'utf8');
  } catch {
    return outcomes;
  }

  for (const rawLine of contents.split('\n')) {
    const text = rawLine.trim();
    if (text === '') continue;

    let record;
    try {
      record = JSON.parse(text);
    } catch {
      continue;
    }
    if (!record || typeof record.command !== 'string') continue;

    const command = normalizeCommand(record.command);
    if (!command) continue;

    const hasExitCode = Number.isInteger(record.exitCode);
    const failedNow = record.observedResult === 'fail' || (hasExitCode && record.exitCode !== 0);
    const earlier = outcomes.get(command);

    outcomes.set(command, {
      ran: true,
      // A single recorded failure is never masked by a later passing run.
      failed: failedNow || (earlier ? earlier.failed : false),
      exitCode: hasExitCode ? record.exitCode : (earlier ? earlier.exitCode : null),
    });
  }
  return outcomes;
}
678
+
679
/**
 * Load <artifactDir>/command-log.jsonl into a map of normalized command ->
 * outcome of its LAST logged run.
 *
 * Unlike readCommandLog, nothing is sticky: each entry overwrites the previous
 * one for the same command, so a later passing run replaces an earlier failure.
 *
 * Unreadable files yield an empty map; blank lines, unparseable lines and
 * entries without a string `command` are skipped.
 *
 * @param {string} artifactDir - Directory containing command-log.jsonl.
 * @returns {Map<string, {ran: boolean, failed: boolean, exitCode: (number|null)}>}
 */
function readLatestCommandLog(artifactDir) {
  const logPath = path.join(artifactDir, 'command-log.jsonl');
  let text;
  try {
    text = fs.readFileSync(logPath, 'utf8');
  } catch {
    return new Map();
  }

  const latest = new Map();
  text.split('\n').forEach((rawLine) => {
    const candidate = rawLine.trim();
    if (candidate.length === 0) return;

    let record;
    try {
      record = JSON.parse(candidate);
    } catch {
      return;
    }
    if (!record || typeof record.command !== 'string') return;

    const command = normalizeCommand(record.command);
    if (!command) return;

    const exitCode = Number.isInteger(record.exitCode) ? record.exitCode : null;
    // Last entry wins: this overwrites whatever an earlier run recorded.
    latest.set(command, {
      ran: true,
      failed: record.observedResult === 'fail' || (exitCode !== null && exitCode !== 0),
      exitCode,
    });
  });
  return latest;
}
709
+
710
+ // ─── Claim-status helpers for capturedFailReconciliation ─────────────────────
711
+
712
/**
 * Decide whether a claim's stored status AND value together assert that the
 * command passed.
 *
 * The status is lower-cased (not trimmed); the value is lower-cased, has its
 * whitespace collapsed and is trimmed. Both must be in their accepted lists.
 *
 * @param {*} status - Claim status (coerced with String()).
 * @param {*} value - Claim value (coerced with String()).
 * @returns {boolean} True only when both status and value read as a pass.
 */
function claimAssertsPass(status, value) {
  const statusKey = String(status || '').toLowerCase();
  const valueKey = String(value || '').toLowerCase().replace(/\s+/g, ' ').trim();

  // Includes the 'approved' status alias and the 'true' / 'ok' value aliases.
  const passStatuses = ['verified', 'assumed', 'accepted', 'trusted', 'approved'];
  const passValues = ['pass', 'passed', 'verified', 'true', 'ok'];

  return passStatuses.includes(statusKey) && passValues.includes(valueKey);
}
723
+
724
/**
 * Decide whether a claim's stored status OR value acknowledges a failure
 * (the agent owned the failure rather than claiming a pass).
 *
 * The status is lower-cased; the value is lower-cased, whitespace-collapsed and
 * trimmed. Either one matching its list is enough.
 *
 * The hyphenated spelling 'not-verified' is accepted for the value as well as
 * for the status, so both fields recognise the same two spellings.
 *
 * @param {*} status - Claim status (coerced with String()).
 * @param {*} value - Claim value (coerced with String()).
 * @returns {boolean} True when the claim reads as an acknowledged failure.
 */
function claimAcknowledgesFailure(status, value) {
  const s = String(status || '').toLowerCase();
  const v = String(value || '').toLowerCase().replace(/\s+/g, ' ').trim();

  const failureStatuses = ['disputed', 'rejected', 'failing', 'failed', 'not_verified', 'not-verified'];
  const failureValues = ['fail', 'failed', 'failing', 'not_verified', 'not-verified'];

  return failureStatuses.includes(s) || failureValues.includes(v);
}
735
+
736
/**
 * Report whether a command string contains an operator that can neutralize the
 * real exit code, so a "pass" captured for it cannot be taken at face value.
 *
 * Flagged forms:
 *   - any `||` (e.g. `npm test || exit 0`, `npm test || echo ok`);
 *   - a single pipe into true (`cmd | true`);
 *   - `;` or a newline followed by `true`, `:`, `exit 0`, or `/bin/true`.
 *
 * The original comment states the same patterns are used by
 * scripts/ci/trust-reconcile.js; keep the two in step when editing.
 *
 * @param {string} cmd - Command text to inspect.
 * @returns {boolean} True when any exit-code-masking pattern matches.
 */
function hasLaunderingOperator(cmd) {
  const maskingPatterns = [
    // `||` in any position: the fallback branch replaces the real exit code.
    /\|\|/,
    // `cmd | true`: the pipeline's status is that of `true`.
    /\|\s*true\b/,
    // Appended after the real command via `;` or a newline.
    /[;\n]\s*true\b/,
    /[;\n]\s*:\s*(?:$|\s|;)/,
    /[;\n]\s*exit\s+0\b/,
    /[;\n]\s*\/bin\/true\b/,
  ];
  return maskingPatterns.some((pattern) => pattern.test(cmd));
}
766
+
767
+ // ─── Hash-chain integrity verification (Increment B2, tamper-EVIDENCE) ────────
768
+ //
769
+ // command-log.jsonl is the independent capture truth source. Each chained entry
770
+ // carries a `_chain` block: {seq, prevHash, hash} where
771
+ // hash = sha256(prevHash + canonicalJson(record))
772
+ // and canonicalJson is the record WITHOUT `_chain`, keys sorted alphabetically.
773
+ //
774
+ // HONEST FRAMING: this is tamper-EVIDENCE, not tamper-PROOF. An agent that
775
+ // rewrites all hashes can still forge a valid chain. The real tamper-proof
776
+ // boundary is the signed checkpoint (B1). This raises the local bar and catches
777
+ // casual/accidental tampering and corruption.
778
+ //
779
+ // verifyCommandLogChain returns { status, brokenAt, forkAt }, where status is one of:
780
+ // "ok" — all entries have _chain and every hash/linkage is valid.
781
+ // "legacy" — no _chain fields at all (pre-B2 log); treat as today, no
782
+ // integrity claim. Backward-compat: existing fixtures stay green.
783
+ // "broken" — a chained entry has a bad hash or bad linkage; the capture
784
+ // truth source appears altered/removed/reordered. A fourth status, "forked", marks a benign concurrent-append race.
785
+ //
786
+ // The genesis prevHash is a fixed arbitrary sentinel — NOT the SHA256 of any
787
+ // specific input string. The comment in evidence-capture.js previously (and
788
+ // incorrectly) claimed it was sha256("flow-agents:command-log:genesis"); it is not.
789
+ // Writer (evidence-capture.js CHAIN_GENESIS) and verifier (CHAIN_GENESIS_VERIFY here)
790
+ // MUST use the same value. Do not change one without changing the other.
791
// Seeds the set of reachable hashes during chain verification: a chained entry's
// prevHash must equal this sentinel or the hash of an earlier accepted entry.
+ const CHAIN_GENESIS_VERIFY = 'a3f9e2b7d5c84f1e6a0d2c3b9f7e1a4d8c6b5f2e9a0d3c7b1f4e8a2d6c0b9f3';
792
+
793
/**
 * Serialize a log record in the canonical form used for chain hashing: the
 * `_chain` key is dropped and the remaining top-level keys are emitted in
 * sorted order. Only top-level keys are reordered; nested values are
 * serialized as-is.
 *
 * The original comment requires this to stay byte-identical to
 * canonicalJsonForChain() in evidence-capture.js.
 *
 * @param {Object} record - Parsed command-log entry.
 * @returns {string} Canonical JSON text of the record without `_chain`.
 */
function canonicalJsonForVerify(record) {
  const sortedKeys = Object.keys(record)
    .filter((key) => key !== '_chain')
    .sort();
  const canonical = sortedKeys.reduce((target, key) => {
    target[key] = record[key];
    return target;
  }, {});
  return JSON.stringify(canonical);
}
803
+
804
/**
 * Verify the hash chain stored in <artifactDir>/command-log.jsonl.
 *
 * Returns { status, brokenAt, forkAt }:
 *   status   "ok" | "legacy" | "broken" | "forked"
 *   brokenAt 0-based index (among parsed entries) of the first broken entry, else null
 *   forkAt   0-based index of the first concurrent-fork sibling, else null
 *
 * "legacy" is returned when the file cannot be read, holds no parseable object
 * entries, or holds no entry with a string `_chain.hash`. Blank and unparseable
 * lines are skipped and do not take part in indexing.
 *
 * A chained entry is accepted when it is
 *   (a) self-consistent: hash === sha256(prevHash + canonicalJson(record)), and
 *   (b) reachable: prevHash is the genesis sentinel or the hash of an earlier
 *       accepted entry.
 * An unchained entry appearing after a chained one is "broken"; unchained
 * entries before the first chained one are tolerated.
 *
 * "forked" means some parent hash was claimed by more than one entry and every
 * sibling on that parent has source 'postToolUse-capture'. A shared parent with
 * any other source is "broken". Self-consistency is checked before fork
 * classification, so an edited record is reported as "broken", not "forked".
 *
 * @param {string} artifactDir - Directory containing command-log.jsonl.
 * @returns {{status: string, brokenAt: (number|null), forkAt: (number|null)}}
 */
function verifyCommandLogChain(artifactDir) {
  const legacyResult = () => ({ status: 'legacy', brokenAt: null, forkAt: null });
  const brokenResult = (index) => ({ status: 'broken', brokenAt: index, forkAt: null });
  const isChained = (record) => Boolean(record._chain && typeof record._chain.hash === 'string');

  let contents;
  try {
    contents = fs.readFileSync(path.join(artifactDir, 'command-log.jsonl'), 'utf8');
  } catch {
    return legacyResult();
  }

  // Keep only lines that parse to a non-null object; everything else is skipped.
  const records = [];
  for (const line of contents.split('\n')) {
    if (!line.trim()) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue;
    }
    if (parsed && typeof parsed === 'object') records.push(parsed);
  }

  // No chained entry at all (including an empty log): nothing to verify.
  if (!records.some(isChained)) return legacyResult();

  // All accepted hashes stay reachable (not just the latest tip) so fork
  // siblings sharing a parent are tolerated while reorders/deletions are caught.
  const reachable = new Set([CHAIN_GENESIS_VERIFY]);
  const siblingSources = new Map(); // prevHash -> sources of entries claiming it
  let sawChained = false;
  let firstForkAt = null;

  for (const [index, record] of records.entries()) {
    if (!isChained(record)) {
      // An unchained entry after the chain has started is a gap in the chain.
      if (sawChained) return brokenResult(index);
      continue;
    }

    const { prevHash, hash } = record._chain;

    // (a) Self-consistency: a content edit without rehashing fails here.
    if (typeof prevHash !== 'string') return brokenResult(index);
    const expectedHash = crypto.createHash('sha256')
      .update(prevHash + canonicalJsonForVerify(record), 'utf8')
      .digest('hex');
    if (hash !== expectedHash) return brokenResult(index);

    // (b) Reachability: the parent must be genesis or an earlier accepted entry.
    if (!reachable.has(prevHash)) return brokenResult(index);

    // Fork detection: a parent claimed more than once is benign only when
    // every claimant is a PostToolUse capture.
    const sources = siblingSources.get(prevHash) || [];
    sources.push(record.source);
    siblingSources.set(prevHash, sources);
    if (sources.length > 1) {
      if (sources.some((source) => source !== 'postToolUse-capture')) return brokenResult(index);
      if (firstForkAt === null) firstForkAt = index;
    }

    reachable.add(hash);
    sawChained = true;
  }

  if (firstForkAt !== null) return { status: 'forked', brokenAt: null, forkAt: firstForkAt };
  return { status: 'ok', brokenAt: null, forkAt: null };
}
903
+ // ─────────────────────────────────────────────────────────────────────────────
904
+
905
/**
 * Pick a TRUSTED command to re-run for a claimed-pass check whose command was
 * never captured. Sources are tried in order, most trusted first:
 *   (a) the command authored on the matching acceptance criterion
 *       (see acceptanceCommandFor) — run through `bash -lc`;
 *   (b) a project-declared manifest target (see declaredManifestTarget);
 *   (c) the check's own free-form command — only when the environment variable
 *       FLOW_AGENTS_GOAL_FIT_RECHECK equals "true" (case-insensitive); also run
 *       through `bash -lc`.
 *
 * @param {string} root - Project root; used as the working directory.
 * @param {string} artifactDir - Accepted for interface compatibility; not read here.
 * @param {Object} check - The claimed-pass check ({ id, command, kind }).
 * @param {Object|null} acceptance - Parsed acceptance.json, if any.
 * @returns {{argv: string[], cwd: string, source: string}|null} Null when nothing trusted resolves.
 */
function resolveTrustedCommand(root, artifactDir, check, acceptance) {
  const viaShell = (command, source) => ({ argv: ['bash', '-lc', command], cwd: root, source });

  // (a) Command authored upfront on the acceptance criterion.
  const authored = acceptanceCommandFor(check, acceptance);
  if (authored) return viaShell(authored, 'acceptance');

  // (b) Target declared by the project's own manifest.
  const manifest = declaredManifestTarget(root, check);
  if (manifest) return { argv: manifest.argv, cwd: manifest.cwd || root, source: 'manifest' };

  // (c) Free-form model command: explicit opt-in only.
  const optedIn = String(process.env.FLOW_AGENTS_GOAL_FIT_RECHECK || '').toLowerCase() === 'true';
  if (!optedIn) return null;
  const modelCommand = normalizeCommand(check && check.command);
  return modelCommand ? viaShell(modelCommand, 'model-command (FLOW_AGENTS_GOAL_FIT_RECHECK)') : null;
}
932
+
933
/**
 * Find the command authored in acceptance.json that corresponds to a check.
 *
 * Scans every criterion's `evidence_refs` of kind "command" (command text taken
 * from `excerpt`, else `command`, then normalized). Returns the first ref whose
 * criterion id matches the check id, or whose command equals the check's
 * command. If nothing matches, the first authored command is returned only when
 * the check itself names no command; otherwise null.
 *
 * @param {Object} check - The check ({ id, command }).
 * @param {Object|null} acceptance - Parsed acceptance.json.
 * @returns {string|null} Normalized authored command, or null.
 */
function acceptanceCommandFor(check, acceptance) {
  const criteria = acceptance ? acceptance.criteria : null;
  if (!Array.isArray(criteria)) return null;

  const wantedId = normalizedStatus(check && check.id);
  const wantedCommand = normalizeCommand(check && check.command);
  let fallback = null;

  for (const criterion of criteria) {
    const refs = criterion && Array.isArray(criterion.evidence_refs) ? criterion.evidence_refs : [];
    for (const ref of refs) {
      const isCommandRef = ref && typeof ref === 'object' && ref.kind === 'command';
      if (!isCommandRef) continue;

      const authored = normalizeCommand(ref.excerpt || ref.command);
      if (!authored) continue;
      if (!fallback) fallback = authored;

      // Strong match: same criterion/check id, or identical command text.
      if (wantedId && normalizedStatus(criterion.id) === wantedId) return authored;
      if (wantedCommand && authored === wantedCommand) return authored;
    }
  }

  // Without a strong match, only a check that names no command of its own may
  // fall back to the first authored command.
  return wantedCommand ? null : fallback;
}
954
+
955
/**
 * Map a claimed-pass check to a target the project itself declares, never to
 * the check's raw command string. The check's command/id/kind only select the
 * CATEGORY (test | build | lint); the argv returned is built from fixed names.
 *
 * Lookup order once a category is chosen:
 *   1. package.json `scripts[category]` (for lint, `scripts.typecheck` as a fallback);
 *   2. a Makefile/makefile/GNUmakefile containing a `<category>:` rule;
 *   3. Cargo.toml -> `cargo test` / `cargo build`;
 *   4. tox.ini -> `tox`, then pyproject.toml -> `pytest` (test only);
 *   5. justfile -> `just <category>`, Taskfile.yml/.yaml -> `task <category>`.
 *
 * NOTE(review): in each category pattern the `\b` anchors bind only to the
 * first and last alternatives, so e.g. "spec" or "format" match anywhere inside
 * a longer word — confirm this is intended.
 * NOTE(review): step 5 only checks that the runner file exists, not that it
 * declares the category as a recipe/task — confirm this is acceptable.
 *
 * @param {string} root - Project root directory.
 * @param {Object} check - The check ({ command, id, kind }).
 * @returns {{argv: string[], cwd: string}|null} Null when no category or target resolves.
 */
function declaredManifestTarget(root, check) {
  const command = normalizeCommand(check && check.command);
  const id = normalizedStatus(check && check.id);
  const kind = normalizedStatus(check && check.kind);
  const haystack = `${command} ${id} ${kind}`.toLowerCase();

  // First matching category wins, in this order.
  const categories = [
    ['test', /\btest|spec|jest|vitest|pytest\b/],
    ['build', /\bbuild|compile|bundle\b/],
    ['lint', /\blint|format|style|typecheck\b/],
  ];
  const matched = categories.find(([, pattern]) => pattern.test(haystack));
  if (!matched) return null;
  const want = matched[0];

  const inRoot = (name) => path.join(root, name);

  // package.json scripts.{test,build,lint}
  const pkg = readJsonFile(inRoot('package.json'));
  if (pkg && pkg.scripts && typeof pkg.scripts === 'object') {
    let scriptName = null;
    if (pkg.scripts[want]) {
      scriptName = want;
    } else if (want === 'lint' && pkg.scripts.typecheck) {
      scriptName = 'typecheck';
    }
    if (scriptName) return { argv: ['npm', 'run', scriptName, '--silent'], cwd: root };
  }

  // Makefile target
  const makefile = ['Makefile', 'makefile', 'GNUmakefile']
    .map(inRoot)
    .find((candidate) => fs.existsSync(candidate));
  if (makefile) {
    try {
      const rules = fs.readFileSync(makefile, 'utf8');
      const declaresTarget = new RegExp(`^${want}\\s*:`, 'm').test(rules);
      if (declaresTarget) return { argv: ['make', want], cwd: root };
    } catch { /* ignore */ }
  }

  // cargo
  if ((want === 'test' || want === 'build') && fs.existsSync(inRoot('Cargo.toml'))) {
    return { argv: ['cargo', want], cwd: root };
  }

  // py ecosystem: tox / pyproject (declared test target)
  if (want === 'test') {
    if (fs.existsSync(inRoot('tox.ini'))) return { argv: ['tox'], cwd: root };
    if (fs.existsSync(inRoot('pyproject.toml'))) return { argv: ['pytest'], cwd: root };
  }

  // just / task runners
  const runners = [['just', 'justfile'], ['task', 'Taskfile.yml'], ['task', 'Taskfile.yaml']];
  for (const [binary, manifestName] of runners) {
    if (fs.existsSync(inRoot(manifestName))) return { argv: [binary, want], cwd: root };
  }
  return null;
}
999
+
1000
/**
 * Timeout, in milliseconds, for a backstop re-run. Taken from the environment
 * variable FLOW_AGENTS_GOAL_FIT_BACKSTOP_TIMEOUT_MS when it parses (base 10) to
 * a positive integer; otherwise 120000.
 *
 * @returns {number} Timeout in milliseconds.
 */
function resolveBackstopTimeout() {
  const fallbackMs = 120000;
  const configured = Number.parseInt(process.env.FLOW_AGENTS_GOAL_FIT_BACKSTOP_TIMEOUT_MS || '', 10);
  if (!Number.isInteger(configured)) return fallbackMs;
  return configured > 0 ? configured : fallbackMs;
}
1004
+
1005
/**
 * Resolve how the trusted backstop re-run behaves, from the environment
 * variable FLOW_AGENTS_GOAL_FIT_BACKSTOP (trimmed, case-insensitive):
 *   "off" or "warn" -> 'off'   (per the original note: re-run is warn-only)
 *   "skip"          -> 'skip'  (per the original note: no re-run; NOT_VERIFIED is recorded)
 *   "block", unset, or anything else -> 'block' (the default)
 *
 * @returns {'off'|'skip'|'block'} The effective backstop mode.
 */
function resolveBackstopMode() {
  const setting = String(process.env.FLOW_AGENTS_GOAL_FIT_BACKSTOP || '').trim().toLowerCase();
  switch (setting) {
    case 'off':
    case 'warn': // alias of "off"
      return 'off';
    case 'skip':
      return 'skip';
    default:
      return 'block';
  }
}
1016
+
1017
/**
 * Run a trusted backstop command synchronously and summarize the outcome.
 *
 * The command runs with stdin ignored, stdout/stderr piped, the timeout from
 * resolveBackstopTimeout(), and SIGKILL as the kill signal.
 *
 * Result shapes:
 *   { ran: true, passed, exitCode }        - the process exited on its own;
 *   { ran: false, error, timedOut? }       - spawn failed or the timeout fired
 *                                            (`error` is the error code, else its message);
 *   { ran: false, error, timedOut }        - the process ended on a signal.
 *
 * When the timeout fires, spawnSync reports it through `result.error` (code
 * ETIMEDOUT), so the error branch runs before the signal branch. `timedOut` is
 * therefore set there as well, so a real timeout is reported as one.
 *
 * @param {{argv: string[], cwd: string}} trusted - Command resolved by resolveTrustedCommand.
 * @returns {{ran: boolean, passed?: boolean, exitCode?: (number|null), error?: string, timedOut?: boolean}}
 */
function runBackstop(trusted) {
  const result = spawnSync(trusted.argv[0], trusted.argv.slice(1), {
    cwd: trusted.cwd,
    encoding: 'utf8',
    timeout: resolveBackstopTimeout(),
    killSignal: 'SIGKILL',
    stdio: ['ignore', 'pipe', 'pipe'],
  });

  if (result.error) {
    const failure = { ran: false, error: result.error.code || result.error.message };
    // Only added on a timeout, so the shape for other spawn errors is unchanged.
    if (result.error.code === 'ETIMEDOUT') failure.timedOut = true;
    return failure;
  }
  if (result.signal) {
    return {
      ran: false,
      error: `killed (${result.signal})`,
      timedOut: result.signal === 'SIGKILL' || result.signal === 'SIGTERM',
    };
  }
  return { ran: true, passed: result.status === 0, exitCode: result.status };
}
1029
+
1030
+ /**
1031
+ * ADR 0010 Phase 4b: captureCrossReference — bundle-first command check sourcing.
1032
+ * Sources the claimed-pass command checks from trust.bundle evidence[] (execution/
1033
+ * command items) when the bundle is present, falling back to evidence.json checks
1034
+ * for bundle-less sessions. command-log.jsonl UNCHANGED — it stays the capture
1035
+ * truth source. The teeth (claimed-pass + captured-fail → block) are byte-identical.
1036
+ *
1037
+ * ADR 0016 P-c (fix): accept activeFlowStep so declared-type sessions (e.g.
1038
+ * builder.verify.tests) are visible to the cross-reference, closing the hole
1039
+ * where captureCrossReference was the only capture consumer not threaded with
1040
+ * the FlowDefinition. Mirrors the pattern in bundleEnforcement / sidecarGuidance.
1041
+ */
1042
+ function captureCrossReference(root, artifactDir, activeFlowStep) {
1043
+ // Build the declared claimType set from the FlowDefinition gate expects[] (P-c).
1044
+ // Null when no FlowDefinition is active (fallback: bundleClaimedPassCommandChecks
1045
+ // uses workflow.check.* prefix only — no regression for non-FlowDefinition sessions).
1046
+ const declaredClaimTypes = activeFlowStep && Array.isArray(activeFlowStep.gateExpects)
1047
+ ? new Set(activeFlowStep.gateExpects.map(e => e && e.bundle_claim && e.bundle_claim.claimType).filter(Boolean))
1048
+ : null;
1049
+ const bundle = readJsonFile(path.join(artifactDir, 'trust.bundle'));
1050
+ const acceptance = readJsonFile(path.join(artifactDir, 'acceptance.json'));
1051
+ const log = readLatestCommandLog(artifactDir); // Fix C: latest-wins; genuine fix-then-rerun-to-pass clears the block
1052
+ const base = relative(root, artifactDir);
1053
+ const backstopMode = resolveBackstopMode();
1054
+ const warnings = [];
1055
+
1056
+ // AC3 fail-closed: detect a missing command log in a post-execution session.
1057
+ // When state.json confirms the session is past the planning phase (commands should
1058
+ // have run), but command-log.jsonl is absent, emit a FULL_BLOCK warning.
1059
+ // A genuine pre-execution session (no commands expected yet) is NOT blocked —
1060
+ // distinguished by requiring BOTH a non-pre-execution phase AND a non-pre-execution
1061
+ // status (avoids blocking a freshly planned task that has state.json but no log yet).
1062
+ //
1063
+ // BAR-RAISER: an agent could forge state.json with a pre-execution phase to avoid
1064
+ // this. The real anchor is external (signed checkpoints + human review).
1065
+ {
1066
+ const logFileMissing = !fs.existsSync(path.join(artifactDir, 'command-log.jsonl'));
1067
+ if (logFileMissing) {
1068
+ const captureState = readJsonFile(path.join(artifactDir, 'state.json'));
1069
+ if (captureState) {
1070
+ const capturePhase = normalizedStatus(captureState.phase || '');
1071
+ const captureStatus = normalizedStatus(captureState.status || '');
1072
+ // Post-execution: phase is set AND not a pre-execution phase (idea/backlog/pickup/planning).
1073
+ // Also require status is not a pre-execution status (new/planning/planned/backlog).
1074
+ const postExecPhase = capturePhase && !PRE_EXECUTION_PHASES.has(capturePhase);
1075
+ const preExecStatus = !captureStatus || captureStatus === 'new' || PRE_EXECUTION_STATUSES.has(captureStatus);
1076
+ if (postExecPhase && !preExecStatus) {
1077
+ // Fix #216 over-block: only emit the missing-log warning when a command was
1078
+ // actually EXPECTED to be captured — i.e., the trust.bundle evidence has at
1079
+ // least one item with execution.label (concrete proof a command was meant to
1080
+ // be captured). A no-command session (doc review, policy task advanced to
1081
+ // verification without running shell commands) must NOT be blocked here.
1082
+ // Note: `bundle` is already read at the top of captureCrossReference.
1083
+ const captureEvidence = bundle && Array.isArray(bundle.evidence) ? bundle.evidence : [];
1084
+ const hasExpectedCapture = captureEvidence.some(ev => ev && ev.execution && ev.execution.label);
1085
+ if (hasExpectedCapture) {
1086
+ warnings.push(
1087
+ `${base} expected capture log is missing — possible deletion of the capture truth source; ` +
1088
+ `phase:${capturePhase} status:${captureStatus} indicates commands should have run. ` +
1089
+ 'Cannot verify command execution deterministically. ' +
1090
+ 'Restore from a checkpoint or investigate.'
1091
+ );
1092
+ }
1093
+ }
285
1094
  }
286
- if (openCount >= 3) break;
287
1095
  }
288
1096
  }
289
1097
 
1098
+ // ── Hash-chain integrity check ──────────────────────────────────────────────
1099
+ // Verify command-log.jsonl before trusting its pass/fail signals. If the chain
1100
+ // is broken (altered, removed, or reordered entries), the capture truth source
1101
+ // is compromised: we must NOT trust its pass signals for claimed-pass checks.
1102
+ //
1103
+ // ok → proceed normally (chain is valid, log is trustworthy).
1104
+ // legacy → proceed normally (pre-B2 log, no chain to verify, existing behavior).
1105
+ // forked → benign concurrent-append race (not tampering): emit a loud but
1106
+ // NON-blocking advisory and keep trusting the records. The capture
1107
+ // contradiction teeth still run (the records are genuine, just not
1108
+ // linearly ordered); the operator can re-linearize with the repair
1109
+ // tool. This is what stops honest parallel work from being trapped.
1110
+ // broken → emit a loud warning and treat ALL claimed-pass commands relying on
1111
+ // this log as NOT_VERIFIED/blocking — do not let them sail through.
1112
+ let chainBroken = false;
1113
+ {
1114
+ const chainResult = verifyCommandLogChain(artifactDir);
1115
+ if (chainResult.status === 'broken') {
1116
+ chainBroken = true;
1117
+ const brokenIdx = chainResult.brokenAt !== null ? ` (entry ${chainResult.brokenAt})` : '';
1118
+ warnings.push(
1119
+ `${base} command-log integrity check FAILED — capture truth source appears tampered${brokenIdx}: ` +
1120
+ 'claimed-pass checks relying on it are NOT trusted. ' +
1121
+ 'This is tamper-EVIDENCE (hash-chain broken); alteration, removal, or reordering detected. ' +
1122
+ 'NOT_VERIFIED: cannot confirm or deny claimed passes.'
1123
+ );
1124
+ } else if (chainResult.status === 'forked') {
1125
+ // NOT a hard block: this string must not match HARD_BLOCK/FULL_BLOCK. A
1126
+ // concurrent fork is benign — no content was edited and nothing was
1127
+ // removed — so honest parallel work proceeds. We surface it loudly and
1128
+ // point at the deterministic repair.
1129
+ const forkIdx = chainResult.forkAt !== null ? ` (entry ${chainResult.forkAt})` : '';
1130
+ warnings.push(
1131
+ `${base} command-log shows a concurrent-capture fork${forkIdx} — two PostToolUse captures appended off the same parent ` +
1132
+ '(parallel writers before the writer lock). This is NOT tampering: every record is self-consistent and reachable. ' +
1133
+ 'Records remain trusted; re-linearize with: node scripts/repair-command-log.js <artifact-dir>'
1134
+ );
1135
+ }
1136
+ }
1137
+
1138
+ // Build the list of claimed-pass command checks — bundle-first, evidence.json fallback.
1139
+ let claimedPass;
1140
+ if (bundle && Array.isArray(bundle.claims)) {
1141
+ // Phase 4b: source from trust.bundle evidence[] (execution/command items).
1142
+ claimedPass = bundleClaimedPassCommandChecks(bundle, declaredClaimTypes);
1143
+ } else {
1144
+ // Fallback: no bundle — read from evidence.json (existing behavior, no regression).
1145
+ const evidence = readJsonFile(path.join(artifactDir, 'evidence.json'));
1146
+ if (!evidence || !Array.isArray(evidence.checks)) return warnings;
1147
+ claimedPass = evidence.checks.filter(check => {
1148
+ if (!check || typeof check !== 'object') return false;
1149
+ const kind = normalizedStatus(check.kind);
1150
+ const status = normalizedStatus(check.status);
1151
+ return kind === 'command' && (status === 'pass' || status === 'passed') && normalizeCommand(check.command);
1152
+ });
1153
+ }
1154
+
1155
+ for (const check of claimedPass.slice(0, 8)) {
1156
+ const cmd = normalizeCommand(check.command);
1157
+ if (!cmd) continue;
1158
+ const id = safeOneLine(check.id || cmd, 80);
1159
+ const logged = log.get(cmd);
1160
+
1161
+ if (!chainBroken && logged && logged.ran) {
1162
+ // (1) Cross-reference the capture log first (only when chain is intact).
1163
+ // A broken chain means we cannot trust the log's pass signals — skip this
1164
+ // shortcut and fall through to the backstop/NOT_VERIFIED path below.
1165
+ if (logged.failed) {
1166
+ const exit = Number.isInteger(logged.exitCode) ? ` (exitCode:${logged.exitCode})` : '';
1167
+ warnings.push(`${base} evidence check ${id}: capture log CONTRADICTS claimed pass — command "${safeOneLine(cmd, 120)}" was recorded as FAIL${exit}. This is a caught false-completion.`);
1168
+ } else if (hasLaunderingOperator(cmd)) {
1169
+ // Fix D: exit-code laundering. The captured exit-0 is not trustworthy — the command
1170
+ // baked in '|| true' / '|| :' / '; true' / '; exit 0' / '| true' to mask the real result.
1171
+ warnings.push(`${base} evidence check ${id}: claimed pass relies on an exit-code-laundered command "${safeOneLine(cmd, 120)}" — the exit code is not a trustworthy signal (laundering operators mask the real exit code).`);
1172
+ }
1173
+ // else: log shows it ran and passed with no laundering → satisfied deterministically.
1174
+ continue;
1175
+ }
1176
+
1177
+ // (2) Backstop: the log has NO execution for this claimed-pass command.
1178
+ if (backstopMode === 'skip') {
1179
+ warnings.push(`${base} evidence check ${id}: claimed pass but NOT_VERIFIED — command "${safeOneLine(cmd, 120)}" was never captured and backstop re-run is disabled (FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip).`);
1180
+ continue;
1181
+ }
1182
+ const trusted = resolveTrustedCommand(root, artifactDir, check, acceptance);
1183
+ if (!trusted) {
1184
+ warnings.push(`${base} evidence check ${id}: claimed pass but NOT_VERIFIED — command "${safeOneLine(cmd, 120)}" was never captured and no trusted command (acceptance criterion / declared manifest target) resolves to re-run it. Set FLOW_AGENTS_GOAL_FIT_RECHECK=true to opt into re-running the model's free-form command.`);
1185
+ continue;
1186
+ }
1187
+ const outcome = runBackstop(trusted);
1188
+ if (!outcome.ran) {
1189
+ warnings.push(`${base} evidence check ${id}: claimed pass but NOT_VERIFIED — trusted backstop (${trusted.source}) could not run (${safeOneLine(outcome.error, 80)}).`);
1190
+ continue;
1191
+ }
1192
+ if (!outcome.passed) {
1193
+ const note = `${base} evidence check ${id}: trusted backstop (${trusted.source}) re-run of "${trusted.argv.join(' ')}" FAILED with exit ${outcome.exitCode}, contradicting the claimed pass. This is a caught false-completion.`;
1194
+ if (backstopMode === 'off') warnings.push(`${note} [backstop in warn mode — not blocking]`);
1195
+ else warnings.push(note);
1196
+ }
1197
+ // backstop passed → claim deterministically confirmed by re-run, no warning.
1198
+ }
1199
+
1200
+ return warnings;
1201
+ }
1202
+
1203
/**
 * Namespace-agnostic captured-FAIL reconciliation (AC1 — closes the allowlist bypass).
 *
 * captureCrossReference only inspects claims that pass the namespace allowlist
 * (workflow.* prefix or declared gateExpects[]), so a kit-typed claim such as
 * builder.verify.tests could assert "pass" for a command whose capture says FAIL
 * and go unnoticed. This pass ignores namespaces entirely: it takes the
 * latest capture per command and looks for ANY claim asserting pass for a
 * command that either
 *   - last ran FAIL (Case A, status-independent — runs on every stop), or
 *   - last ran PASS but contains an exit-code-laundering operator (Fix D), so
 *     the captured exit 0 is not a trustworthy signal.
 *
 * Deliberately NOT flagged (no-over-block guarantees):
 *   - fail-then-re-run-to-pass: the latest capture is PASS, so it is not tracked;
 *   - a failure with no pass-claim at all (former "Case B", removed because it
 *     over-blocked incidental failures such as grep no-match or git diff --exit-code);
 *   - a session with no captures: the latest-capture map is empty;
 *   - a broken hash chain (Fix E): the log is untrusted, and the integrity
 *     warning is emitted by captureCrossReference instead.
 *
 * @param {string} root         Repository root; used only to render a relative path.
 * @param {string} artifactDir  Task artifact directory holding the log and trust.bundle.
 * @param {string} [taskStatus] Unused; retained so existing callers keep working.
 * @returns {string[]} Warning lines (empty when nothing contradicts the capture).
 */
function capturedFailReconciliation(root, artifactDir, taskStatus) {
  const latestByCommand = readLatestCommandLog(artifactDir);
  if (latestByCommand.size === 0) return []; // no captures — nothing to reconcile

  // Fix E: a broken chain means pass/fail signals cannot be trusted at all.
  const { status: chainStatus } = verifyCommandLogChain(artifactDir);
  if (chainStatus === 'broken') return [];

  // Partition the latest captures in one pass: failures vs. laundered passes.
  const failedLatest = new Map(); // cmd -> capture info (failed)
  const launderedLatest = new Map(); // cmd -> capture info (exit 0, laundered)
  for (const [cmd, info] of latestByCommand) {
    if (info.failed) {
      failedLatest.set(cmd, info);
    } else if (hasLaunderingOperator(cmd)) {
      launderedLatest.set(cmd, info);
    }
  }
  if (failedLatest.size === 0 && launderedLatest.size === 0) return []; // nothing to flag

  const base = relative(root, artifactDir);
  const findings = [];

  // Claims and evidence come from the trust.bundle; a missing bundle yields empty lists.
  const bundle = readJsonFile(path.join(artifactDir, 'trust.bundle'));
  const allClaims = bundle && Array.isArray(bundle.claims) ? bundle.claims : [];
  const allEvidence = bundle && Array.isArray(bundle.evidence) ? bundle.evidence : [];

  // claimId -> claim, for the evidence -> claim hop.
  const claimById = new Map();
  for (const claim of allClaims) {
    if (claim && claim.id) claimById.set(claim.id, claim);
  }

  const isTracked = (cmd) => failedLatest.has(cmd) || launderedLatest.has(cmd);

  // cmd -> { passClaims, ackClaims }: what the bundle's claims say about each tracked command.
  const verdictsByCommand = new Map();
  const tally = (cmd, claim) => {
    let entry = verdictsByCommand.get(cmd);
    if (!entry) {
      entry = { passClaims: [], ackClaims: [] };
      verdictsByCommand.set(cmd, entry);
    }
    const s = String(claim.status || '').toLowerCase();
    const v = normalizedStatus(claim.value || '');
    if (claimAssertsPass(s, v)) entry.passClaims.push(claim);
    if (claimAcknowledgesFailure(s, v)) entry.ackClaims.push(claim);
  };

  // Path A: an evidence item's execution.label ties its claim to a concrete command.
  for (const ev of allEvidence) {
    const label = ev && ev.execution && ev.execution.label;
    if (!label) continue;
    const cmd = normalizeCommand(label);
    if (!cmd || !isTracked(cmd)) continue;
    const claim = claimById.get(ev.claimId);
    if (claim) tally(cmd, claim);
  }

  // Path B: the claim's own fieldOrBehavior resolves directly to the command.
  for (const claim of allClaims) {
    if (!claim) continue;
    const cmd = normalizeCommand(claim.fieldOrBehavior || '');
    if (cmd && isTracked(cmd)) tally(cmd, claim);
  }

  // Case A: latest capture is FAIL yet some claim (any namespace) asserts pass.
  // An acknowledged failure with no pass-claim produces no warning.
  for (const [cmd, failInfo] of failedLatest) {
    const entry = verdictsByCommand.get(cmd);
    if (!entry || entry.passClaims.length === 0) continue;
    const exitStr = Number.isInteger(failInfo.exitCode) ? ` (exit ${failInfo.exitCode})` : '';
    const claim = entry.passClaims[0];
    findings.push(
      `${base} captured command '${safeOneLine(cmd, 120)}' last ran FAIL${exitStr} ` +
        `but claim ${safeOneLine(claim.subjectId || claim.id, 80)} (${safeOneLine(claim.claimType, 48)}) ` +
        `asserts pass — namespace-agnostic caught false-completion.`
    );
  }

  // Fix D: latest capture is exit 0, but the command launders its exit code.
  for (const cmd of launderedLatest.keys()) {
    const entry = verdictsByCommand.get(cmd);
    if (!entry || entry.passClaims.length === 0) continue;
    const claim = entry.passClaims[0];
    findings.push(
      `${base} captured command '${safeOneLine(cmd, 120)}' claimed pass relies on an ` +
        `exit-code-laundered command — claim ${safeOneLine(claim.subjectId || claim.id, 80)} ` +
        `(${safeOneLine(claim.claimType, 48)}) asserts pass but the exit code is not a ` +
        `trustworthy signal (laundering operators mask the real exit code).`
    );
  }

  return findings;
}
1347
+
1348
+ // ─── ADR 0010 Phase 2: enforce on the canonical Hachure trust.bundle ──────────
1349
+
1350
+ // ─── ADR 0010 Phase 2: enforce on the canonical Hachure trust.bundle ──────────
1351
+ // The trust.bundle (emitted by workflow-sidecar via @kontourai/surface) carries
1352
+ // each claim's Surface-derived status — including capture-authoritative results
1353
+ // (a claimed-pass whose captured command FAILED is already `disputed` here). A
1354
+ // high-impact `disputed` claim is the canonical false-completion signal; we gate
1355
+ // on the bundle the producers already emit, not on bespoke markdown.
1356
+ //
1357
+ // ADR 0010 Phase 2b: re-derive-at-gate hardening.
1358
+ // We re-derive each claim's status from the bundle's own evidence/events/policies
1359
+ // via Surface's canonical deriveClaimStatus, so editing the stored `claim.status`
1360
+ // field does not bypass the gate. If the re-derived status is disputed/rejected
1361
+ // for a high/critical claim, we block. If the re-derived status DIFFERS from the
1362
+ // stored status (e.g. stored "verified" but evidence re-derives to "disputed"),
1363
+ // that mismatch is a strong tamper signal — block with an explicit warning.
1364
+ // Fail-open: if Surface is unavailable, fall back to the stored-status check.
1365
+ //
1366
+ // ADR 0016 P-c: when activeFlowStep is non-null, claim-selection uses the gate's
1367
+ // declared claimType set (gateExpects[].bundle_claim.claimType). When null, the
1368
+ // existing workflow.* prefix filter runs unchanged (fallback). The re-derivation
1369
+ // loop, tamper-detection, high/critical filter, and block/exit-2 logic are
1370
+ // STRUCTURALLY UNCHANGED — only WHICH claims are selected changes.
1371
/**
 * Gate on the high/critical-impact claims of the task's trust.bundle.
 *
 * For every selected claim the status is re-derived from the bundle's own
 * evidence/events/policies via Surface's deriveClaimStatus, and the STRICTER of
 * the stored and re-derived status decides whether a warning is emitted. A
 * stored "verified"/"assumed" that re-derives to "disputed"/"rejected" is
 * reported as tampering.
 *
 * Claim selection is a union: workflow.* claims are always selected, and the
 * claimTypes declared by the active flow step's gateExpects[] are added on top.
 * An active step with zero declared claimTypes is reported as a gate
 * misconfiguration.
 *
 * Fail-closed: when re-derivation is not possible — Surface failed to load, OR
 * it loaded without a callable deriveClaimStatus — and at least one selected
 * high/critical claim exists, a "surface unavailable" warning is emitted so the
 * stored status is never trusted silently.
 *
 * @param {string} artifactDir       Task artifact directory containing trust.bundle.
 * @param {object|null} activeFlowStep Active FlowDefinition gate, or null when none
 *                                   is active; only its gateExpects[] is read here.
 * @returns {Promise<string[]>} Warning lines; empty when the bundle is absent,
 *                              has no claims[] array, or nothing is blocking.
 */
async function bundleEnforcement(artifactDir, activeFlowStep) {
  const bundle = readJsonFile(path.join(artifactDir, 'trust.bundle'));
  if (!bundle || !Array.isArray(bundle.claims)) return [];

  const surface = await tryLoadSurface();
  // Single source of truth for "can we re-derive?". The fail-closed guard and
  // the per-claim loop must agree, otherwise a Surface module lacking
  // deriveClaimStatus would skip both the re-derivation and the warning.
  const canRederive = Boolean(surface) && typeof surface.deriveClaimStatus === 'function';
  const warnings = [];

  const allEvidence = Array.isArray(bundle.evidence) ? bundle.evidence : [];
  const allEvents = Array.isArray(bundle.events) ? bundle.events : [];
  const allPolicies = Array.isArray(bundle.policies) ? bundle.policies : [];

  // P-c: claimTypes declared by the active gate; null when no flow step is active.
  const declaredClaimTypes = activeFlowStep && Array.isArray(activeFlowStep.gateExpects)
    ? new Set(activeFlowStep.gateExpects.map(e => e && e.bundle_claim && e.bundle_claim.claimType).filter(Boolean))
    : null;

  // SECURITY (gate-bypass-chain fix): union, not if/else. workflow.* is the
  // always-on baseline; declared types only ever extend it, so an empty
  // declared set cannot deselect every claim.
  const isSelectedClaim = (claim) => {
    const ct = String(claim.claimType || '');
    return ct.startsWith('workflow.') || (declaredClaimTypes != null && declaredClaimTypes.has(ct));
  };

  // Misconfiguration guard: an active flow with zero expects[] is suspicious
  // (possible tampered definition), so surface it loudly.
  if (declaredClaimTypes !== null && declaredClaimTypes.size === 0) {
    warnings.push(`gate misconfiguration: active FlowDefinition has empty expects[] (possible tampered flow definition or zero-expects bypass attempt); verify the flow definition is authentic before delivery.`);
  }

  // The claims this gate is responsible for: object-shaped, high/critical, selected.
  const gatedClaims = bundle.claims.filter((claim) => {
    if (!claim || typeof claim !== 'object') return false;
    const impact = String(claim.impactLevel || '').toLowerCase();
    if (impact !== 'high' && impact !== 'critical') return false;
    return isSelectedClaim(claim);
  });

  // AC3 fail-closed (high-assurance path). Low-impact-only bundles stay quiet.
  // BAR-RAISER, NOT AIRTIGHT: an agent could remove Surface to force this path;
  // the real anchor is external (signed checkpoints + human review).
  if (!canRederive && gatedClaims.length > 0) {
    warnings.push(
      `surface unavailable — ${gatedClaims.length} high/critical-impact claim(s) could not be re-derived at gate; ` +
      'stored claim status is trusted without independent re-derivation (fail-closed: high-assurance path). ' +
      'Ensure @kontourai/surface is installed and importable, or escalate for operator review.'
    );
  }

  for (const claim of gatedClaims) {
    const storedStatus = String(claim.status || '').toLowerCase();

    // Step 1: re-derive from evidence/events/policies so that editing the
    // stored status field cannot bypass the gate.
    let recomputedStatus = null; // null: re-derive not attempted, or it threw
    if (canRederive) {
      const claimId = claim.id;
      const claimEvidence = allEvidence.filter(ev => ev && ev.claimId === claimId);
      const claimEvents = allEvents.filter(evt => evt && evt.claimId === claimId);
      try {
        const result = surface.deriveClaimStatus({
          claim,
          evidence: claimEvidence,
          events: claimEvents,
          policies: allPolicies,
        });
        recomputedStatus = result && typeof result.status === 'string' ? result.status.toLowerCase() : 'unknown';
      } catch {
        // deriveClaimStatus threw (e.g. schema mismatch) — deliberate
        // best-effort fallback to the stored status for this claim.
        recomputedStatus = null;
      }
    }

    // Step 2: stricter-of-both. Deleting evidence cannot clear a stored
    // `disputed`; flipping stored to "verified" cannot hide a recomputed one.
    const effectiveDisputed = storedStatus === 'disputed' || storedStatus === 'rejected'
      || recomputedStatus === 'disputed' || recomputedStatus === 'rejected';
    if (!effectiveDisputed) continue;

    // Step 3: a trusted-looking stored status that re-derives to blocking is
    // reported as tampering; everything else is a plain dispute.
    const isTampered = recomputedStatus !== null
      && (storedStatus === 'verified' || storedStatus === 'assumed')
      && (recomputedStatus === 'disputed' || recomputedStatus === 'rejected');

    if (isTampered) {
      warnings.push(`trust.bundle claim tampered: ${safeOneLine(claim.subjectId || claim.id, 80)} (${safeOneLine(claim.claimType, 48)}) — stored status "${storedStatus}" does not match recompute "${recomputedStatus}" (possible tampered bundle); caught false-completion. Run: npm run workflow:sidecar -- claim ${safeOneLine(claim.subjectId || claim.id, 80)} ${artifactDir}`);
    } else {
      warnings.push(`trust.bundle claim disputed: ${safeOneLine(claim.subjectId || claim.id, 80)} (${safeOneLine(claim.claimType, 48)}) — Surface recompute shows not verified; caught false-completion. Run: npm run workflow:sidecar -- claim ${safeOneLine(claim.subjectId || claim.id, 80)} ${artifactDir}`);
    }
  }
  return warnings;
}
292
1497
 
293
- function markdownVerdict(text) {
294
- const verdict = (/###\s+Verdict:\s*([A-Za-z_ -]+)/i.exec(text) || [])[1]
295
- || (/^Build:\s*\[?([A-Za-z_ -]+)\]?/im.exec(text) || [])[1]
296
- || '';
297
- return normalizedStatus(verdict).replace(/[^a-z_ -].*$/, '').trim();
1498
/**
 * Scope to the session's current task when .flow-agents/current.json points at
 * one (mirroring evidence-capture.js).
 *
 * The slug is taken from `artifact_dir`, falling back to `active_slug`. Runs of
 * two or more dots and any leading path separators are stripped before the slug
 * is joined onto the base, and the result must resolve strictly inside the base
 * directory and exist on disk.
 *
 * @param {string} flowAgentsDir Path of the .flow-agents directory. It need not
 *   be normalised (a trailing separator or `./` segments are fine).
 * @returns {string|null} The task directory, or null to fall back to scanning
 *   all of .flow-agents (newest-mtime).
 */
function preferredArtifactDir(flowAgentsDir) {
  const current = readJsonFile(path.join(flowAgentsDir, 'current.json'));
  if (!current) return null;
  const slug = current.artifact_dir || current.active_slug;
  if (typeof slug !== 'string' || !slug.trim()) return null;
  const safe = slug.replace(/\.\.+/g, '').replace(/^[/\\]+/, '');
  const dir = path.join(flowAgentsDir, safe);
  // Containment is checked on the relative path rather than by string prefix:
  // path.join normalises `dir` but not `flowAgentsDir`, so a prefix test
  // wrongly rejects every slug when the base carries a trailing separator.
  const rel = path.relative(flowAgentsDir, dir);
  const isInside = rel !== ''
    && rel !== '..'
    && !rel.startsWith(`..${path.sep}`)
    && !path.isAbsolute(rel);
  return isInside && fs.existsSync(dir) ? dir : null;
}
1512
+
1513
/**
 * Decide whether a task is still pre-execution (work not yet started).
 *
 * With a state.json present, the task counts as pre-execution when EITHER its
 * status is in PRE_EXECUTION_STATUSES or its phase is in PRE_EXECUTION_PHASES
 * (the idea→planning band). Without a state.json, only the markdown status is
 * consulted.
 *
 * @param {string} artifactDir    Task artifact directory that may hold state.json.
 * @param {string} markdownStatus Status parsed from the markdown artifact; used
 *                                only when state.json is absent or unreadable.
 * @returns {boolean} True when the task has not progressed past planning.
 */
function isPreExecution(artifactDir, markdownStatus) {
  const sidecarState = readJsonFile(path.join(artifactDir, 'state.json'));

  if (!sidecarState) {
    // No sidecar state: the markdown status is the only signal available.
    return PRE_EXECUTION_STATUSES.has(normalizedStatus(markdownStatus));
  }

  // Status is checked first; the phase is only consulted when the status is not early.
  if (PRE_EXECUTION_STATUSES.has(normalizedStatus(sidecarState.status))) {
    return true;
  }
  return PRE_EXECUTION_PHASES.has(normalizedStatus(sidecarState.phase));
}
1525
+
1526
+
1527
+ // ─── Wave 2c: no-bundle/no-state fallback gate ────────────────────────────────
1528
+ // Sessions that have NEITHER a trust.bundle NOR a state.json fall through
1529
+ // both bundleEnforcement (no bundle) and sidecarGuidance (no state). Without the
1530
+ // old markdown heading checks this would create a silent ungated-session path.
1531
+ // If a trust.bundle exists, bundleEnforcement handles it. If state.json exists,
1532
+ // sidecarGuidance handles it. The gap: a session with only a markdown artifact.
1533
+ //
1534
+ // ADR 0010 Phase 4b: Adjustment A (Final Acceptance hygiene):
1535
+ // When the task is delivered but acceptance criteria are still pending, emit the
1536
+ // Final Acceptance reminder. Read from trust.bundle claims when present; fall back
1537
+ // to acceptance.json for bundle-less sessions.
1538
+ //
1539
+ // ADR 0016 P-c: pass activeFlowStep so bundlePendingCriteriaCount includes declared types.
1540
/**
 * Fallback gate for sessions the bundle/state paths would not cover, plus the
 * Final Acceptance hygiene reminder.
 *
 * 1. When there is no USABLE trust.bundle (parsed, with a claims[] array) and no
 *    state.json, emit a NOT_VERIFIED warning. An unreadable bundle file counts
 *    as "no usable bundle": bundleEnforcement returns nothing for such a file,
 *    so treating mere file existence as coverage would leave the session ungated.
 * 2. Otherwise report how many acceptance criteria are still pending — from the
 *    bundle's claims when available, else from acceptance.json.
 *
 * @param {string} artifactDir         Task artifact directory.
 * @param {object|null} activeFlowStep Active FlowDefinition gate or null; its
 *   gateExpects[] claimTypes are forwarded to bundlePendingCriteriaCount (P-c).
 * @returns {string[]} Warning lines (possibly empty).
 */
function missingBundleOrStateSignal(artifactDir, activeFlowStep) {
  // P-c: claimTypes declared by the active gate; null when no flow step is active.
  const declaredClaimTypes = activeFlowStep && Array.isArray(activeFlowStep.gateExpects)
    ? new Set(activeFlowStep.gateExpects.map(e => e && e.bundle_claim && e.bundle_claim.claimType).filter(Boolean))
    : null;
  const warnings = [];
  const base = path.basename(artifactDir);

  // Read the bundle once; "usable" means it parsed and carries a claims[] array.
  // NOTE(review): relies on readJsonFile yielding a falsy value for a missing or
  // unparsable file, as its other call sites here assume — confirm.
  const bundlePath = path.join(artifactDir, 'trust.bundle');
  const bundle = readJsonFile(bundlePath);
  const bundleClaims = bundle && Array.isArray(bundle.claims) ? bundle.claims : null;
  const state = readJsonFile(path.join(artifactDir, 'state.json'));

  if (!bundleClaims && !state) {
    if (fs.existsSync(bundlePath)) {
      // File is present but empty/corrupt/claim-less: say so, and stay gated.
      warnings.push(`${base} NOT_VERIFIED — trust.bundle is unreadable or has no claims[] and no state.json found; run 'workflow-sidecar record-evidence' to rebuild the evidence record before delivery.`);
    } else {
      // Neither trust.bundle nor state.json: session is untracked by the sidecar path.
      warnings.push(`${base} NOT_VERIFIED — no trust.bundle or state.json found; run 'workflow-sidecar record-evidence' to build the evidence record before delivery.`);
    }
    return warnings;
  }

  // Adjustment A: Final Acceptance hygiene — bundle-first, acceptance.json fallback.
  if (bundleClaims) {
    // Phase 4b / P-c: pending count from bundle claims, including declared types.
    const pendingCount = bundlePendingCriteriaCount(bundleClaims, declaredClaimTypes);
    if (pendingCount !== null && pendingCount > 0) {
      warnings.push(`${base} Final Acceptance: ${pendingCount} acceptance criterion/criteria still pending; complete CI/merge/docs before final delivery.`);
    }
    return warnings;
  }

  // Fallback: no usable bundle — read from acceptance.json.
  const acceptance = readJsonFile(path.join(artifactDir, 'acceptance.json'));
  if (acceptance && Array.isArray(acceptance.criteria)) {
    const pendingCriteria = acceptance.criteria.filter(c => {
      const s = normalizedStatus(c && c.status);
      return s === 'pending' || s === 'not_started' || s === '' || s === 'unknown';
    });
    if (pendingCriteria.length > 0) {
      warnings.push(`${base} Final Acceptance: ${pendingCriteria.length} acceptance criterion/criteria still pending; complete CI/merge/docs before final delivery.`);
    }
  }

  return warnings;
}
299
1588
 
300
- function analyze(root, now = Date.now()) {
301
- const dirs = [path.join(root, '.flow-agents')];
302
- const artifacts = dirs
1589
// ─── Gate severity classification regexes (module scope; used by analyze() and run())
//
// HARD_BLOCK: always blocks, even for pre-execution and terminal tasks.
//   Fires on genuine false-completion signals (a claimed pass the capture log or
//   evidence.json contradicts), integrity failures, and gate misconfiguration.
//
// FULL_BLOCK: fires for execution-onward tasks (post-planning, non-terminal).
//   Matches everything HARD_BLOCK matches, plus completeness/hygiene and
//   not-done state. Invariant: any alternative added to HARD_BLOCK must also be
//   added to FULL_BLOCK, otherwise a hard-block warning would be classified as
//   non-blocking by code that consults FULL_BLOCK alone.
//
// Both are used in analyze() for blocking decisions AND in run() for the AC2
// MAX_BLOCKS hard-block guard (preventing auto-release of hard blocks).
//
// Neither regex carries the /g or /y flag, so .test() is stateless and the
// shared instances are safe to reuse across calls.
const HARD_BLOCK = /contradicts evidence\.json|caught false-completion|evidence verdict:|evidence check .+ status:|critique status|critique open|required sidecar is missing|command-log integrity check FAILED|gate misconfiguration:|exit-code-laundered/;
// FULL_BLOCK adds: workflow-state hygiene, surface-unavailable fail-closed, missing log.
// `required sidecar is missing` is listed explicitly so the superset invariant holds.
const FULL_BLOCK = /status:|Definition Of Done|Goal Fit|sidecar validation:|contradicts evidence\.json|workflow state|evidence verdict|evidence check|NOT_VERIFIED gap|critique status|critique open|required sidecar is missing|next action|caught false-completion|NOT_VERIFIED —|command-log integrity check FAILED|gate misconfiguration:|surface unavailable —|expected capture log is missing|exit-code-laundered/;
1603
+
1604
+ async function analyze(root, now = Date.now()) {
1605
+ const flowAgentsDir = path.join(root, '.flow-agents');
1606
+ // Scope to the session's current task when current.json names one, so an
1607
+ // unrelated active workflow elsewhere in the repo does not gate this stop.
1608
+ const scoped = preferredArtifactDir(flowAgentsDir);
1609
+ const searchDirs = scoped ? [scoped] : [flowAgentsDir];
1610
+ const artifacts = searchDirs
303
1611
  .flatMap(dir => walkMarkdown(dir))
304
1612
  .map(readArtifact)
305
1613
  .filter(isWorkflowArtifact)
@@ -317,51 +1625,174 @@ function analyze(root, now = Date.now()) {
317
1625
  warnings.push(`${relPath} is still status:${status} (${ageMinutes}m old). Do not final-answer as complete unless the next step is explicit.`);
318
1626
  }
319
1627
 
320
- if (!hasHeading(latest.text, 'Definition Of Done')) {
321
- warnings.push(`${relPath} is missing ## Definition Of Done, so the user-facing finish line is not explicit.`);
322
- }
1628
+ // Builder heading completeness checks (hasHeading DOD/Goal Fit Gate) removed in ADR 0010 2c.
1629
+ // Verdict is now bundle-driven via bundleEnforcement + sidecarGuidance.
1630
+ // Sessions with neither trust.bundle nor state.json are caught by missingBundleOrStateSignal.
323
1631
 
324
- if (!hasHeading(latest.text, 'Goal Fit Gate')) {
325
- warnings.push(`${relPath} is missing ## Goal Fit Gate, so local acceptance has not been checked.`);
326
- } else {
327
- for (const item of uncheckedInSection(latest.text, 'Goal Fit Gate').slice(0, 6)) {
328
- warnings.push(`${relPath} Goal Fit unchecked: ${item}`);
329
- }
330
- }
331
-
332
- if (status === 'delivered' && hasHeading(latest.text, 'Final Acceptance')) {
333
- const uncheckedFinal = uncheckedInSection(latest.text, 'Final Acceptance');
334
- if (uncheckedFinal.length > 0) {
335
- warnings.push(`${relPath} local delivery is marked delivered, but Final Acceptance still has ${uncheckedFinal.length} open item(s) for CI/merge/docs promotion.`);
336
- }
337
- }
1632
+ // ADR 0016 P-c: load the active FlowDefinition gate (fail-open: null when absent).
1633
+ // Null: the existing workflow.* fallback path is unchanged. Non-null: claim selection is driven by expects[].
1634
+ const activeFlowStep = loadActiveFlowStep(flowAgentsDir);
338
1635
 
339
1636
  warnings.push(...sidecarValidation(root, path.dirname(latest.file)));
340
- const evidence = readJsonFile(path.join(path.dirname(latest.file), 'evidence.json'));
341
- if (evidence && markdownVerdict(latest.text) === 'pass' && normalizedStatus(evidence.verdict) === 'fail') {
342
- warnings.push(`${relPath} Markdown PASS contradicts evidence.json verdict fail.`);
1637
+ warnings.push(...sidecarGuidance(root, path.dirname(latest.file), activeFlowStep));
1638
+ const captureWarnings = captureCrossReference(root, path.dirname(latest.file), activeFlowStep);
1639
+ warnings.push(...captureWarnings);
1640
+ // Dedup: bundleEnforcement and captureCrossReference can both fire "caught false-completion"
1641
+ // for the same disputed claim. Suppress the bundleEnforcement warning ONLY when
1642
+ // captureCrossReference already produced a hard-block warning ("caught false-completion")
1643
+ // for the same check. NOT_VERIFIED / backstop-skip capture warnings must NOT suppress
1644
+ // the bundle tamper/disputed signal — that mismatch is a re-derive block independent of
1645
+ // whether the command was ever captured (anti-gaming guarantee, ADR 0010 Phase 2b).
1646
+ const captureHardBlockIds = new Set();
1647
+ for (const w of captureWarnings) {
1648
+ if (!/caught false-completion/.test(w)) continue; // only hard blocks suppress bundle warning
1649
+ const m = /evidence check ([^\s:]+):/.exec(w);
1650
+ if (m) captureHardBlockIds.add(m[1]);
343
1651
  }
344
- warnings.push(...sidecarGuidance(root, path.dirname(latest.file)));
1652
+ const bundleWarnings = (await bundleEnforcement(path.dirname(latest.file), activeFlowStep)).filter(w => {
1653
+ if (!captureHardBlockIds.size) return true;
1654
+ // bundleEnforcement warns: "trust.bundle claim disputed: <subjectId> ..."
1655
+ const m = /trust\.bundle claim (?:disputed|tampered): ([^\s(]+)/.exec(w);
1656
+ if (!m) return true;
1657
+ const subjectId = m[1];
1658
+ // subjectId = "slug/checkId" — extract the checkId (everything after the first "/")
1659
+ const checkId = subjectId.includes('/') ? subjectId.slice(subjectId.indexOf('/') + 1) : subjectId;
1660
+ // If captureCrossReference already hard-blocked this check, suppress the bundle warning.
1661
+ return !captureHardBlockIds.has(checkId);
1662
+ });
1663
+ warnings.push(...bundleWarnings);
1664
+ warnings.push(...missingBundleOrStateSignal(path.dirname(latest.file), activeFlowStep));
1665
+
1666
+ // A pre-execution task (not started) OR a terminal task (which is itself a
1667
+ // completion *claim*) must not block on mere incompleteness — but a FALSE claim
1668
+ // (capture/evidence contradiction) still blocks at any phase. This is the whole
1669
+ // point of the capture cross-reference: catch a task that falsely claims done.
1670
+ const gateState = readJsonFile(path.join(path.dirname(latest.file), 'state.json'));
1671
+ const taskStatus = gateState ? normalizedStatus(gateState.status) : normalizedStatus(status);
1672
+ const preExecution = isPreExecution(path.dirname(latest.file), status);
1673
+ const terminal = TERMINAL_STATUSES.has(taskStatus);
1674
+
1675
+ // Namespace-agnostic captured-FAIL reconciliation (AC1 — closes the allowlist bypass).
1676
+ // Fix A: status-independent — runs on EVERY stop. A claim contradicting the capture
1677
+ // is a false-completion whether or not the agent says the task is 'done'.
1678
+ warnings.push(...capturedFailReconciliation(root, path.dirname(latest.file), taskStatus));
345
1679
 
346
- const blocking = warnings.some(w => /status:|Definition Of Done|Goal Fit|sidecar validation:|contradicts evidence\.json|workflow state|evidence verdict|evidence check|NOT_VERIFIED gap|critique status|critique open|next action/.test(w));
347
- return { warnings, blocking };
1680
+ // Use module-scope HARD_BLOCK / FULL_BLOCK (defined above analyze()).
1681
+ // pre-execution/terminal tasks: only HARD_BLOCK signals cause a block.
1682
+ // execution-onward tasks: FULL_BLOCK signals cause a block.
1683
+ const blockRe = (preExecution || terminal) ? HARD_BLOCK : FULL_BLOCK;
1684
+ const blocking = warnings.some(w => {
1685
+ // Capture cross-reference warn-mode notes never block (operator opted out).
1686
+ if (/\[backstop in warn mode — not blocking\]/.test(w)) return false;
1687
+ return blockRe.test(w);
1688
+ });
1689
+ return { warnings, blocking, preExecution };
1690
+ }
1691
+
1692
/**
 * Resolve the enforcement mode. FLOW_AGENTS_GOAL_FIT_MODE (block|warn|off) wins;
 * the legacy FLOW_AGENTS_GOAL_FIT_STRICT=true maps to block; otherwise the
 * canonical engine default is warn.
 *
 * Both variables are trimmed and lower-cased before comparison, so values such
 * as " Block " or "TRUE\n" behave the same as their canonical spelling. An
 * unrecognised FLOW_AGENTS_GOAL_FIT_MODE value is ignored and resolution falls
 * through to the legacy variable and then the default.
 *
 * @returns {'block'|'warn'|'off'} The effective enforcement mode.
 */
function resolveGoalFitMode() {
  // One normaliser for both variables keeps their parsing rules identical.
  const normalize = (value) => String(value || '').trim().toLowerCase();

  const explicit = normalize(process.env.FLOW_AGENTS_GOAL_FIT_MODE);
  if (explicit === 'block' || explicit === 'warn' || explicit === 'off') return explicit;

  const strict = normalize(process.env.FLOW_AGENTS_GOAL_FIT_STRICT) === 'true';
  return strict ? 'block' : 'warn';
}
1703
+
1704
/**
 * Escape hatch for block mode: the maximum number of consecutive identical
 * goal-fit blocks before the hook stops refusing, so that a goal which can
 * never be satisfied does not trap the agent in an endless loop.
 *
 * Read from FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS (parsed as a base-10 integer).
 * Anything that does not parse to a positive integer falls back to 3.
 *
 * @returns {number} A positive integer block cap.
 */
function resolveMaxBlocks() {
  const configured = Number.parseInt(process.env.FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS || '', 10);
  if (!Number.isInteger(configured)) return 3; // unset or unparseable (NaN)
  if (configured <= 0) return 3; // zero / negative caps are not meaningful
  return configured;
}
349
1714
 
350
- function run(rawInput) {
1715
/**
 * Location of the JSON file that persists the consecutive-block streak.
 *
 * @param {string} root - Repository root directory.
 * @returns {string} Path to `.flow-agents/.goal-fit-block-streak.json` under `root`.
 */
function blockStreakFile(root) {
  const stateDir = path.join(root, '.flow-agents');
  return path.join(stateDir, '.goal-fit-block-streak.json');
}
1718
+
1719
/**
 * Fingerprint a set of warnings so consecutive stops can be compared for
 * "same gap as last time" (see the block-streak counter).
 *
 * The status warning emitted by analyze() embeds the artifact age as
 * "(<N>m old)". That number changes every minute, which would make an
 * otherwise identical gap hash differently and reset the streak. The age token
 * is therefore normalised before hashing; all other text is hashed verbatim.
 *
 * @param {string[]|null|undefined} warnings - Warning lines; nullish is treated as empty.
 * @returns {string} Decimal string of an unsigned 32-bit djb2-style hash.
 */
function reasonsHash(warnings) {
  const text = (warnings || [])
    .join('\n')
    .replace(/\(\d+m old\)/g, '(Nm old)');
  let h = 5381;
  // h * 33 + charCode, folded back to an unsigned 32-bit integer on each step.
  for (let i = 0; i < text.length; i += 1) h = ((h << 5) + h + text.charCodeAt(i)) >>> 0;
  return String(h);
}
1725
+
1726
/**
 * Reset the consecutive-block streak by deleting its state file.
 * Best effort: any error while resolving or removing the file is ignored.
 *
 * @param {string} root - Repository root directory.
 */
function clearBlockStreak(root) {
  try {
    const streakPath = blockStreakFile(root);
    // force: true — a file that does not exist is not an error.
    fs.rmSync(streakPath, { force: true });
  } catch {
    /* best effort */
  }
}
1729
+
1730
/**
 * Record one more block for the given reasons hash and return the new streak
 * length. The streak continues only when the stored hash equals `hash`;
 * otherwise it restarts at 1.
 *
 * Persisting the new state is best effort: a write failure is ignored and the
 * computed count is still returned.
 *
 * @param {string} root - Repository root directory.
 * @param {string} hash - Fingerprint of the current warnings.
 * @returns {number} Consecutive blocks seen for `hash`, including this one.
 */
function bumpBlockStreak(root, hash) {
  const streakPath = blockStreakFile(root);
  const previous = readJsonFile(streakPath) || {};

  let count = 1;
  if (previous.hash === hash) {
    // A missing or non-numeric stored count is treated as 0.
    count = (Number(previous.count) || 0) + 1;
  }

  try {
    fs.mkdirSync(path.dirname(streakPath), { recursive: true });
    fs.writeFileSync(streakPath, JSON.stringify({ hash, count }));
  } catch {
    /* best effort */
  }
  return count;
}
1740
+
1741
/**
 * Stop-hook entry point: analyse the repository's workflow artifacts and decide
 * whether the Stop may proceed.
 *
 * Outcomes:
 *  - mode "off", or no warnings: returns `rawInput` unchanged (a plain string).
 *  - warnings but not blocking (or mode is not "block"): exit 0, warnings on stderr.
 *  - blocking, below the max-blocks cap: exit 2 with a "BLOCK n/max" notice.
 *  - blocking, cap reached, a HARD_BLOCK warning present: exit 2, never auto-released.
 *  - blocking, cap reached, no hard block: streak cleared, exit 0 with a RELEASED notice.
 *
 * @param {string} rawInput - Raw hook payload read from stdin (JSON text).
 * @returns {Promise<string|{stdout: string, stderr: string, exitCode: number}>}
 */
async function run(rawInput) {
  const input = parseJson(rawInput);
  const root = findRepoRoot(input.cwd || process.cwd());

  const mode = resolveGoalFitMode();
  if (mode === 'off') return rawInput;

  const result = await analyze(root);
  const { warnings } = result;
  if (warnings.length === 0) {
    clearBlockStreak(root);
    return rawInput;
  }

  const message = ['[Hook] Goal Fit warning:', ...warnings.map(w => ` - ${w}`)].join('\n');
  // Every structured reply echoes the raw input on stdout; only stderr and the exit code vary.
  const reply = (stderr, exitCode) => ({ stdout: rawInput, stderr, exitCode });

  const shouldBlock = mode === 'block' && result.blocking;
  if (!shouldBlock) {
    clearBlockStreak(root);
    return reply(message, 0);
  }

  const maxBlocks = resolveMaxBlocks();
  const count = bumpBlockStreak(root, reasonsHash(warnings));
  if (count < maxBlocks) {
    return reply(`${message}\n[Hook] Goal Fit BLOCK ${count}/${maxBlocks}.`, 2);
  }

  // AC2: never auto-release a HARD block (caught false-completion, capture contradiction,
  // tamper signal, gate misconfiguration, integrity failure). An agent burning through
  // N identical hard blocks to escape via the streak counter must not work.
  //
  // BAR-RAISER, NOT AIRTIGHT: an agent with shell access can still reset the streak
  // by deleting .flow-agents/.goal-fit-block-streak.json or by modifying the warning
  // text so the hash changes. The real anchor is external (signed checkpoints + human
  // review). This raises the cost of the burn-through-the-counter escape vector.
  const isHardBlock = warnings.some(
    w => !/\[backstop in warn mode — not blocking\]/.test(w) && HARD_BLOCK.test(w),
  );
  if (isHardBlock) {
    // Do NOT clear the streak — keep accumulating so the same hard block stays visible.
    return reply(
      `${message}\n[Hook] Goal Fit: max-blocks reached but the block is a caught false-completion / integrity failure — not auto-releasing; requires a real fix or operator override.`,
      2,
    );
  }

  clearBlockStreak(root);
  return reply(
    `${message}\n[Hook] Goal Fit block RELEASED after ${count} consecutive identical blocks (FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=${maxBlocks}): the same gap persists, surfacing to the human instead of looping.`,
    0,
  );
}
367
1798
 
@@ -372,14 +1803,28 @@ if (require.main === module) {
372
1803
  if (data.length < MAX_STDIN) data += chunk.substring(0, MAX_STDIN - data.length);
373
1804
  });
374
1805
  process.stdin.on('end', () => {
375
- const output = run(data);
376
- if (output && typeof output === 'object') {
377
- if (output.stderr) process.stderr.write(output.stderr.endsWith('\n') ? output.stderr : `${output.stderr}\n`);
378
- process.stdout.write(String(output.stdout ?? data));
379
- process.exit(Number.isInteger(output.exitCode) ? output.exitCode : 0);
380
- }
381
- process.stdout.write(String(output));
1806
+ // run() is now async (Surface load). We wrap in an async IIFE so the
1807
+ // stdin/exit flow is preserved and errors are surfaced as warnings (fail-open).
1808
+ (async () => {
1809
+ let output;
1810
+ try {
1811
+ output = await run(data);
1812
+ } catch (err) {
1813
+ // Unexpected failure in the async gate path — fail-open, allow the Stop.
1814
+ process.stderr.write(`[Hook] Goal Fit async error (fail-open): ${String(err && err.message || err)}\n`);
1815
+ process.stdout.write(data);
1816
+ process.exit(0);
1817
+ return;
1818
+ }
1819
+ if (output && typeof output === 'object') {
1820
+ if (output.stderr) process.stderr.write(output.stderr.endsWith('\n') ? output.stderr : `${output.stderr}\n`);
1821
+ process.stdout.write(String(output.stdout ?? data));
1822
+ process.exit(Number.isInteger(output.exitCode) ? output.exitCode : 0);
1823
+ return;
1824
+ }
1825
+ process.stdout.write(String(output));
1826
+ })();
382
1827
  });
383
1828
  }
384
1829
 
385
- module.exports = { analyze, run, uncheckedInSection, findRepoRoot, sidecarGuidance, safeOneLine };
1830
+ module.exports = { analyze, run, resolveGoalFitMode, uncheckedInSection, findRepoRoot, sidecarGuidance, safeOneLine, captureCrossReference, bundleEnforcement, loadActiveFlowStep, readCommandLog, resolveTrustedCommand, declaredManifestTarget, verifyCommandLogChain, CHAIN_GENESIS_VERIFY, hasLaunderingOperator };