@kontourai/flow-agents 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/.github/CODEOWNERS +29 -0
  2. package/.github/actions/trust-verify/action.yml +145 -0
  3. package/.github/workflows/ci.yml +11 -4
  4. package/.github/workflows/kit-gates-demo.yml +2 -2
  5. package/.github/workflows/publish-npm.yml +10 -2
  6. package/.github/workflows/release-please.yml +1 -1
  7. package/.github/workflows/trust-reconcile.yml +113 -0
  8. package/AGENTS.md +13 -0
  9. package/CHANGELOG.md +95 -0
  10. package/CONTRIBUTING.md +4 -4
  11. package/README.md +1 -0
  12. package/agents/tool-planner.json +1 -1
  13. package/build/src/cli/init.js +242 -20
  14. package/build/src/cli/validate-workflow-artifacts.js +19 -2
  15. package/build/src/cli/verify.d.ts +1 -0
  16. package/build/src/cli/verify.js +90 -0
  17. package/build/src/cli/workflow-sidecar.d.ts +300 -8
  18. package/build/src/cli/workflow-sidecar.js +1934 -83
  19. package/build/src/cli.js +2 -3
  20. package/build/src/lib/flow-resolver.d.ts +82 -0
  21. package/build/src/lib/flow-resolver.js +237 -0
  22. package/build/src/tools/build-universal-bundles.js +34 -22
  23. package/build/src/tools/generate-context-map.js +3 -16
  24. package/build/src/tools/validate-source-tree.d.ts +1 -1
  25. package/build/src/tools/validate-source-tree.js +42 -162
  26. package/context/contracts/artifact-contract.md +10 -0
  27. package/context/contracts/delivery-contract.md +1 -0
  28. package/context/contracts/review-contract.md +1 -0
  29. package/context/contracts/verification-contract.md +2 -0
  30. package/context/gate-awareness.md +39 -0
  31. package/context/scripts/hooks/stop-goal-fit.js +632 -70
  32. package/docs/adr/0001-flow-agents-consumes-flow.md +1 -1
  33. package/docs/adr/0002-flow-kits-as-extension-unit.md +1 -1
  34. package/docs/adr/0004-gates-expect-surface-claims.md +2 -0
  35. package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +2 -0
  36. package/docs/adr/0007-skill-audit.md +1 -1
  37. package/docs/adr/0009-canonical-hook-core-kit-boundary.md +95 -0
  38. package/docs/adr/0010-workflow-trust-state-as-hachure-bundle.md +139 -0
  39. package/docs/adr/0011-mcp-posture.md +100 -0
  40. package/docs/adr/0012-agent-coordination-as-liveness-claims.md +119 -0
  41. package/docs/adr/0013-context-lifecycle.md +151 -0
  42. package/docs/adr/0014-core-vs-domain-kit-boundary.md +143 -0
  43. package/docs/adr/0015-flow-flow-agents-boundary-reconciliation.md +120 -0
  44. package/docs/adr/0016-three-hard-boundary-model.md +71 -0
  45. package/docs/adr/0017-anti-gaming-trust-security-model.md +155 -0
  46. package/docs/agent-system-guidebook.md +5 -12
  47. package/docs/context-map.md +4 -10
  48. package/docs/index.md +3 -2
  49. package/docs/integrations/framework-adapter.md +19 -6
  50. package/docs/integrations/index.md +2 -2
  51. package/docs/north-star.md +4 -4
  52. package/docs/operating-layers.md +3 -3
  53. package/docs/plans/adr-0010-phase2-gate-recompute.md +55 -0
  54. package/docs/repository-structure.md +2 -2
  55. package/docs/skills-map.md +1 -0
  56. package/docs/spec/runtime-hook-surface.md +62 -9
  57. package/docs/standards-register.md +3 -3
  58. package/docs/survey-utterance-check.md +1 -1
  59. package/docs/trust-anchor-adoption.md +197 -0
  60. package/docs/verifiable-trust.md +95 -0
  61. package/docs/veritas-integration.md +2 -2
  62. package/docs/workflow-usage-guide.md +69 -0
  63. package/evals/acceptance/DEMO-false-completion.md +144 -0
  64. package/evals/acceptance/demo-cast.sh +92 -0
  65. package/evals/acceptance/demo-false-completion.sh +72 -0
  66. package/evals/acceptance/demo-real-evidence.sh +104 -0
  67. package/evals/acceptance/demo.tape +29 -0
  68. package/evals/acceptance/prove-capture-teeth-declared.sh +335 -0
  69. package/evals/acceptance/prove-capture-teeth.sh +114 -0
  70. package/evals/acceptance/prove-teeth.sh +105 -0
  71. package/evals/ci/antigaming-suite.sh +54 -0
  72. package/evals/ci/run-baseline.sh +2 -0
  73. package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/flows/review.flow.json +26 -0
  74. package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/kit.json +20 -0
  75. package/evals/fixtures/flow-kit-repository/valid-unknown-extension/flows/review.flow.json +26 -0
  76. package/evals/fixtures/flow-kit-repository/valid-unknown-extension/kit.json +18 -0
  77. package/evals/integration/test_builder_step_producers.sh +379 -0
  78. package/evals/integration/test_bundle_install.sh +35 -71
  79. package/evals/integration/test_bundle_lifecycle.sh +39 -2
  80. package/evals/integration/test_captured_fail_reconciliation.sh +820 -0
  81. package/evals/integration/test_checkpoint_signing.sh +489 -0
  82. package/evals/integration/test_claim_lookup.sh +352 -0
  83. package/evals/integration/test_command_log_integrity.sh +275 -0
  84. package/evals/integration/test_context_map.sh +0 -2
  85. package/evals/integration/test_dual_emit_flow_step.sh +278 -0
  86. package/evals/integration/test_enforcer_expects_driven.sh +281 -0
  87. package/evals/integration/test_evidence_capture_hook.sh +185 -0
  88. package/evals/integration/test_flow_kit_repository.sh +2 -0
  89. package/evals/integration/test_flowdef_session_activation.sh +273 -0
  90. package/evals/integration/test_flowdef_session_history_preservation.sh +250 -0
  91. package/evals/integration/test_gate_bypass_chain.sh +448 -0
  92. package/evals/integration/test_gate_lockdown.sh +1137 -0
  93. package/evals/integration/test_gate_review_inquiry_records.sh +399 -0
  94. package/evals/integration/test_goal_fit_escape_hatch.sh +73 -0
  95. package/evals/integration/test_goal_fit_hook.sh +69 -4
  96. package/evals/integration/test_goal_fit_rederive.sh +263 -0
  97. package/evals/integration/test_install_merge.sh +1176 -0
  98. package/evals/integration/test_mint_attestation.sh +373 -0
  99. package/evals/integration/test_phase_map_and_gate_claim.sh +365 -0
  100. package/evals/integration/test_publish_delivery.sh +269 -0
  101. package/evals/integration/test_reconcile_soundness.sh +528 -0
  102. package/evals/integration/test_resolvefirststep_security.sh +208 -0
  103. package/evals/integration/test_session_resume_roundtrip.sh +286 -0
  104. package/evals/integration/test_trust_checkpoint.sh +325 -0
  105. package/evals/integration/test_trust_reconcile.sh +293 -0
  106. package/evals/integration/test_verify_cli.sh +208 -0
  107. package/evals/integration/test_workflow_sidecar_writer.sh +549 -34
  108. package/evals/lib/node.sh +0 -6
  109. package/evals/run.sh +45 -0
  110. package/evals/static/test_workflow_skills.sh +6 -13
  111. package/install.sh +0 -7
  112. package/integrations/strands-ts/README.md +25 -15
  113. package/integrations/veritas/flow-agents.adapter.json +1 -2
  114. package/kits/builder/flows/build.flow.json +59 -12
  115. package/kits/builder/kit.json +85 -15
  116. package/kits/builder/skills/continue-work/SKILL.md +116 -0
  117. package/kits/builder/skills/deliver/SKILL.md +36 -6
  118. package/kits/builder/skills/design-probe/SKILL.md +28 -0
  119. package/kits/builder/skills/execute-plan/SKILL.md +9 -1
  120. package/kits/builder/skills/gate-review/SKILL.md +234 -0
  121. package/kits/builder/skills/learning-review/SKILL.md +30 -0
  122. package/kits/builder/skills/pickup-probe/SKILL.md +29 -0
  123. package/kits/builder/skills/plan-work/SKILL.md +13 -1
  124. package/kits/builder/skills/pull-work/SKILL.md +19 -0
  125. package/kits/knowledge/adapters/default-store/index.js +38 -0
  126. package/kits/knowledge/adapters/flow-runner/index.js +1620 -0
  127. package/kits/knowledge/adapters/obsidian-store/index.js +36 -6
  128. package/kits/knowledge/docs/store-contract.md +314 -0
  129. package/kits/knowledge/evals/audit-freshness/suite.test.js +368 -0
  130. package/kits/knowledge/evals/canonicalize-category/suite.test.js +383 -0
  131. package/kits/knowledge/evals/contract-suite/suite.test.js +111 -0
  132. package/kits/knowledge/evals/detect-contradictions/suite.test.js +324 -0
  133. package/kits/knowledge/evals/entities/suite.test.js +40 -0
  134. package/kits/knowledge/evals/glossary-sync/suite.test.js +416 -0
  135. package/kits/knowledge/evals/hygiene-review/suite.test.js +396 -0
  136. package/kits/knowledge/evals/retirement/suite.test.js +145 -0
  137. package/kits/knowledge/flows/audit-freshness.flow.json +44 -0
  138. package/kits/knowledge/flows/canonicalize-category.flow.json +44 -0
  139. package/kits/knowledge/flows/detect-contradictions.flow.json +44 -0
  140. package/kits/knowledge/flows/glossary-sync.flow.json +61 -0
  141. package/kits/knowledge/flows/hygiene-review.flow.json +43 -0
  142. package/kits/knowledge/kit.json +51 -1
  143. package/package.json +4 -4
  144. package/packaging/conformance/README.md +10 -2
  145. package/packaging/conformance/fixtures/evidence-capture--allow-records-command.json +29 -0
  146. package/packaging/conformance/fixtures/stop-goal-fit--block-bundle-disputed-claim.json +29 -0
  147. package/packaging/conformance/fixtures/stop-goal-fit--block-capture-contradicts-claimed-pass.json +30 -0
  148. package/packaging/conformance/fixtures/stop-goal-fit--block-mode.json +23 -0
  149. package/packaging/conformance/fixtures/stop-goal-fit--off-mode.json +24 -0
  150. package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +5 -2
  151. package/packaging/conformance/fixtures/stop-goal-fit--warn-no-bundle.json +23 -0
  152. package/packaging/conformance/fixtures/workflow-steering--reground-active-prompt.json +30 -0
  153. package/packaging/conformance/fixtures/workflow-steering--reground-session-start.json +30 -0
  154. package/packaging/conformance/run-conformance.js +1 -1
  155. package/scripts/README.md +2 -1
  156. package/scripts/build-universal-bundles.js +0 -1
  157. package/scripts/ci/mint-attestation.js +221 -0
  158. package/scripts/ci/trust-reconcile.js +545 -0
  159. package/scripts/hooks/config-protection.js +423 -1
  160. package/scripts/hooks/evidence-capture.js +348 -0
  161. package/scripts/hooks/lib/liveness-read.js +113 -0
  162. package/scripts/hooks/run-hook.js +6 -1
  163. package/scripts/hooks/stop-goal-fit.js +1471 -79
  164. package/scripts/hooks/workflow-steering.js +135 -5
  165. package/scripts/install-codex-home.sh +39 -0
  166. package/scripts/install-merge.js +330 -0
  167. package/src/cli/init.ts +218 -20
  168. package/src/cli/validate-workflow-artifacts.ts +18 -2
  169. package/src/cli/verify.ts +100 -0
  170. package/src/cli/workflow-sidecar.ts +2064 -77
  171. package/src/cli.ts +2 -3
  172. package/src/lib/flow-resolver.ts +284 -0
  173. package/src/tools/build-universal-bundles.ts +34 -21
  174. package/src/tools/generate-context-map.ts +3 -17
  175. package/src/tools/validate-source-tree.ts +44 -104
  176. package/build/src/tools/filter-installed-packs.d.ts +0 -2
  177. package/build/src/tools/filter-installed-packs.js +0 -135
  178. package/packaging/packs.json +0 -49
  179. package/scripts/filter-installed-packs.js +0 -2
  180. package/src/tools/filter-installed-packs.ts +0 -132
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env bash
2
+ # test_evidence_capture_hook.sh — Capture-first evidence determinism contracts.
3
+ #
4
+ # Part A: evidence-capture.js deterministically records command executions to
5
+ # .flow-agents/<slug>/command-log.jsonl (machine-recorded, not model-claimed).
6
+ # Part B: stop-goal-fit.js cross-references evidence.json claimed-pass command
7
+ # checks against the capture log, and re-runs a TRUSTED backstop command
8
+ # only when the log has no execution for a claimed-pass command.
9
+ set -uo pipefail
10
+
11
+ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
12
+ CAPTURE="$ROOT/scripts/hooks/evidence-capture.js"
13
+ GATE="$ROOT/scripts/hooks/stop-goal-fit.js"
14
+
15
+ # Disable the block escape hatch so repeated independent assertions never trip it.
16
+ export FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000
17
+
18
+ TMP="$(mktemp -d)"
19
+ errors=0
20
+ _pass() { echo " ✓ $1"; }
21
+ _fail() { echo " ✗ $1"; errors=$((errors + 1)); }
22
+
23
+ # ---- helpers -------------------------------------------------------------
24
+ seed_repo() { # $1 dir, $2 slug
25
+ local p="$1" slug="$2"
26
+ mkdir -p "$p/.flow-agents/$slug"
27
+ printf '# Repo\n' > "$p/AGENTS.md"
28
+ printf '%s' "{\"schema_version\":\"1.0\",\"task_slug\":\"$slug\",\"status\":\"delivered\",\"phase\":\"done\",\"updated_at\":\"2026-06-23T00:00:00Z\",\"next_action\":{\"status\":\"done\",\"summary\":\"done\"}}" > "$p/.flow-agents/$slug/state.json"
29
+ cat > "$p/.flow-agents/$slug/$slug--deliver.md" <<MD
30
+ # $slug
31
+
32
+ branch: main
33
+ status: delivered
34
+ type: deliver
35
+
36
+ ## Definition Of Done
37
+ - [x] tests pass
38
+
39
+ ## Goal Fit Gate
40
+ - [x] acceptance verified
41
+
42
+ ### Verdict: PASS
43
+ MD
44
+ }
45
+
46
+ capture() { # stdin = payload json
47
+ node "$CAPTURE" >/dev/null 2>&1
48
+ }
49
+
50
+ # ============================================================================
51
+ # Part A — deterministic capture
52
+ # ============================================================================
53
+ A="$TMP/capture"; seed_repo "$A" t1
54
+ echo "Part A: deterministic capture"
55
+
56
+ printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"npm test"},"tool_response":{"exitCode":0,"stdout":"ok"}}' "$A" | capture
57
+ printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"npm run lint"},"error":"command failed"}' "$A" | capture
58
+ printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"make build"},"tool_response":{"exit_code":2}}' "$A" | capture
59
+ # A non-command tool (Write) must NOT be captured.
60
+ printf '{"hook_event_name":"PostToolUse","tool_name":"Write","cwd":"%s","tool_input":{"file_path":"/tmp/x"}}' "$A" | capture
61
+
62
+ LOG="$A/.flow-agents/t1/command-log.jsonl"
63
+ if [[ -f "$LOG" ]]; then _pass "capture writes command-log.jsonl"; else _fail "capture did not write command-log.jsonl"; fi
64
+
65
+ lines=$(wc -l < "$LOG" | tr -d ' ')
66
+ if [[ "$lines" == "3" ]]; then _pass "capture records 3 command executions (Write tool excluded)"; else _fail "expected 3 log lines, got $lines"; fi
67
+
68
+ if rg -q '"command":"npm test","observedResult":"pass","exitCode":0' "$LOG"; then
69
+ _pass "clean exit 0 recorded as observedResult:pass exitCode:0"
70
+ else _fail "passing command not recorded correctly: $(cat "$LOG")"; fi
71
+
72
+ if rg -q '"command":"npm run lint","observedResult":"fail","exitCode":null' "$LOG"; then
73
+ _pass "error field with no exit code recorded as fail exitCode:null"
74
+ else _fail "errored command not recorded correctly"; fi
75
+
76
+ if rg -q '"command":"make build","observedResult":"fail","exitCode":2' "$LOG"; then
77
+ _pass "non-zero exit recorded as fail with exitCode"
78
+ else _fail "non-zero-exit command not recorded correctly"; fi
79
+
80
+ if rg -q '"source":"postToolUse-capture"' "$LOG"; then _pass "records source:postToolUse-capture"; else _fail "missing source field"; fi
81
+
82
+ # Capture is non-blocking: it always exits 0 and echoes stdin.
83
+ out=$(printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"echo hi"},"error":"boom"}' "$A" | node "$CAPTURE"; echo "EXIT=$?")
84
+ if rg -q 'EXIT=0' <<<"$out" && rg -q 'echo hi' <<<"$out"; then
85
+ _pass "capture is non-blocking (exit 0, echoes stdin) even on a failing command"
86
+ else _fail "capture should be non-blocking and echo stdin"; fi
87
+
88
+ # ============================================================================
89
+ # Part B1 — gate cross-references log: claimed pass but log shows FAIL → block
90
+ # ============================================================================
91
+ echo "Part B1: log contradicts claimed pass → block"
92
+ B="$TMP/contradict"; seed_repo "$B" t1
93
+ printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"unit-tests","kind":"command","status":"pass","command":"npm test","summary":"tests passed"}]}' > "$B/.flow-agents/t1/evidence.json"
94
+ printf '%s\n' '{"command":"npm test","observedResult":"fail","exitCode":1,"capturedAt":"2026-06-23T00:00:00Z","source":"postToolUse-capture"}' > "$B/.flow-agents/t1/command-log.jsonl"
95
+
96
+ if FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip node "$GATE" >/dev/null 2>"$TMP/b1.err" <<JSON
97
+ {"hook_event_name":"Stop","cwd":"$B"}
98
+ JSON
99
+ then _fail "gate should BLOCK when capture log contradicts claimed pass"
100
+ else
101
+ status=$?
102
+ if [[ "$status" -eq 2 ]] && rg -q 'capture log CONTRADICTS claimed pass' "$TMP/b1.err" && rg -q 'caught false-completion' "$TMP/b1.err"; then
103
+ _pass "gate blocks (exit 2) caught false-completion via capture log"
104
+ else _fail "gate returned unexpected result: status=$status output=$(cat "$TMP/b1.err")"; fi
105
+ fi
106
+
107
+ # ============================================================================
108
+ # Part B2 — gate cross-references log: claimed pass and log shows PASS → no re-run
109
+ # ============================================================================
110
+ echo "Part B2: log confirms claimed pass → satisfied, no re-run"
111
+ C="$TMP/confirm"; seed_repo "$C" t1
112
+ printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"unit-tests","kind":"command","status":"pass","command":"npm test","summary":"tests passed"}]}' > "$C/.flow-agents/t1/evidence.json"
113
+ printf '%s\n' '{"command":"npm test","observedResult":"pass","exitCode":0,"capturedAt":"2026-06-23T00:00:00Z","source":"postToolUse-capture"}' > "$C/.flow-agents/t1/command-log.jsonl"
114
+ # A poisoned npm on PATH proves the gate does NOT re-run when the log confirms.
115
+ POISON="$TMP/poison"; mkdir -p "$POISON"
116
+ printf '#!/usr/bin/env bash\necho "npm should not run" >&2\nexit 99\n' > "$POISON/npm"; chmod +x "$POISON/npm"
117
+ PATH="$POISON:$PATH" FLOW_AGENTS_GOAL_FIT_MODE=block node "$GATE" >/dev/null 2>"$TMP/b2.err" <<JSON
118
+ {"hook_event_name":"Stop","cwd":"$C"}
119
+ JSON
120
+ if rg -q 'CONTRADICTS|backstop|npm should not run' "$TMP/b2.err"; then
121
+ _fail "gate should NOT re-run or warn when the capture log confirms the pass: $(cat "$TMP/b2.err")"
122
+ else _pass "gate trusts the log on a confirmed pass and does not re-run the backstop"; fi
123
+
124
+ # ============================================================================
125
+ # Part B3 — never-captured claimed-pass command → trusted backstop re-run (declared manifest target FAILS) → block
126
+ # ============================================================================
127
+ echo "Part B3: never-captured claim → trusted manifest backstop catches a fail"
128
+ D="$TMP/backstop"; seed_repo "$D" t1
129
+ printf '%s' '{"name":"x","scripts":{"test":"exit 7"}}' > "$D/package.json"
130
+ printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"unit-tests","kind":"command","status":"pass","command":"npm test","summary":"tests passed"}]}' > "$D/.flow-agents/t1/evidence.json"
131
+ # command-log.jsonl intentionally absent — the command was never actually run.
132
+
133
+ if FLOW_AGENTS_GOAL_FIT_MODE=block node "$GATE" >/dev/null 2>"$TMP/b3.err" <<JSON
134
+ {"hook_event_name":"Stop","cwd":"$D"}
135
+ JSON
136
+ then _fail "gate should BLOCK when trusted backstop re-run of declared manifest target fails"
137
+ else
138
+ status=$?
139
+ if [[ "$status" -eq 2 ]] && rg -q 'trusted backstop \(manifest\)' "$TMP/b3.err" && rg -q 'FAILED with exit 7' "$TMP/b3.err"; then
140
+ _pass "gate runs trusted declared manifest target as backstop and blocks on its failure"
141
+ else _fail "backstop did not catch declared-target failure: status=$status output=$(cat "$TMP/b3.err")"; fi
142
+ fi
143
+
144
+ # ============================================================================
145
+ # Part B4 — never-captured claim, no trusted command resolves → NOT_VERIFIED (never a silent pass)
146
+ # ============================================================================
147
+ echo "Part B4: never-captured claim, nothing trusted resolves → NOT_VERIFIED"
148
+ E="$TMP/notverified"; seed_repo "$E" t1
149
+ printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"custom","kind":"command","status":"pass","command":"./my-thing.sh","summary":"ran custom"}]}' > "$E/.flow-agents/t1/evidence.json"
150
+
151
+ if FLOW_AGENTS_GOAL_FIT_MODE=block node "$GATE" >/dev/null 2>"$TMP/b4.err" <<JSON
152
+ {"hook_event_name":"Stop","cwd":"$E"}
153
+ JSON
154
+ then _fail "gate should not silently pass an un-captured, un-verifiable claimed-pass command"
155
+ else
156
+ status=$?
157
+ if [[ "$status" -eq 2 ]] && rg -q 'NOT_VERIFIED' "$TMP/b4.err" && rg -q 'no trusted command' "$TMP/b4.err"; then
158
+ _pass "gate records NOT_VERIFIED (never a guess) when no trusted command resolves"
159
+ else _fail "NOT_VERIFIED path returned unexpected result: status=$status output=$(cat "$TMP/b4.err")"; fi
160
+ fi
161
+
162
+ # ============================================================================
163
+ # Part B5 — arbitrary model command is opt-in only (FLOW_AGENTS_GOAL_FIT_RECHECK)
164
+ # ============================================================================
165
+ echo "Part B5: free-form model command re-run is opt-in only"
166
+ F="$TMP/recheck"; seed_repo "$F" t1
167
+ printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"custom","kind":"command","status":"pass","command":"exit 5","summary":"ran custom"}]}' > "$F/.flow-agents/t1/evidence.json"
168
+ # Opt-in ON: the model's free-form "exit 5" is re-run and fails → block.
169
+ if FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_RECHECK=true node "$GATE" >/dev/null 2>"$TMP/b5.err" <<JSON
170
+ {"hook_event_name":"Stop","cwd":"$F"}
171
+ JSON
172
+ then _fail "with RECHECK=true the failing model command should block"
173
+ else
174
+ status=$?
175
+ if [[ "$status" -eq 2 ]] && rg -q 'FLOW_AGENTS_GOAL_FIT_RECHECK' "$TMP/b5.err"; then
176
+ _pass "FLOW_AGENTS_GOAL_FIT_RECHECK=true opts into re-running the model's free-form command"
177
+ else _fail "recheck opt-in path returned unexpected result: status=$status output=$(cat "$TMP/b5.err")"; fi
178
+ fi
179
+
180
+ if [[ "$errors" -eq 0 ]]; then
181
+ echo "Evidence capture hook integration passed."
182
+ exit 0
183
+ fi
184
+ echo "Evidence capture hook integration failed: $errors issue(s)."
185
+ exit 1
@@ -53,6 +53,7 @@ expect_fail() {
53
53
 
54
54
  echo "=== Flow Kit Repository Fixture Checks ==="
55
55
  expect_pass "valid-local-kit"
56
+ expect_pass "valid-unknown-extension"
56
57
  expect_fail "invalid-schema-version" '\.schema_version must be "1\.0"'
57
58
  expect_fail "invalid-missing-schema-version" '\.schema_version must be "1\.0"'
58
59
  expect_fail "invalid-id" '\.id must be a kebab-case string'
@@ -63,6 +64,7 @@ expect_fail "invalid-absolute-path" 'flows\[0\]\.path must be relative'
63
64
  expect_fail "invalid-traversal" "flows\\[0\\]\\.path must not contain"
64
65
  expect_fail "invalid-malformed-json" 'invalid JSON'
65
66
  expect_fail "invalid-asset-section" '\.docs must be a list'
67
+ expect_fail "invalid-missing-extension-asset" 'docs\[0\]\.path points at missing asset'
66
68
  expect_fail "invalid-duplicate-flow" "flows\\[1\\]\\.path duplicates"
67
69
 
68
70
  echo ""
@@ -0,0 +1,273 @@
1
+ #!/usr/bin/env bash
2
+ # test_flowdef_session_activation.sh — Integration eval for ADR 0016 Step 1.
3
+ #
4
+ # Proves that ensure-session --flow-id builder.build activates the FlowDefinition-
5
+ # driven path so producers fire, gates enforce on builder.* claims, and advance-state
6
+ # correctly sets active_step_id via the phase_map at each phase.
7
+ #
8
+ # Tests:
9
+ # 1. ensure-session --flow-id builder.build writes active_flow_id + default
10
+ # active_step_id (pull-work) to current.json.
11
+ # 2. advance-state through phases (planning→execution→verification) sets correct
12
+ # active_step_id via phase_map at each transition.
13
+ # 3. At the verify step, record-gate-claim for tests-evidence produces
14
+ # builder.verify.tests (status=verified) in the bundle — producer fires.
15
+ # 4. A TAMPERED builder.verify.tests bundle at the verify step BLOCKS (exit 2)
16
+ # with the tamper warning naming the declared claimType.
17
+ # 5. Fallback: session without --flow-id produces only workflow.* claims (the
18
+ # retained safety net for non-flow sessions).
19
+ #
20
+ # Deterministic, no model spend, self-cleaning.
21
+ # Usage: bash evals/integration/test_flowdef_session_activation.sh
22
+
23
+ set -uo pipefail
24
+
25
+ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
26
+ source "$ROOT/evals/lib/node.sh"
27
+ GATE="$ROOT/scripts/hooks/stop-goal-fit.js"
28
+
29
+ export FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000
30
+
31
+ TMP="$(mktemp -d)"
32
+ errors=0
33
+
34
+ _pass() { echo " ✓ $1"; }
35
+ _fail() { echo " ✗ $1"; errors=$((errors + 1)); }
36
+
37
+ cleanup() { rm -rf "$TMP"; }
38
+ trap cleanup EXIT
39
+
40
+ WRITER="workflow-sidecar"
41
+
42
+ # ─── TEST 1: ensure-session --flow-id activates the flow ─────────────────────
43
+ echo ""
44
+ echo "=== 1. ensure-session --flow-id builder.build activates FlowDefinition-driven path ==="
45
+
46
+ MAIN_AROOT="$TMP/main-aroot"
47
+ SLUG="activation-test"
48
+ SESSION_DIR="$MAIN_AROOT/$SLUG"
49
+ mkdir -p "$MAIN_AROOT"
50
+
51
+ flow_agents_node "$WRITER" ensure-session \
52
+ --artifact-root "$MAIN_AROOT" \
53
+ --task-slug "$SLUG" \
54
+ --title "Step 1 activation test" \
55
+ --summary "Test that --flow-id builder.build activates the FlowDefinition-driven path." \
56
+ --criterion "All gates produce declared claims" \
57
+ --flow-id builder.build \
58
+ --timestamp "2026-06-01T00:00:00Z" >/dev/null 2>&1
59
+
60
+ node -e "
61
+ const fs = require('fs');
62
+ const c = JSON.parse(fs.readFileSync('$MAIN_AROOT/current.json', 'utf8'));
63
+ if (c.active_flow_id !== 'builder.build') throw new Error('expected active_flow_id=builder.build, got ' + c.active_flow_id);
64
+ if (!c.active_step_id) throw new Error('expected active_step_id to be set (first step default), got ' + c.active_step_id);
65
+ console.log('current.json: active_flow_id=' + c.active_flow_id + ' active_step_id=' + c.active_step_id);
66
+ " 2>&1 \
67
+ && _pass "ensure-session --flow-id builder.build writes active_flow_id + default active_step_id to current.json" \
68
+ || _fail "ensure-session --flow-id builder.build did NOT write active_flow_id to current.json"
69
+
70
+ # ─── TEST 2: advance-state sets active_step_id via phase_map ─────────────────
71
+ echo ""
72
+ echo "=== 2. advance-state through phases sets active_step_id via phase_map ==="
73
+
74
+ flow_agents_node "$WRITER" init-plan "$SESSION_DIR/$SLUG--deliver.md" \
75
+ --source-request "Test" --summary "Testing" \
76
+ --timestamp "2026-06-01T00:00:30Z" >/dev/null 2>&1
77
+
78
+ test_phase_step() {
79
+ local phase="$1" expected_step="$2"
80
+ flow_agents_node "$WRITER" advance-state "$SESSION_DIR" \
81
+ --status in_progress --phase "$phase" \
82
+ --summary "Testing phase $phase." \
83
+ --next-action "Continue." \
84
+ --flow-definition builder.build \
85
+ --timestamp "2026-06-01T00:01:00Z" >/dev/null 2>&1
86
+ local actual
87
+ actual=$(node -e "
88
+ const fs = require('fs');
89
+ const c = JSON.parse(fs.readFileSync('$MAIN_AROOT/current.json', 'utf8'));
90
+ console.log(c.active_step_id || '');
91
+ " 2>/dev/null)
92
+ if [ "$actual" = "$expected_step" ]; then
93
+ _pass "advance-state phase=$phase → active_step_id=$expected_step"
94
+ else
95
+ _fail "advance-state phase=$phase → got active_step_id=$actual (expected $expected_step)"
96
+ fi
97
+ }
98
+
99
+ test_phase_step "planning" "plan"
100
+ test_phase_step "execution" "execute"
101
+ test_phase_step "verification" "verify"
102
+
103
+ # ─── TEST 3: at verify step, record-gate-claim produces builder.verify.tests ──
104
+ echo ""
105
+ echo "=== 3. verify step: producer fires — record-gate-claim produces builder.verify.tests ==="
106
+
107
+ if flow_agents_node "$WRITER" record-gate-claim "$SESSION_DIR" \
108
+ --status pass \
109
+ --summary "All tests pass." \
110
+ --expectation "tests-evidence" \
111
+ --timestamp "2026-06-01T00:02:00Z" >/dev/null 2>&1; then
112
+ _pass "record-gate-claim at verify step succeeds (expectation=tests-evidence)"
113
+ else
114
+ _fail "record-gate-claim at verify step FAILED"
115
+ fi
116
+
117
+ node -e "
118
+ const fs = require('fs');
119
+ const bundlePath = '$SESSION_DIR/trust.bundle';
120
+ if (!fs.existsSync(bundlePath)) throw new Error('trust.bundle not found');
121
+ const bundle = JSON.parse(fs.readFileSync(bundlePath, 'utf8'));
122
+ const declared = (bundle.claims || []).find(c => c.claimType === 'builder.verify.tests');
123
+ if (!declared) throw new Error('MISSING builder.verify.tests; claims: ' + (bundle.claims||[]).map(c=>c.claimType).join(', '));
124
+ if (declared.status !== 'verified') throw new Error('expected status=verified, got ' + declared.status);
125
+ console.log('builder.verify.tests: subjectType=' + declared.subjectType + ' status=' + declared.status + ' value=' + declared.value);
126
+ " 2>&1 \
127
+ && _pass "bundle contains builder.verify.tests (subjectType=flow-step, status=verified, value=pass)" \
128
+ || _fail "bundle missing or incorrect builder.verify.tests claim"
129
+
130
+ # ─── TEST 4: tampered bundle at verify step BLOCKS ────────────────────────────
131
+ echo ""
132
+ echo "=== 4. tamper-blocks: builder.verify.tests — tampered bundle triggers gate exit 2 ==="
133
+
134
+ TAMPER_DIR="$TMP/tamper-verify"
135
+ TAMPER_SLUG="tamper-verify-test"
136
+ mkdir -p "$TAMPER_DIR"
137
+ printf '# Test repo\n' > "$TAMPER_DIR/AGENTS.md"
138
+ mkdir -p "$TAMPER_DIR/.flow-agents/$TAMPER_SLUG"
139
+
140
+ flow_agents_node "$WRITER" ensure-session \
141
+ --artifact-root "$TAMPER_DIR/.flow-agents" \
142
+ --task-slug "$TAMPER_SLUG" \
143
+ --title "Tamper verify test" \
144
+ --summary "Testing tamper detection at verify step." \
145
+ --flow-id builder.build \
146
+ --step-id verify \
147
+ --timestamp "2026-06-01T02:00:00Z" >/dev/null 2>&1
148
+
149
+ flow_agents_node "$WRITER" init-plan "$TAMPER_DIR/.flow-agents/$TAMPER_SLUG/$TAMPER_SLUG--deliver.md" \
150
+ --source-request "Test" --summary "Tamper test" \
151
+ --timestamp "2026-06-01T02:00:00Z" >/dev/null 2>&1
152
+
153
+ flow_agents_node "$WRITER" advance-state "$TAMPER_DIR/.flow-agents/$TAMPER_SLUG" \
154
+ --status in_progress --phase verification \
155
+ --summary "At verify." --next-action "Continue." \
156
+ --flow-definition builder.build \
157
+ --timestamp "2026-06-01T02:00:30Z" >/dev/null 2>&1
158
+
159
+ # Write a TAMPERED trust.bundle: claim c1 and event evt1 are stored as "verified" while the only evidence (ev1) has passing=false and blocking=true.
160
+ python3 - "$TAMPER_DIR/.flow-agents/$TAMPER_SLUG/trust.bundle" << 'PY'
161
+ import json, sys
162
+ bundle = {
163
+ "schemaVersion": 3,
164
+ "source": "flow-agents/workflow-sidecar",
165
+ "claims": [{
166
+ "id": "c1",
167
+ "subjectId": "tamper-verify-test/verify-tests",
168
+ "subjectType": "flow-step",
169
+ "claimType": "builder.verify.tests",
170
+ "fieldOrBehavior": "Tests pass",
171
+ "value": "pass",
172
+ "impactLevel": "high",
173
+ "status": "verified",
174
+ "createdAt": "2026-06-01T02:00:00Z",
175
+ "updatedAt": "2026-06-01T02:00:00Z"
176
+ }],
177
+ "evidence": [{
178
+ "id": "ev1",
179
+ "claimId": "c1",
180
+ "evidenceType": "test_output",
181
+ "method": "validation",
182
+ "sourceRef": "command-log.jsonl",
183
+ "excerptOrSummary": "tests FAILED",
184
+ "observedAt": "2026-06-01T02:00:00Z",
185
+ "collectedBy": "harness",
186
+ "passing": False,
187
+ "blocking": True
188
+ }],
189
+ "policies": [],
190
+ "events": [{
191
+ "id": "evt1",
192
+ "claimId": "c1",
193
+ "status": "verified",
194
+ "actor": "agent",
195
+ "method": "workflow-check",
196
+ "evidenceIds": ["ev1"],
197
+ "createdAt": "2026-06-01T02:00:00Z"
198
+ }]
199
+ }
200
+ with open(sys.argv[1], 'w') as fh: json.dump(bundle, fh)  # close (and flush) the file deterministically
201
+ PY
202
+
203
+ set +e  # errexit off so the gate's non-zero exit status can be captured below
204
+ tamper_out="$(FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
205
+ node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$TAMPER_DIR\"}")"
206
+ tamper_exit="$?"
207
+ set -e
208
+ # Check 1: the gate must exit with status 2 for the tampered bundle.
209
+ if [ "$tamper_exit" -eq 2 ]; then
210
+ _pass "gate BLOCKS tampered builder.verify.tests bundle (exit 2)"
211
+ else
212
+ _fail "gate did NOT block tampered bundle: exit=$tamper_exit"
213
+ fi
214
+ # Check 2: combined stdout/stderr must contain one of the tamper phrases (here-string instead of echo|grep -q).
215
+ if grep -qE "stored status.*does not match recompute|possible tampered bundle|caught false-completion" <<< "$tamper_out"; then
216
+ _pass "gate emits tamper warning for builder.verify.tests"
217
+ else
218
+ _fail "gate tamper warning missing from output: $tamper_out"
219
+ fi
220
+ # Check 3: output must name the claimType; -F matches it literally so the dots are not regex wildcards.
221
+ if grep -qF "builder.verify.tests" <<< "$tamper_out"; then
222
+ _pass "gate tamper warning names declared claimType builder.verify.tests"
223
+ else
224
+ _fail "gate tamper warning does not name builder.verify.tests: $tamper_out"
225
+ fi
226
+
227
+ # ─── TEST 5: Fallback — session without --flow-id (workflow.* only, safety net) ─
228
+ echo ""
229
+ echo "=== 5. Fallback: session without --flow-id produces only workflow.* claims (safety net intact) ==="
230
+
231
+ FALLBACK_AROOT="$TMP/fallback-aroot"
232
+ FALLBACK_SLUG="fallback-test"
233
+ FALLBACK_DIR="$FALLBACK_AROOT/$FALLBACK_SLUG"
234
+ mkdir -p "$FALLBACK_AROOT"
235
+
236
+ flow_agents_node "$WRITER" ensure-session \
237
+ --artifact-root "$FALLBACK_AROOT" \
238
+ --task-slug "$FALLBACK_SLUG" \
239
+ --title "Fallback no-flow test" \
240
+ --summary "No --flow-id: workflow.* fallback is the safety net for non-flow sessions." \
241
+ --timestamp "2026-06-01T10:00:00Z" >/dev/null 2>&1
242
+
243
+ flow_agents_node "$WRITER" init-plan "$FALLBACK_DIR/$FALLBACK_SLUG--deliver.md" \
244
+ --source-request "Test" --summary "Testing fallback." \
245
+ --timestamp "2026-06-01T10:00:00Z" >/dev/null 2>&1
246
+
247
+ flow_agents_node "$WRITER" record-evidence "$FALLBACK_DIR" \
248
+ --verdict pass \
249
+ --check-json '{"id":"fallback-check","kind":"test","status":"pass","summary":"Fallback test passes"}' \
250
+ --timestamp "2026-06-01T10:01:00Z" >/dev/null 2>&1
251
+
252
+ node -e "
253
+ const fs = require('fs');
254
+ const bundle = JSON.parse(fs.readFileSync('$FALLBACK_DIR/trust.bundle', 'utf8'));
255
+ const claims = bundle.claims || [];
256
+ const wfClaim = claims.find(c => c.claimType === 'workflow.check.test');
257
+ const builderClaims = claims.filter(c => c.claimType.startsWith('builder.'));
258
+ if (!wfClaim) throw new Error('MISSING workflow.check.test in fallback session');
259
+ if (builderClaims.length > 0) throw new Error('UNEXPECTED builder.* claims in fallback session: ' + builderClaims.map(c=>c.claimType).join(', '));
260
+ if (wfClaim.id.endsWith('-legacy')) throw new Error('workflow.check.test should not have -legacy suffix when no flow active');
261
+ console.log('fallback: only workflow.check.test present (no builder.* claims, no -legacy suffix)');
262
+ " 2>&1 \
263
+ && _pass "fallback (no --flow-id): only workflow.check.test produced, builder.* absent (producers dormant)" \
264
+ || _fail "fallback (no --flow-id): unexpected claims in trust.bundle"
265
+
266
+ # ─── Summary: exit 0 when $errors is 0, otherwise report the failure count and exit 1 ───
267
+ printf '\n'
268
+ if test "$errors" -eq 0; then
269
+ printf '%s\n' "test_flowdef_session_activation: all checks passed."
270
+ exit 0
271
+ fi
272
+ printf '%s\n' "test_flowdef_session_activation: $errors check(s) FAILED."
273
+ exit 1