@kontourai/flow-agents 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/.github/CODEOWNERS +29 -0
  2. package/.github/actions/trust-verify/action.yml +145 -0
  3. package/.github/workflows/ci.yml +11 -4
  4. package/.github/workflows/kit-gates-demo.yml +2 -2
  5. package/.github/workflows/publish-npm.yml +10 -2
  6. package/.github/workflows/release-please.yml +1 -1
  7. package/.github/workflows/trust-reconcile.yml +113 -0
  8. package/AGENTS.md +13 -0
  9. package/CHANGELOG.md +95 -0
  10. package/CONTRIBUTING.md +4 -4
  11. package/README.md +1 -0
  12. package/agents/tool-planner.json +1 -1
  13. package/build/src/cli/init.js +242 -20
  14. package/build/src/cli/validate-workflow-artifacts.js +19 -2
  15. package/build/src/cli/verify.d.ts +1 -0
  16. package/build/src/cli/verify.js +90 -0
  17. package/build/src/cli/workflow-sidecar.d.ts +300 -8
  18. package/build/src/cli/workflow-sidecar.js +1934 -83
  19. package/build/src/cli.js +2 -3
  20. package/build/src/lib/flow-resolver.d.ts +82 -0
  21. package/build/src/lib/flow-resolver.js +237 -0
  22. package/build/src/tools/build-universal-bundles.js +34 -22
  23. package/build/src/tools/generate-context-map.js +3 -16
  24. package/build/src/tools/validate-source-tree.d.ts +1 -1
  25. package/build/src/tools/validate-source-tree.js +42 -162
  26. package/context/contracts/artifact-contract.md +10 -0
  27. package/context/contracts/delivery-contract.md +1 -0
  28. package/context/contracts/review-contract.md +1 -0
  29. package/context/contracts/verification-contract.md +2 -0
  30. package/context/gate-awareness.md +39 -0
  31. package/context/scripts/hooks/stop-goal-fit.js +632 -70
  32. package/docs/adr/0001-flow-agents-consumes-flow.md +1 -1
  33. package/docs/adr/0002-flow-kits-as-extension-unit.md +1 -1
  34. package/docs/adr/0004-gates-expect-surface-claims.md +2 -0
  35. package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +2 -0
  36. package/docs/adr/0007-skill-audit.md +1 -1
  37. package/docs/adr/0009-canonical-hook-core-kit-boundary.md +95 -0
  38. package/docs/adr/0010-workflow-trust-state-as-hachure-bundle.md +139 -0
  39. package/docs/adr/0011-mcp-posture.md +100 -0
  40. package/docs/adr/0012-agent-coordination-as-liveness-claims.md +119 -0
  41. package/docs/adr/0013-context-lifecycle.md +151 -0
  42. package/docs/adr/0014-core-vs-domain-kit-boundary.md +143 -0
  43. package/docs/adr/0015-flow-flow-agents-boundary-reconciliation.md +120 -0
  44. package/docs/adr/0016-three-hard-boundary-model.md +71 -0
  45. package/docs/adr/0017-anti-gaming-trust-security-model.md +155 -0
  46. package/docs/agent-system-guidebook.md +5 -12
  47. package/docs/context-map.md +4 -10
  48. package/docs/index.md +3 -2
  49. package/docs/integrations/framework-adapter.md +19 -6
  50. package/docs/integrations/index.md +2 -2
  51. package/docs/north-star.md +4 -4
  52. package/docs/operating-layers.md +3 -3
  53. package/docs/plans/adr-0010-phase2-gate-recompute.md +55 -0
  54. package/docs/repository-structure.md +2 -2
  55. package/docs/skills-map.md +1 -0
  56. package/docs/spec/runtime-hook-surface.md +62 -9
  57. package/docs/standards-register.md +3 -3
  58. package/docs/survey-utterance-check.md +1 -1
  59. package/docs/trust-anchor-adoption.md +197 -0
  60. package/docs/verifiable-trust.md +95 -0
  61. package/docs/veritas-integration.md +2 -2
  62. package/docs/workflow-usage-guide.md +69 -0
  63. package/evals/acceptance/DEMO-false-completion.md +144 -0
  64. package/evals/acceptance/demo-cast.sh +92 -0
  65. package/evals/acceptance/demo-false-completion.sh +72 -0
  66. package/evals/acceptance/demo-real-evidence.sh +104 -0
  67. package/evals/acceptance/demo.tape +29 -0
  68. package/evals/acceptance/prove-capture-teeth-declared.sh +335 -0
  69. package/evals/acceptance/prove-capture-teeth.sh +114 -0
  70. package/evals/acceptance/prove-teeth.sh +105 -0
  71. package/evals/ci/antigaming-suite.sh +54 -0
  72. package/evals/ci/run-baseline.sh +2 -0
  73. package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/flows/review.flow.json +26 -0
  74. package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/kit.json +20 -0
  75. package/evals/fixtures/flow-kit-repository/valid-unknown-extension/flows/review.flow.json +26 -0
  76. package/evals/fixtures/flow-kit-repository/valid-unknown-extension/kit.json +18 -0
  77. package/evals/integration/test_builder_step_producers.sh +379 -0
  78. package/evals/integration/test_bundle_install.sh +35 -71
  79. package/evals/integration/test_bundle_lifecycle.sh +39 -2
  80. package/evals/integration/test_captured_fail_reconciliation.sh +820 -0
  81. package/evals/integration/test_checkpoint_signing.sh +489 -0
  82. package/evals/integration/test_claim_lookup.sh +352 -0
  83. package/evals/integration/test_command_log_integrity.sh +275 -0
  84. package/evals/integration/test_context_map.sh +0 -2
  85. package/evals/integration/test_dual_emit_flow_step.sh +278 -0
  86. package/evals/integration/test_enforcer_expects_driven.sh +281 -0
  87. package/evals/integration/test_evidence_capture_hook.sh +185 -0
  88. package/evals/integration/test_flow_kit_repository.sh +2 -0
  89. package/evals/integration/test_flowdef_session_activation.sh +273 -0
  90. package/evals/integration/test_flowdef_session_history_preservation.sh +250 -0
  91. package/evals/integration/test_gate_bypass_chain.sh +448 -0
  92. package/evals/integration/test_gate_lockdown.sh +1137 -0
  93. package/evals/integration/test_gate_review_inquiry_records.sh +399 -0
  94. package/evals/integration/test_goal_fit_escape_hatch.sh +73 -0
  95. package/evals/integration/test_goal_fit_hook.sh +69 -4
  96. package/evals/integration/test_goal_fit_rederive.sh +263 -0
  97. package/evals/integration/test_install_merge.sh +1176 -0
  98. package/evals/integration/test_mint_attestation.sh +373 -0
  99. package/evals/integration/test_phase_map_and_gate_claim.sh +365 -0
  100. package/evals/integration/test_publish_delivery.sh +269 -0
  101. package/evals/integration/test_reconcile_soundness.sh +528 -0
  102. package/evals/integration/test_resolvefirststep_security.sh +208 -0
  103. package/evals/integration/test_session_resume_roundtrip.sh +286 -0
  104. package/evals/integration/test_trust_checkpoint.sh +325 -0
  105. package/evals/integration/test_trust_reconcile.sh +293 -0
  106. package/evals/integration/test_verify_cli.sh +208 -0
  107. package/evals/integration/test_workflow_sidecar_writer.sh +549 -34
  108. package/evals/lib/node.sh +0 -6
  109. package/evals/run.sh +45 -0
  110. package/evals/static/test_workflow_skills.sh +6 -13
  111. package/install.sh +0 -7
  112. package/integrations/strands-ts/README.md +25 -15
  113. package/integrations/veritas/flow-agents.adapter.json +1 -2
  114. package/kits/builder/flows/build.flow.json +59 -12
  115. package/kits/builder/kit.json +85 -15
  116. package/kits/builder/skills/continue-work/SKILL.md +116 -0
  117. package/kits/builder/skills/deliver/SKILL.md +36 -6
  118. package/kits/builder/skills/design-probe/SKILL.md +28 -0
  119. package/kits/builder/skills/execute-plan/SKILL.md +9 -1
  120. package/kits/builder/skills/gate-review/SKILL.md +234 -0
  121. package/kits/builder/skills/learning-review/SKILL.md +30 -0
  122. package/kits/builder/skills/pickup-probe/SKILL.md +29 -0
  123. package/kits/builder/skills/plan-work/SKILL.md +13 -1
  124. package/kits/builder/skills/pull-work/SKILL.md +19 -0
  125. package/kits/knowledge/adapters/default-store/index.js +38 -0
  126. package/kits/knowledge/adapters/flow-runner/index.js +1620 -0
  127. package/kits/knowledge/adapters/obsidian-store/index.js +36 -6
  128. package/kits/knowledge/docs/store-contract.md +314 -0
  129. package/kits/knowledge/evals/audit-freshness/suite.test.js +368 -0
  130. package/kits/knowledge/evals/canonicalize-category/suite.test.js +383 -0
  131. package/kits/knowledge/evals/contract-suite/suite.test.js +111 -0
  132. package/kits/knowledge/evals/detect-contradictions/suite.test.js +324 -0
  133. package/kits/knowledge/evals/entities/suite.test.js +40 -0
  134. package/kits/knowledge/evals/glossary-sync/suite.test.js +416 -0
  135. package/kits/knowledge/evals/hygiene-review/suite.test.js +396 -0
  136. package/kits/knowledge/evals/retirement/suite.test.js +145 -0
  137. package/kits/knowledge/flows/audit-freshness.flow.json +44 -0
  138. package/kits/knowledge/flows/canonicalize-category.flow.json +44 -0
  139. package/kits/knowledge/flows/detect-contradictions.flow.json +44 -0
  140. package/kits/knowledge/flows/glossary-sync.flow.json +61 -0
  141. package/kits/knowledge/flows/hygiene-review.flow.json +43 -0
  142. package/kits/knowledge/kit.json +51 -1
  143. package/package.json +4 -4
  144. package/packaging/conformance/README.md +10 -2
  145. package/packaging/conformance/fixtures/evidence-capture--allow-records-command.json +29 -0
  146. package/packaging/conformance/fixtures/stop-goal-fit--block-bundle-disputed-claim.json +29 -0
  147. package/packaging/conformance/fixtures/stop-goal-fit--block-capture-contradicts-claimed-pass.json +30 -0
  148. package/packaging/conformance/fixtures/stop-goal-fit--block-mode.json +23 -0
  149. package/packaging/conformance/fixtures/stop-goal-fit--off-mode.json +24 -0
  150. package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +5 -2
  151. package/packaging/conformance/fixtures/stop-goal-fit--warn-no-bundle.json +23 -0
  152. package/packaging/conformance/fixtures/workflow-steering--reground-active-prompt.json +30 -0
  153. package/packaging/conformance/fixtures/workflow-steering--reground-session-start.json +30 -0
  154. package/packaging/conformance/run-conformance.js +1 -1
  155. package/scripts/README.md +2 -1
  156. package/scripts/build-universal-bundles.js +0 -1
  157. package/scripts/ci/mint-attestation.js +221 -0
  158. package/scripts/ci/trust-reconcile.js +545 -0
  159. package/scripts/hooks/config-protection.js +423 -1
  160. package/scripts/hooks/evidence-capture.js +348 -0
  161. package/scripts/hooks/lib/liveness-read.js +113 -0
  162. package/scripts/hooks/run-hook.js +6 -1
  163. package/scripts/hooks/stop-goal-fit.js +1471 -79
  164. package/scripts/hooks/workflow-steering.js +135 -5
  165. package/scripts/install-codex-home.sh +39 -0
  166. package/scripts/install-merge.js +330 -0
  167. package/src/cli/init.ts +218 -20
  168. package/src/cli/validate-workflow-artifacts.ts +18 -2
  169. package/src/cli/verify.ts +100 -0
  170. package/src/cli/workflow-sidecar.ts +2064 -77
  171. package/src/cli.ts +2 -3
  172. package/src/lib/flow-resolver.ts +284 -0
  173. package/src/tools/build-universal-bundles.ts +34 -21
  174. package/src/tools/generate-context-map.ts +3 -17
  175. package/src/tools/validate-source-tree.ts +44 -104
  176. package/build/src/tools/filter-installed-packs.d.ts +0 -2
  177. package/build/src/tools/filter-installed-packs.js +0 -135
  178. package/packaging/packs.json +0 -49
  179. package/scripts/filter-installed-packs.js +0 -2
  180. package/src/tools/filter-installed-packs.ts +0 -132
@@ -0,0 +1,352 @@
1
+ #!/usr/bin/env bash
2
+ # test_claim_lookup.sh — Integration tests for the `claim` subcommand (#162).
3
+ #
4
+ # Verifies:
5
+ # AC1: status + value + failing evidence (with execution block) + policy + derivation drilldown
6
+ # AC1: --json flag emits structured ClaimExplanation object
7
+ # AC1: unknown claim id exits 1 with clear error listing available ids
8
+ # AC1: missing bundle exits 1 with clear error
9
+ # AC3: gate-hint in stop-goal-fit.js disputed warning contains workflow:sidecar -- claim
10
+ set -uo pipefail
11
+
12
+ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
13
+ source "$ROOT/evals/lib/node.sh"
14
+
15
# Scratch directory for every artifact this run writes; removed by the EXIT trap.
# Abort early if it cannot be created: every later path is derived from it.
TMPDIR_EVAL="$(mktemp -d)" || { echo "mktemp -d failed" >&2; exit 1; }
errors=0

# Remove the scratch directory.
# `${VAR:?}` refuses to expand to an empty path and `--` stops option parsing,
# so a bad value can never turn this into `rm -rf` on something unintended.
cleanup() { rm -rf -- "${TMPDIR_EVAL:?}"; }
trap cleanup EXIT

# Print a passing assertion line. $1 = description.
_pass() { echo " ✓ $1"; }

# Print a failing assertion line and bump the global failure counter.
# $1 = description.
_fail() { echo " ✗ $1"; errors=$((errors + 1)); }

echo "=== Claim Lookup Tests (issue #162) ==="
25
+
26
+ # ── helpers ──────────────────────────────────────────────────────────────────
27
+
28
# jq_node FILE EXPR
#
# Minimal jq stand-in built on node: parses FILE as JSON, applies EXPR (the
# source text of a JS function taking the parsed document) and prints the result.
#   - boolean / number / string results are printed via String()
#   - any other result is printed as JSON
#   - an undefined or null result prints nothing and exits with status 2
#
# FILE is handed to node through the environment rather than spliced into the
# script text, so paths containing quotes or backslashes cannot break or alter
# the JavaScript. EXPR is still interpolated on purpose: it *is* code.
jq_node() {
  local file="$1" expr="$2"
  JQ_NODE_FILE="$file" node -e "
    const d = JSON.parse(require('fs').readFileSync(process.env.JQ_NODE_FILE, 'utf8'));
    const r = (${expr})(d);
    if (r === undefined || r === null) { process.exit(2); }
    if (typeof r === 'boolean' || typeof r === 'number' || typeof r === 'string') {
      process.stdout.write(String(r) + '\n');
    } else {
      process.stdout.write(JSON.stringify(r) + '\n');
    }"
}
40
+
41
# seed_disputed_bundle DIR SLUG
#
# Creates DIR (if needed) and writes DIR/trust.bundle holding a single DISPUTED
# workflow-check claim for SLUG, together with:
#   - one evidence record carrying a failing execution block (isError, exitCode 1)
#   - one disputed verification event referencing that evidence
#   - the governing policy "policy:workflow.check.test"
# The claim id is "<SLUG>/unit-tests.flow-agents.workflow.unit tests pass".
seed_disputed_bundle() {
  local target="$1"
  local slug="$2"
  local stamp="2026-06-25T00:00:00Z"
  local claim_id="${slug}/unit-tests.flow-agents.workflow.unit tests pass"

  mkdir -p "$target"

  # Unquoted delimiter: ${slug}, ${claim_id} and ${stamp} are expanded below.
  cat > "$target/trust.bundle" <<BUNDLE_JSON
{
  "schemaVersion": 3,
  "source": "claim-lookup-test;statusFunctionVersion=1",
  "claims": [
    {
      "id": "${claim_id}",
      "subjectType": "workflow-check",
      "subjectId": "${slug}/unit-tests",
      "surface": "flow-agents.workflow",
      "claimType": "workflow.check.test",
      "fieldOrBehavior": "unit tests pass",
      "value": "fail",
      "status": "disputed",
      "impactLevel": "high",
      "verificationPolicyId": "policy:workflow.check.test",
      "createdAt": "${stamp}",
      "updatedAt": "${stamp}"
    }
  ],
  "evidence": [
    {
      "id": "ev:${claim_id}",
      "claimId": "${claim_id}",
      "evidenceType": "test_output",
      "label": "npm test output",
      "method": "validation",
      "excerptOrSummary": "8 tests failed",
      "status": "disputed",
      "execution": {
        "runner": "npm test",
        "label": "npm test",
        "isError": true,
        "exitCode": 1
      },
      "sourceRef": "command-log.jsonl",
      "createdAt": "${stamp}"
    }
  ],
  "events": [
    {
      "id": "evt:${claim_id}",
      "claimId": "${claim_id}",
      "status": "disputed",
      "actor": "test",
      "method": "validation",
      "evidenceIds": ["ev:${claim_id}"],
      "createdAt": "${stamp}",
      "verifiedAt": "${stamp}"
    }
  ],
  "policies": [
    {
      "id": "policy:workflow.check.test",
      "claimType": "workflow.check.test",
      "requiredEvidence": ["test_output"],
      "requiredMethods": ["validation"],
      "acceptanceCriteria": ["A verified verification event must support a workflow.check.test claim."],
      "reviewAuthority": "system",
      "validityRule": { "kind": "manual" },
      "stalenessTriggers": [],
      "conflictRules": [],
      "impactLevel": "high"
    }
  ]
}
BUNDLE_JSON
}
115
+
116
# ── Test 1: AC1 — text output has status + value + evidence + policy + drilldown ──
#
# Seeds a disputed bundle, runs `workflow-sidecar claim <id> <dir>` and checks
# the human-readable report section by section. Every needle is a literal, so
# grep runs with -F: otherwise '.' in "policy:workflow.check.test" would match
# any character and weaken the assertion.

echo ""
echo "── Test 1: text output (status + evidence + policy + drilldown) ──"

AC1_DIR="$TMPDIR_EVAL/ac1"
AC1_SLUG="claim-lookup-ac1"
seed_disputed_bundle "$AC1_DIR" "$AC1_SLUG"
AC1_CLAIM_ID="${AC1_SLUG}/unit-tests.flow-agents.workflow.unit tests pass"

AC1_OUT="$TMPDIR_EVAL/ac1.out"
if flow_agents_node workflow-sidecar claim "$AC1_CLAIM_ID" "$AC1_DIR" >"$AC1_OUT" 2>&1; then
  _pass "AC1: claim command exits 0 for known disputed claim"
else
  _fail "AC1: claim command failed: $(cat "$AC1_OUT")"
fi

if grep -qF -- "Status: disputed" "$AC1_OUT"; then
  _pass "AC1: output contains derived status (disputed)"
else
  _fail "AC1: output missing derived status: $(head -3 "$AC1_OUT")"
fi

if grep -qF -- "Value: fail" "$AC1_OUT"; then
  _pass "AC1: output contains raw value"
else
  _fail "AC1: output missing value"
fi

if grep -qF -- "exitCode: 1" "$AC1_OUT" && grep -qF -- "isError: true" "$AC1_OUT"; then
  _pass "AC1: failing evidence execution block shown (exitCode + isError)"
else
  # -E alternation instead of the GNU-only BRE '\|' so the diagnostic also
  # works with non-GNU grep implementations.
  _fail "AC1: execution block missing from evidence output: $(grep -iE "exitCode|isError|Evidence" "$AC1_OUT" || echo '(not found)')"
fi

if grep -qF -- "Governing Policy (policy:workflow.check.test)" "$AC1_OUT"; then
  _pass "AC1: governing policy section present"
else
  _fail "AC1: governing policy section missing"
fi

if grep -qF -- "requiredEvidence:" "$AC1_OUT" \
  && grep -qF -- "acceptanceCriteria:" "$AC1_OUT" \
  && grep -qF -- "reviewAuthority:" "$AC1_OUT"; then
  _pass "AC1: policy fields (requiredEvidence, acceptanceCriteria, reviewAuthority) present"
else
  _fail "AC1: policy fields incomplete: $(grep -E "required|acceptance|review" "$AC1_OUT" || echo '(not found)')"
fi

if grep -qF -- "Derivation Drilldown:" "$AC1_OUT"; then
  _pass "AC1: derivation drilldown section present"
else
  _fail "AC1: derivation drilldown section missing"
fi
168
+
169
# ── Test 2: AC1 — --json flag emits structured ClaimExplanation ──
#
# Runs the same lookup as Test 1 (reuses AC1_CLAIM_ID / AC1_DIR) with --json and
# inspects the parsed object field by field.

echo ""
echo "── Test 2: --json flag emits structured ClaimExplanation object ──"

AC2_JSON="$TMPDIR_EVAL/ac1.json"
AC2_ERR="$TMPDIR_EVAL/ac1.json.err"
# Only stdout goes into the JSON file. Anything the command prints on stderr
# (warnings, notices) would otherwise be mixed into the document and make the
# JSON.parse calls below fail for reasons unrelated to the command's output.
if flow_agents_node workflow-sidecar claim "$AC1_CLAIM_ID" "$AC1_DIR" --json >"$AC2_JSON" 2>"$AC2_ERR"; then
  _pass "AC2: --json exits 0"
else
  _fail "AC2: --json failed: $(cat "$AC2_JSON" "$AC2_ERR")"
fi

# Validate JSON structure. jq_node exits non-zero on null/undefined or on a
# parse error; in that case the field is recorded as the empty string.
FOUND="$(jq_node "$AC2_JSON" 'd => d.found' 2>/dev/null || echo '')"
STATUS="$(jq_node "$AC2_JSON" 'd => d.status' 2>/dev/null || echo '')"
VALUE="$(jq_node "$AC2_JSON" 'd => d.value' 2>/dev/null || echo '')"
HAS_POLICY="$(jq_node "$AC2_JSON" 'd => d.policy !== null && d.policy.id !== undefined' 2>/dev/null || echo '')"
EVIDENCE_LEN="$(jq_node "$AC2_JSON" 'd => d.evidence.length' 2>/dev/null || echo '')"
EXEC_EXITCODE="$(jq_node "$AC2_JSON" 'd => d.evidence[0] && d.evidence[0].execution && d.evidence[0].execution.exitCode' 2>/dev/null || echo '')"
HAS_WHY="$(jq_node "$AC2_JSON" 'd => typeof d.why === "object" && d.why !== null' 2>/dev/null || echo '')"

# _expect_eq ACTUAL EXPECTED PASS_MSG FAIL_MSG
# Explicit if/else rather than `test && _pass || _fail`, which would also run
# _fail whenever _pass itself returned non-zero.
_expect_eq() {
  if [[ "$1" == "$2" ]]; then
    _pass "$3"
  else
    _fail "$4"
  fi
}

_expect_eq "$FOUND" "true" "AC2: found=true in JSON" "AC2: expected found=true, got '$FOUND'"
_expect_eq "$STATUS" "disputed" "AC2: status=disputed in JSON" "AC2: expected status=disputed, got '$STATUS'"
_expect_eq "$VALUE" "fail" "AC2: value=fail in JSON" "AC2: expected value=fail, got '$VALUE'"
_expect_eq "$HAS_POLICY" "true" "AC2: policy object present in JSON" "AC2: policy missing: $HAS_POLICY"
_expect_eq "$EVIDENCE_LEN" "1" "AC2: evidence array has 1 item" "AC2: expected 1 evidence item, got '$EVIDENCE_LEN'"
_expect_eq "$EXEC_EXITCODE" "1" "AC2: evidence[0].execution.exitCode=1 in JSON" "AC2: expected exitCode=1, got '$EXEC_EXITCODE'"
_expect_eq "$HAS_WHY" "true" "AC2: why object present in JSON" "AC2: why object missing"
197
+
198
# ── Test 3: AC1 — unknown id exits 1 with clear error listing available ids ──
#
# Looks up an id that is not in the bundle seeded for Test 1 (AC1_DIR). The
# command must fail, name the unknown id, and list the ids that do exist.

echo ""
echo "── Test 3: unknown claim id → clear error + list of available ids ──"

AC3_OUT="$TMPDIR_EVAL/ac3.out"
ac3_status=0
flow_agents_node workflow-sidecar claim "nonexistent-claim-id" "$AC1_DIR" >"$AC3_OUT" 2>&1 || ac3_status=$?

# Any non-zero status counts as the expected failure.
if (( ac3_status != 0 )); then
  _pass "AC3: exits 1 for unknown claim id"
else
  _fail "AC3: expected exit 1 for unknown claim id but got 0"
fi

if ! grep -q "unknown claim id: nonexistent-claim-id" "$AC3_OUT"; then
  _fail "AC3: error message missing id: $(cat "$AC3_OUT")"
else
  _pass "AC3: error message names the unknown id"
fi

if ! grep -q "Available claim ids" "$AC3_OUT"; then
  _fail "AC3: error does not list available ids: $(cat "$AC3_OUT")"
else
  _pass "AC3: error lists available claim ids"
fi
221
+
222
# ── Test 4: AC1 — missing bundle exits 1 ──
#
# Points the lookup at a directory that was never created, so there is no
# trust.bundle to read. The command must fail and say so.

echo ""
echo "── Test 4: missing bundle → clear error ──"

AC4_OUT="$TMPDIR_EVAL/ac4.out"
if flow_agents_node workflow-sidecar claim "any-id" "$TMPDIR_EVAL/nonexistent" >"$AC4_OUT" 2>&1; then
  _fail "AC4: expected exit 1 for missing bundle but got 0"
else
  _pass "AC4: exits 1 for missing bundle"
fi

# -F: the needle is a literal. As a regex the '.' in "trust.bundle" would match
# any character.
if grep -qF -- "no trust.bundle at" "$AC4_OUT"; then
  _pass "AC4: error message mentions missing trust.bundle"
else
  _fail "AC4: error message missing: $(cat "$AC4_OUT")"
fi
239
+
240
# ── Test 5: AC3 — gate-hint in stop-goal-fit.js warning ──
# The fixture uses an acceptance-criterion claim (not a check claim) so that the
# bundleEnforcement warning is not deduplicated by captureCrossReference.
# FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip prevents backstop re-runs for hermeticity.

echo ""
echo "── Test 5: gate-hint appears in stop-goal-fit.js disputed warning ──"

AC5_PROJ="$TMPDIR_EVAL/gate-hint-proj"
AC5_SLUG="gate-hint-test"
AC5_DIR="$AC5_PROJ/.flow-agents/$AC5_SLUG"
mkdir -p "$AC5_DIR"

# Minimal bundle with one disputed acceptance-criterion claim.
# claimType is workflow.acceptance.criterion (not workflow.check.*), so the
# subjectId won't match any evidence check id and bundleEnforcement isn't deduped.
# Quoted delimiter: the document is written literally, nothing is expanded.
cat > "$AC5_DIR/trust.bundle" <<'TRUST_BUNDLE'
{
  "schemaVersion": 3,
  "source": "claim-lookup-test",
  "claims": [
    {
      "id": "gate-hint-test/AC1.flow-agents.workflow.acceptance criterion verified",
      "subjectType": "workflow-criterion",
      "subjectId": "gate-hint-test/AC1",
      "surface": "flow-agents.workflow",
      "claimType": "workflow.acceptance.criterion",
      "fieldOrBehavior": "acceptance criterion verified",
      "value": "fail",
      "status": "disputed",
      "impactLevel": "high",
      "verificationPolicyId": "policy:workflow.acceptance.criterion",
      "createdAt": "2026-06-25T00:00:00Z",
      "updatedAt": "2026-06-25T00:00:00Z"
    }
  ],
  "evidence": [],
  "events": [
    {
      "id": "evt:gate-hint-test/AC1",
      "claimId": "gate-hint-test/AC1.flow-agents.workflow.acceptance criterion verified",
      "status": "disputed",
      "actor": "test",
      "method": "validation",
      "evidenceIds": [],
      "createdAt": "2026-06-25T00:00:00Z",
      "verifiedAt": "2026-06-25T00:00:00Z"
    }
  ],
  "policies": [
    {
      "id": "policy:workflow.acceptance.criterion",
      "claimType": "workflow.acceptance.criterion",
      "requiredEvidence": ["human_attestation"],
      "acceptanceCriteria": ["A criterion must have a verified event."],
      "reviewAuthority": "system",
      "validityRule": { "kind": "manual" },
      "stalenessTriggers": [],
      "conflictRules": [],
      "impactLevel": "high"
    }
  ]
}
TRUST_BUNDLE

# Workflow state: the task is recorded as delivered / done.
printf '%s\n' \
  '{"schema_version":"1.0","task_slug":"gate-hint-test","status":"delivered","phase":"done","updated_at":"2026-06-25T00:00:00Z","next_action":{"status":"done","summary":"done"}}' \
  > "$AC5_DIR/state.json"

# Deliver artifact with every box ticked and a PASS verdict.
cat > "$AC5_DIR/gate-hint-test--deliver.md" <<'DELIVER_MD'
# Gate Hint Test

branch: main
status: delivered
type: deliver

## Definition Of Done
- [x] all tests pass

## Goal Fit Gate
- [x] criteria verified

### Verdict: PASS
DELIVER_MD
324
+
325
# Feed a Stop hook event for the fixture project to stop-goal-fit.js and capture
# everything it prints. The hook's exit status is deliberately ignored
# (`|| true`): only its output is inspected below.
AC5_OUT="$TMPDIR_EVAL/ac5.out"
# FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip prevents backstop re-runs for hermeticity.
printf '{"hook_event_name":"Stop","cwd":"%s"}' "$AC5_PROJ" \
  | FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip node "$ROOT/scripts/hooks/stop-goal-fit.js" >"$AC5_OUT" 2>&1 || true

# Both needles are literals, so match them with -F; as regexes the '.' in
# "trust.bundle" would match any character.
if grep -qF -- "workflow:sidecar -- claim" "$AC5_OUT"; then
  _pass "AC5: gate-hint 'workflow:sidecar -- claim' appears in stop-goal-fit output"
else
  _fail "AC5: gate-hint missing from stop-goal-fit output: $(cat "$AC5_OUT")"
fi

if grep -qF -- "trust.bundle claim disputed" "$AC5_OUT"; then
  _pass "AC5: disputed warning present in stop-goal-fit output"
else
  _fail "AC5: disputed warning missing: $(cat "$AC5_OUT")"
fi
341
+
342
# ── Results ──────────────────────────────────────────────────────────────────
# Print the failure count and turn it into the script's exit status:
# 0 when nothing failed, 1 otherwise.

printf '\n'
echo "──────────────────────────────────"
echo "claim lookup tests: $((errors)) failed"
if (( errors != 0 )); then
  exit 1
fi
echo "ALL PASSED"
exit 0
@@ -0,0 +1,275 @@
1
+ #!/usr/bin/env bash
2
+ # test_command_log_integrity.sh — Hash-chain tamper-evidence for command-log.jsonl (Increment B2)
3
+ #
4
+ # Proves that command-log.jsonl's hash-chain makes alteration DETECTABLE:
5
+ # 1. Altered entry (flip exitCode/observedResult without fixing hash) → broken → gate blocks.
6
+ # 2. Removed/reordered entry → linkage breaks → broken → gate blocks.
7
+ # 3. Legit untampered chain → ok → normal behavior; genuine fail still caught.
8
+ # 4. Legacy unchained log (pre-B2) → legacy → existing behavior unchanged.
9
+ #
10
+ # HONEST FRAMING (in comments and code): this is tamper-EVIDENCE, not tamper-PROOF.
11
+ # An agent that recomputes the whole chain can forge a valid chain. The real
12
+ # tamper-proof boundary is the signed checkpoint (B1, already merged). This chain
13
+ # raises the local bar and catches casual/accidental tampering and corruption.
14
+ #
15
+ # Usage: bash evals/integration/test_command_log_integrity.sh
16
+ set -uo pipefail
17
+
18
+ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
19
+ CAPTURE="$ROOT/scripts/hooks/evidence-capture.js"
20
+ GATE="$ROOT/scripts/hooks/stop-goal-fit.js"
21
+
22
# Exported so every stop-goal-fit.js invocation in this script inherits it.
export FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000

# Scratch directory holding one throw-away repo per test; removed on exit.
# Abort early if it cannot be created: every later path is derived from it.
TMP="$(mktemp -d)" || { echo "mktemp -d failed" >&2; exit 1; }
errors=0

# Print a passing assertion line. $1 = description.
_pass() { echo " ✓ $1"; }

# Print a failing assertion line and bump the global failure counter.
# $1 = description.
_fail() { echo " ✗ $1"; errors=$((errors + 1)); }

# Remove the scratch directory.
# `${VAR:?}` refuses to expand to an empty path and `--` stops option parsing.
cleanup() { rm -rf -- "${TMP:?}"; }
trap cleanup EXIT
31
+
32
# ── helper: seed a minimal delivered workflow artifact ────────────────────────
# seed_repo DIR SLUG
#
# Lays out a throw-away repo under DIR containing:
#   DIR/AGENTS.md                              one-line placeholder
#   DIR/.flow-agents/SLUG/state.json           state marked delivered / done
#   DIR/.flow-agents/SLUG/SLUG--deliver.md     deliver artifact, Verdict: PASS
seed_repo() { # $1=dir $2=slug
  local repo_root="$1"
  local slug="$2"
  local flow_dir="$repo_root/.flow-agents/$slug"

  mkdir -p "$flow_dir"
  echo '# Repo' > "$repo_root/AGENTS.md"

  # Single-line JSON, written without a trailing newline.
  printf '{"schema_version":"1.0","task_slug":"%s","status":"delivered","phase":"done","updated_at":"2026-06-23T00:00:00Z","next_action":{"status":"done","summary":"done"}}' "$slug" \
    > "$flow_dir/state.json"

  # Unquoted delimiter: ${slug} is expanded into the document title.
  cat > "$flow_dir/${slug}--deliver.md" <<DELIVER_MD
# ${slug}

branch: main
status: delivered
type: deliver

## Definition Of Done
- [x] tests pass

## Goal Fit Gate
- [x] acceptance verified

### Verdict: PASS
DELIVER_MD
}
55
+
56
# write_chained_log REPO_DIR SLUG
#
# Feeds two PostToolUse/Bash hook events, both with cwd=REPO_DIR, to
# evidence-capture.js ($CAPTURE), which is what produces command-log.jsonl:
#   entry 0: `npm test`      exitCode 0
#   entry 1: `npm run lint`  exitCode 1
# The hook's own output is discarded. Nothing is printed; callers compute the
# log path themselves. SLUG is accepted but not used by this function.
write_chained_log() { # $1=repo_dir $2=slug
  local repo="$1" slug="$2"
  local event

  # Entry 0: npm test passes
  printf -v event \
    '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"npm test"},"tool_response":{"exitCode":0,"stdout":"ok"}}' \
    "$repo"
  printf '%s' "$event" | node "$CAPTURE" >/dev/null 2>&1

  # Entry 1: npm run lint FAILS
  printf -v event \
    '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"npm run lint"},"tool_response":{"exitCode":1,"stderr":"lint errors"}}' \
    "$repo"
  printf '%s' "$event" | node "$CAPTURE" >/dev/null 2>&1
}
67
+
68
# ─── Test 1: altered entry detected (flip exitCode/observedResult, keep old hash) ───
#
# Flow: seed repo → write a 2-entry chained log → confirm the chain verifies →
# rewrite entry 1 without touching its hash → confirm the break is reported at
# entry 1.
echo "Test 1: altered entry (flip fail→pass without fixing hash) → broken → gate blocks"

T1="$TMP/t1"; seed_repo "$T1" t1
write_chained_log "$T1" t1

LOG="$T1/.flow-agents/t1/command-log.jsonl"

if [[ -f "$LOG" ]]; then _pass "T1: command-log.jsonl written"; else _fail "T1: command-log.jsonl missing"; fi

# Verify clean chain (before tamper).
# The module path and log directory reach node through the environment instead
# of being spliced into the script text, so a path containing a quote or
# backslash cannot break the JavaScript.
chain_status=$(GATE_MODULE="$GATE" CHAIN_DIR="$T1/.flow-agents/t1" \
  node -e 'const g = require(process.env.GATE_MODULE); const r = g.verifyCommandLogChain(process.env.CHAIN_DIR); console.log(r.status);')
if [[ "$chain_status" == "ok" ]]; then
  _pass "T1: untampered chain verifies as ok"
else
  _fail "T1: expected ok, got $chain_status"
fi

# Tamper: flip entry 1 (lint, FAIL) to look like a PASS — change exitCode and observedResult
# but do NOT update _chain.hash → chain is broken.
python3 - "$LOG" << 'PY'
import json, sys
lines = open(sys.argv[1]).read().strip().split('\n')
e1 = json.loads(lines[1])
e1['exitCode'] = 0  # hide the failure
e1['observedResult'] = 'pass'  # claim it passed
# _chain.hash is NOT updated — deliberate, this is the tamper
lines[1] = json.dumps(e1)
open(sys.argv[1], 'w').write('\n'.join(lines) + '\n')
PY

# Verify broken chain: status plus the index of the first bad entry.
chain_after=$(GATE_MODULE="$GATE" CHAIN_DIR="$T1/.flow-agents/t1" \
  node -e 'const g = require(process.env.GATE_MODULE); const r = g.verifyCommandLogChain(process.env.CHAIN_DIR); console.log(r.status + ":" + r.brokenAt);')
if [[ "$chain_after" == "broken:1" ]]; then
  _pass "T1: tampered entry detected → broken at entry 1"
else
  _fail "T1: expected broken:1, got $chain_after"
fi
106
+
107
# Seed evidence.json claiming npm test passed (the untampered entry)
# The tampered entry (lint) was a FAIL flipped to PASS — so the log now shows a false pass.
# Since chain is broken, gate should block with integrity warning and NOT trust log passes.
printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"npm-test","kind":"command","status":"pass","command":"npm test","summary":"passed"}]}' \
  > "$T1/.flow-agents/t1/evidence.json"

# Run the gate and capture its exit status with `|| gate_exit=$?`.
# This script runs WITHOUT errexit (`set -uo pipefail`). Bracketing the call in
# `set +e` / `set -e` would leave errexit switched ON afterwards, so any
# failing helper in a later test would abort the run before the remaining
# assertions execute.
gate_exit=0
gate_out=$(FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
  node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$T1\"}") || gate_exit=$?

if [[ "$gate_exit" -eq 2 ]]; then
  _pass "T1: gate blocks (exit 2) when chain is broken"
else
  _fail "T1: gate should block on broken chain, exit=$gate_exit output=$gate_out"
fi

# Here-strings instead of `echo … | grep -q`: under pipefail, grep -q exiting
# at the first match can kill echo with SIGPIPE and turn a match into a
# pipeline failure. -F because the needles are literals.
if grep -qF -- "command-log integrity check FAILED" <<< "$gate_out"; then
  _pass "T1: gate emits integrity-failure warning"
else
  _fail "T1: missing integrity-failure warning: $gate_out"
fi

if grep -qF -- "NOT trusted" <<< "$gate_out"; then
  _pass "T1: gate emits 'NOT trusted' signal for claimed passes"
else
  _fail "T1: missing NOT trusted signal: $gate_out"
fi
136
+
137
# ─── Test 2: removed/reordered entry detected ───
#
# Part A swaps the two log entries; part B leaves a single entry whose
# predecessor has been removed. Both must verify as "broken".
echo ""
echo "Test 2: removed/reordered entry → linkage breaks → broken → gate flags it"

T2="$TMP/t2"; seed_repo "$T2" t2
write_chained_log "$T2" t2

LOG2="$T2/.flow-agents/t2/command-log.jsonl"
# NOTE(review): lines_before is not read anywhere in the visible part of this
# script — confirm whether it is still needed.
lines_before=$(wc -l < "$LOG2" | tr -d ' ')

# Reorder: swap entry 0 and entry 1
python3 - "$LOG2" << 'PY'
import sys
lines = open(sys.argv[1]).read().strip().split('\n')
# swap
lines[0], lines[1] = lines[1], lines[0]
open(sys.argv[1], 'w').write('\n'.join(lines) + '\n')
PY

# Module path and log directory are passed through the environment rather than
# interpolated into the JS source, so unusual characters in the paths are safe.
chain_reorder=$(GATE_MODULE="$GATE" CHAIN_DIR="$T2/.flow-agents/t2" \
  node -e 'const g = require(process.env.GATE_MODULE); const r = g.verifyCommandLogChain(process.env.CHAIN_DIR); console.log(r.status);')
if [[ "$chain_reorder" == "broken" ]]; then
  _pass "T2: reordered entries detected → broken"
else
  _fail "T2: expected broken on reorder, got $chain_reorder"
fi

# Removed predecessor: append two more entries to the same log (4 lines in
# total after the reorder above), then rewrite the file so that ONLY the very
# last entry remains. That entry was chained to the entry before it, which is
# now gone, so its prevHash can no longer match.
write_chained_log "$T2" t2
LOG2_FRESH="$T2/.flow-agents/t2/command-log.jsonl"
python3 - "$LOG2_FRESH" << 'PY'
import sys
lines = [l for l in open(sys.argv[1]).read().strip().split('\n') if l.strip()]
# Keep only the last 2 entries (the ones just appended above)
last2 = lines[-2:]
# Drop last2[0] → only last2[1] remains, whose prevHash won't match genesis
open(sys.argv[1], 'w').write(last2[1] + '\n')
PY

chain_delete=$(GATE_MODULE="$GATE" CHAIN_DIR="$T2/.flow-agents/t2" \
  node -e 'const g = require(process.env.GATE_MODULE); const r = g.verifyCommandLogChain(process.env.CHAIN_DIR); console.log(r.status);')
if [[ "$chain_delete" == "broken" ]]; then
  _pass "T2: removed predecessor entry detected → broken (prevHash mismatch)"
else
  _fail "T2: expected broken on removed predecessor, got $chain_delete"
fi
182
+
183
+ # ─── Test 3: legit untampered chain — ok — genuine fail still caught ─────────────────
184
+ echo ""
185
+ echo "Test 3: legit untampered chain → ok → genuine fail still caught (capture-teeth)"
186
+
187
+ T3="$TMP/t3"; seed_repo "$T3" t3
188
+ # Write entry 0 (pass) and entry 1 (fail)
189
+ printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"npm test"},"tool_response":{"exitCode":0}}' "$T3" \
190
+ | node "$CAPTURE" >/dev/null 2>&1
191
+ printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"npm run build"},"tool_response":{"exitCode":1}}' "$T3" \
192
+ | node "$CAPTURE" >/dev/null 2>&1
193
+
194
+ chain_legit=$(node -e "const g = require('$GATE'); const r = g.verifyCommandLogChain('$T3/.flow-agents/t3'); console.log(r.status);")
195
+ if [[ "$chain_legit" == "ok" ]]; then
196
+ _pass "T3: untampered chained log verifies ok"
197
+ else
198
+ _fail "T3: expected ok, got $chain_legit"
199
+ fi
200
+
201
+ # Evidence claims npm run build passed (it actually failed → capture log shows fail → block)
202
+ printf '%s' '{"schema_version":"1.0","task_slug":"t3","verdict":"pass","checks":[{"id":"build","kind":"command","status":"pass","command":"npm run build","summary":"build passed"}]}' \
203
+ > "$T3/.flow-agents/t3/evidence.json"
204
+
205
+ set +e
206
+ gate3_out=$(FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
207
+ node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$T3\"}")
208
+ gate3_exit=$?
209
+ set -e
210
+
211
+ if [[ "$gate3_exit" -eq 2 ]]; then
212
+ _pass "T3: gate blocks on genuine fail caught by capture log (ok chain, capture teeth active)"
213
+ else
214
+ _fail "T3: gate should block on captured fail, exit=$gate3_exit output=$gate3_out"
215
+ fi
216
+
217
+ if echo "$gate3_out" | grep -q "capture log CONTRADICTS claimed pass"; then
218
+ _pass "T3: gate emits capture-log contradicts warning (genuine fail caught)"
219
+ else
220
+ _fail "T3: missing capture-log contradicts warning: $gate3_out"
221
+ fi
222
+
223
+ if ! echo "$gate3_out" | grep -q "command-log integrity check FAILED"; then
224
+ _pass "T3: no false integrity-failure warning for untampered chain"
225
+ else
226
+ _fail "T3: spurious integrity-failure warning emitted: $gate3_out"
227
+ fi
228
+
229
+ # ─── Test 4: backward-compat — legacy unchained log → legacy → existing behavior ────
230
+ echo ""
231
+ echo "Test 4: legacy unchained log (no _chain) → legacy → existing behavior unchanged"
232
+
233
+ T4="$TMP/t4"; seed_repo "$T4" t4
234
+
235
+ # Write a legacy-style log (no _chain field) — exactly like pre-B2 fixtures
236
+ printf '%s\n' '{"command":"npm test","observedResult":"fail","exitCode":1,"capturedAt":"2026-06-23T00:00:00Z","source":"postToolUse-capture"}' \
237
+ > "$T4/.flow-agents/t4/command-log.jsonl"
238
+
239
+ chain_legacy=$(node -e "const g = require('$GATE'); const r = g.verifyCommandLogChain('$T4/.flow-agents/t4'); console.log(r.status);")
240
+ if [[ "$chain_legacy" == "legacy" ]]; then
241
+ _pass "T4: unchained (legacy) log returns legacy status"
242
+ else
243
+ _fail "T4: expected legacy, got $chain_legacy"
244
+ fi
245
+
246
+ # Evidence claims npm test passed, but legacy log shows it failed → still blocks
247
+ printf '%s' '{"schema_version":"1.0","task_slug":"t4","verdict":"pass","checks":[{"id":"unit-tests","kind":"command","status":"pass","command":"npm test","summary":"passed"}]}' \
248
+ > "$T4/.flow-agents/t4/evidence.json"
249
+
250
+ set +e
251
+ gate4_out=$(FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
252
+ node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$T4\"}")
253
+ gate4_exit=$?
254
+ set -e
255
+
256
+ if [[ "$gate4_exit" -eq 2 ]] && echo "$gate4_out" | grep -q "capture log CONTRADICTS"; then
257
+ _pass "T4: legacy log still catches false-completion (existing behavior preserved)"
258
+ else
259
+ _fail "T4: legacy log failed to catch false-completion: exit=$gate4_exit output=$gate4_out"
260
+ fi
261
+
262
+ if ! echo "$gate4_out" | grep -q "command-log integrity check FAILED"; then
263
+ _pass "T4: no integrity-failure warning for legacy (unchained) log"
264
+ else
265
+ _fail "T4: spurious integrity warning for legacy log: $gate4_out"
266
+ fi
267
+
268
+ # ─── Summary ─────────────────────────────────────────────────────────────────
269
+ echo ""
270
+ if [[ "$errors" -eq 0 ]]; then
271
+ echo "command-log integrity tests passed."
272
+ exit 0
273
+ fi
274
+ echo "command-log integrity tests FAILED: $errors issue(s)."
275
+ exit 1
@@ -38,10 +38,8 @@ for expected in \
38
38
  'Support Skills' \
39
39
  'Agents' \
40
40
  'Optional Powers' \
41
- 'Packs' \
42
41
  'Context Loading Rules' \
43
42
  'npm run context-map:check' \
44
- 'packaging/packs.json' \
45
43
  'workflow-release.schema.json' \
46
44
  'workflow-learning.schema.json' \
47
45
  'plan-work' \