@kontourai/flow-agents 1.4.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/CODEOWNERS +29 -0
- package/.github/actions/trust-verify/action.yml +145 -0
- package/.github/workflows/ci.yml +11 -4
- package/.github/workflows/kit-gates-demo.yml +2 -2
- package/.github/workflows/publish-npm.yml +10 -2
- package/.github/workflows/release-please.yml +1 -1
- package/.github/workflows/runtime-compat.yml +1 -1
- package/.github/workflows/trust-reconcile.yml +113 -0
- package/AGENTS.md +13 -0
- package/CHANGELOG.md +103 -0
- package/CONTRIBUTING.md +4 -4
- package/README.md +1 -0
- package/agents/tool-planner.json +1 -1
- package/build/src/cli/init.js +242 -20
- package/build/src/cli/validate-workflow-artifacts.js +19 -2
- package/build/src/cli/verify.d.ts +1 -0
- package/build/src/cli/verify.js +90 -0
- package/build/src/cli/workflow-sidecar.d.ts +316 -8
- package/build/src/cli/workflow-sidecar.js +1996 -91
- package/build/src/cli.js +2 -3
- package/build/src/lib/flow-resolver.d.ts +111 -0
- package/build/src/lib/flow-resolver.js +308 -0
- package/build/src/tools/build-universal-bundles.js +34 -22
- package/build/src/tools/generate-context-map.js +3 -16
- package/build/src/tools/validate-source-tree.d.ts +1 -1
- package/build/src/tools/validate-source-tree.js +42 -162
- package/context/contracts/artifact-contract.md +10 -0
- package/context/contracts/delivery-contract.md +1 -0
- package/context/contracts/review-contract.md +1 -0
- package/context/contracts/verification-contract.md +2 -0
- package/context/gate-awareness.md +39 -0
- package/context/scripts/hooks/stop-goal-fit.js +632 -70
- package/docs/adr/0001-flow-agents-consumes-flow.md +1 -1
- package/docs/adr/0002-flow-kits-as-extension-unit.md +1 -1
- package/docs/adr/0004-gates-expect-surface-claims.md +2 -0
- package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +2 -0
- package/docs/adr/0007-skill-audit.md +1 -1
- package/docs/adr/0009-canonical-hook-core-kit-boundary.md +95 -0
- package/docs/adr/0010-workflow-trust-state-as-hachure-bundle.md +139 -0
- package/docs/adr/0011-mcp-posture.md +100 -0
- package/docs/adr/0012-agent-coordination-as-liveness-claims.md +119 -0
- package/docs/adr/0013-context-lifecycle.md +151 -0
- package/docs/adr/0014-core-vs-domain-kit-boundary.md +143 -0
- package/docs/adr/0015-flow-flow-agents-boundary-reconciliation.md +120 -0
- package/docs/adr/0016-three-hard-boundary-model.md +71 -0
- package/docs/adr/0017-anti-gaming-trust-security-model.md +155 -0
- package/docs/agent-system-guidebook.md +5 -12
- package/docs/context-map.md +4 -10
- package/docs/index.md +3 -2
- package/docs/integrations/framework-adapter.md +19 -6
- package/docs/integrations/index.md +2 -2
- package/docs/north-star.md +4 -4
- package/docs/operating-layers.md +3 -3
- package/docs/plans/adr-0010-phase2-gate-recompute.md +55 -0
- package/docs/repository-structure.md +2 -2
- package/docs/skills-map.md +1 -0
- package/docs/spec/runtime-hook-surface.md +62 -9
- package/docs/standards-register.md +3 -3
- package/docs/survey-utterance-check.md +1 -1
- package/docs/trust-anchor-adoption.md +197 -0
- package/docs/verifiable-trust.md +95 -0
- package/docs/veritas-integration.md +2 -2
- package/docs/workflow-usage-guide.md +69 -0
- package/evals/acceptance/DEMO-false-completion.md +144 -0
- package/evals/acceptance/demo-cast.sh +92 -0
- package/evals/acceptance/demo-false-completion.sh +72 -0
- package/evals/acceptance/demo-real-evidence.sh +104 -0
- package/evals/acceptance/demo.tape +29 -0
- package/evals/acceptance/prove-capture-teeth-declared.sh +335 -0
- package/evals/acceptance/prove-capture-teeth.sh +114 -0
- package/evals/acceptance/prove-teeth.sh +105 -0
- package/evals/ci/antigaming-suite.sh +55 -0
- package/evals/ci/run-baseline.sh +2 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/kit.json +20 -0
- package/evals/fixtures/flow-kit-repository/valid-unknown-extension/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/valid-unknown-extension/kit.json +18 -0
- package/evals/integration/test_builder_step_producers.sh +379 -0
- package/evals/integration/test_bundle_install.sh +35 -71
- package/evals/integration/test_bundle_lifecycle.sh +39 -2
- package/evals/integration/test_captured_fail_reconciliation.sh +820 -0
- package/evals/integration/test_checkpoint_signing.sh +489 -0
- package/evals/integration/test_claim_lookup.sh +352 -0
- package/evals/integration/test_command_log_fork_classification.sh +134 -0
- package/evals/integration/test_command_log_integrity.sh +275 -0
- package/evals/integration/test_context_map.sh +0 -2
- package/evals/integration/test_dual_emit_flow_step.sh +278 -0
- package/evals/integration/test_enforcer_expects_driven.sh +281 -0
- package/evals/integration/test_evidence_capture_hook.sh +185 -0
- package/evals/integration/test_flow_kit_repository.sh +2 -0
- package/evals/integration/test_flowdef_session_activation.sh +273 -0
- package/evals/integration/test_flowdef_session_history_preservation.sh +250 -0
- package/evals/integration/test_gate_bypass_chain.sh +448 -0
- package/evals/integration/test_gate_lockdown.sh +1137 -0
- package/evals/integration/test_gate_review_inquiry_records.sh +399 -0
- package/evals/integration/test_goal_fit_escape_hatch.sh +73 -0
- package/evals/integration/test_goal_fit_hook.sh +69 -4
- package/evals/integration/test_goal_fit_rederive.sh +263 -0
- package/evals/integration/test_install_merge.sh +1176 -0
- package/evals/integration/test_kit_identity_trust.sh +393 -0
- package/evals/integration/test_mint_attestation.sh +373 -0
- package/evals/integration/test_phase_map_and_gate_claim.sh +365 -0
- package/evals/integration/test_publish_delivery.sh +269 -0
- package/evals/integration/test_reconcile_soundness.sh +528 -0
- package/evals/integration/test_resolvefirststep_security.sh +208 -0
- package/evals/integration/test_session_resume_roundtrip.sh +286 -0
- package/evals/integration/test_trust_checkpoint.sh +325 -0
- package/evals/integration/test_trust_reconcile.sh +293 -0
- package/evals/integration/test_verify_cli.sh +208 -0
- package/evals/integration/test_workflow_sidecar_writer.sh +549 -34
- package/evals/lib/node.sh +0 -6
- package/evals/run.sh +47 -0
- package/evals/static/test_workflow_skills.sh +6 -13
- package/install.sh +0 -7
- package/integrations/strands-ts/README.md +25 -15
- package/integrations/veritas/flow-agents.adapter.json +1 -2
- package/kits/builder/flows/build.flow.json +59 -12
- package/kits/builder/kit.json +85 -15
- package/kits/builder/skills/continue-work/SKILL.md +116 -0
- package/kits/builder/skills/deliver/SKILL.md +36 -6
- package/kits/builder/skills/design-probe/SKILL.md +28 -0
- package/kits/builder/skills/execute-plan/SKILL.md +9 -1
- package/kits/builder/skills/gate-review/SKILL.md +234 -0
- package/kits/builder/skills/learning-review/SKILL.md +30 -0
- package/kits/builder/skills/pickup-probe/SKILL.md +29 -0
- package/kits/builder/skills/plan-work/SKILL.md +13 -1
- package/kits/builder/skills/pull-work/SKILL.md +19 -0
- package/kits/knowledge/adapters/default-store/index.js +38 -0
- package/kits/knowledge/adapters/flow-runner/index.js +1620 -0
- package/kits/knowledge/adapters/obsidian-store/index.js +36 -6
- package/kits/knowledge/docs/store-contract.md +314 -0
- package/kits/knowledge/evals/audit-freshness/suite.test.js +368 -0
- package/kits/knowledge/evals/canonicalize-category/suite.test.js +383 -0
- package/kits/knowledge/evals/contract-suite/suite.test.js +111 -0
- package/kits/knowledge/evals/detect-contradictions/suite.test.js +324 -0
- package/kits/knowledge/evals/entities/suite.test.js +40 -0
- package/kits/knowledge/evals/glossary-sync/suite.test.js +416 -0
- package/kits/knowledge/evals/hygiene-review/suite.test.js +396 -0
- package/kits/knowledge/evals/retirement/suite.test.js +145 -0
- package/kits/knowledge/flows/audit-freshness.flow.json +44 -0
- package/kits/knowledge/flows/canonicalize-category.flow.json +44 -0
- package/kits/knowledge/flows/detect-contradictions.flow.json +44 -0
- package/kits/knowledge/flows/glossary-sync.flow.json +61 -0
- package/kits/knowledge/flows/hygiene-review.flow.json +43 -0
- package/kits/knowledge/kit.json +51 -1
- package/package.json +6 -6
- package/packaging/conformance/README.md +10 -2
- package/packaging/conformance/fixtures/evidence-capture--allow-records-command.json +29 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-bundle-disputed-claim.json +29 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-capture-contradicts-claimed-pass.json +30 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-mode.json +23 -0
- package/packaging/conformance/fixtures/stop-goal-fit--off-mode.json +24 -0
- package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +5 -2
- package/packaging/conformance/fixtures/stop-goal-fit--warn-no-bundle.json +23 -0
- package/packaging/conformance/fixtures/workflow-steering--reground-active-prompt.json +30 -0
- package/packaging/conformance/fixtures/workflow-steering--reground-session-start.json +30 -0
- package/packaging/conformance/run-conformance.js +1 -1
- package/scripts/README.md +2 -1
- package/scripts/build-universal-bundles.js +0 -1
- package/scripts/ci/mint-attestation.js +221 -0
- package/scripts/ci/trust-reconcile.js +545 -0
- package/scripts/hooks/config-protection.js +423 -1
- package/scripts/hooks/evidence-capture.js +348 -0
- package/scripts/hooks/lib/liveness-read.js +113 -0
- package/scripts/hooks/run-hook.js +6 -1
- package/scripts/hooks/stop-goal-fit.js +1524 -79
- package/scripts/hooks/workflow-steering.js +135 -5
- package/scripts/install-codex-home.sh +39 -0
- package/scripts/install-merge.js +330 -0
- package/scripts/repair-command-log.js +115 -0
- package/src/cli/init.ts +218 -20
- package/src/cli/validate-workflow-artifacts.ts +18 -2
- package/src/cli/verify.ts +100 -0
- package/src/cli/workflow-sidecar.ts +2127 -84
- package/src/cli.ts +2 -3
- package/src/lib/flow-resolver.ts +369 -0
- package/src/tools/build-universal-bundles.ts +34 -21
- package/src/tools/generate-context-map.ts +3 -17
- package/src/tools/validate-source-tree.ts +44 -104
- package/build/src/tools/filter-installed-packs.d.ts +0 -2
- package/build/src/tools/filter-installed-packs.js +0 -135
- package/packaging/packs.json +0 -49
- package/scripts/filter-installed-packs.js +0 -2
- package/src/tools/filter-installed-packs.ts +0 -132
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# test_enforcer_expects_driven.sh — Integration eval for ADR 0016 Abstraction A P-c.
|
|
3
|
+
#
|
|
4
|
+
# Proves:
|
|
5
|
+
# 1. A TAMPERED declared-type bundle BLOCKS (exit 2) with the tamper/disputed
|
|
6
|
+
# warning. Session has current.json with active_flow_id=builder.build,
|
|
7
|
+
# active_step_id=verify. trust.bundle has a builder.verify.tests claim with
|
|
8
|
+
# stored status "verified" but evidence passing=false (re-derives to disputed).
|
|
9
|
+
# This exercises the expects[] claim-selection path in bundleEnforcement.
|
|
10
|
+
# 2. A CLEAN declared-type bundle PASSES (exit 0). Same session, same claimType,
|
|
11
|
+
# but passing evidence → re-derives to verified.
|
|
12
|
+
# 3. A NO-ACTIVE-FLOW bundle uses the workflow.* fallback (the workflow.check.*
|
|
13
|
+
# path): a disputed workflow.check.command claim still BLOCKS. current.json
|
|
14
|
+
# has no active_flow_id/active_step_id.
|
|
15
|
+
#
|
|
16
|
+
# Deterministic, no model spend, self-cleaning.
|
|
17
|
+
# Usage: bash evals/integration/test_enforcer_expects_driven.sh
|
|
18
|
+
|
|
19
|
+
set -uo pipefail
|
|
20
|
+
|
|
21
|
+
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
22
|
+
GATE="$ROOT/scripts/hooks/stop-goal-fit.js"
|
|
23
|
+
|
|
24
|
+
export FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000
|
|
25
|
+
|
|
26
|
+
TMP="$(mktemp -d)"
|
|
27
|
+
errors=0
|
|
28
|
+
# Print a success marker followed by the assertion description ($1).
_pass() {
  printf ' ✓ %s\n' "$1"
}
|
|
29
|
+
# Print a failure marker followed by the assertion description ($1) and
# increment the global `errors` counter that the summary reads.
_fail() {
  printf ' ✗ %s\n' "$1"
  errors=$(( errors + 1 ))
}
|
|
30
|
+
|
|
31
|
+
# Remove the scratch directory held in the global TMP.
# `--` stops option parsing and `${TMP:?}` aborts instead of running
# `rm -rf` against an empty path if TMP was never populated.
cleanup() { rm -rf -- "${TMP:?}"; }
|
|
32
|
+
trap cleanup EXIT
|
|
33
|
+
|
|
34
|
+
# ─── helper: seed a minimal delivered workflow artifact ───────────────────────
|
|
35
|
+
seed_repo() { # $1=dir $2=slug
|
|
36
|
+
local p="$1" slug="$2"
|
|
37
|
+
mkdir -p "$p/.flow-agents/$slug"
|
|
38
|
+
printf '# Repo\n' > "$p/AGENTS.md"
|
|
39
|
+
printf '%s' "{\"schema_version\":\"1.0\",\"task_slug\":\"$slug\",\"status\":\"delivered\",\"phase\":\"done\",\"updated_at\":\"2026-06-26T00:00:00Z\",\"next_action\":{\"status\":\"done\",\"summary\":\"done\"}}" \
|
|
40
|
+
> "$p/.flow-agents/$slug/state.json"
|
|
41
|
+
cat > "$p/.flow-agents/$slug/$slug--deliver.md" << MD
|
|
42
|
+
# $slug
|
|
43
|
+
|
|
44
|
+
branch: main
|
|
45
|
+
status: delivered
|
|
46
|
+
type: deliver
|
|
47
|
+
|
|
48
|
+
## Definition Of Done
|
|
49
|
+
- [x] tests pass
|
|
50
|
+
|
|
51
|
+
## Goal Fit Gate
|
|
52
|
+
- [x] acceptance verified
|
|
53
|
+
|
|
54
|
+
### Verdict: PASS
|
|
55
|
+
MD
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
# ─── Test 1: TAMPERED declared-type bundle BLOCKS via expects[] path ─────────
|
|
59
|
+
# current.json has active_flow_id=builder.build, active_step_id=verify.
|
|
60
|
+
# The trust.bundle has builder.verify.tests (declared by verify-gate expects[]),
|
|
61
|
+
# stored status "verified" but evidence passing=false → re-derives to "disputed".
|
|
62
|
+
# The enforcer must use the expects[] path and BLOCK with the tamper warning.
|
|
63
|
+
echo "Test 1: tampered declared-type bundle (builder.verify.tests, stored verified, evidence→disputed) must BLOCK via expects[] path"
|
|
64
|
+
|
|
65
|
+
T1_DIR="$TMP/t1"
|
|
66
|
+
seed_repo "$T1_DIR" "declares-tampered"
|
|
67
|
+
|
|
68
|
+
# current.json: active flow
|
|
69
|
+
printf '%s' '{"artifact_dir":"declares-tampered","active_flow_id":"builder.build","active_step_id":"verify"}' \
|
|
70
|
+
> "$T1_DIR/.flow-agents/current.json"
|
|
71
|
+
|
|
72
|
+
python3 - "$T1_DIR/.flow-agents/declares-tampered/trust.bundle" << 'PY'
|
|
73
|
+
import json, sys
|
|
74
|
+
bundle = {
|
|
75
|
+
"schemaVersion": 3,
|
|
76
|
+
"source": "flow-agents/workflow-sidecar",
|
|
77
|
+
"claims": [{
|
|
78
|
+
"id": "c1",
|
|
79
|
+
"subjectId": "declares-tampered/tests",
|
|
80
|
+
"subjectType": "flow-step",
|
|
81
|
+
"claimType": "builder.verify.tests",
|
|
82
|
+
"fieldOrBehavior": "build/verify tests",
|
|
83
|
+
"value": "pass",
|
|
84
|
+
"impactLevel": "high",
|
|
85
|
+
"status": "verified", # tampered: edited from "disputed" → "verified"
|
|
86
|
+
"createdAt": "2026-06-26T00:00:00Z",
|
|
87
|
+
"updatedAt": "2026-06-26T00:00:00Z"
|
|
88
|
+
}],
|
|
89
|
+
"evidence": [{
|
|
90
|
+
"id": "ev1",
|
|
91
|
+
"claimId": "c1",
|
|
92
|
+
"evidenceType": "test_output",
|
|
93
|
+
"method": "validation",
|
|
94
|
+
"sourceRef": "command-log.jsonl",
|
|
95
|
+
"excerptOrSummary": "npm test failed with exit 1",
|
|
96
|
+
"observedAt": "2026-06-26T00:00:00Z",
|
|
97
|
+
"collectedBy": "harness",
|
|
98
|
+
"passing": False,
|
|
99
|
+
"blocking": True
|
|
100
|
+
}],
|
|
101
|
+
"policies": [],
|
|
102
|
+
"events": [{
|
|
103
|
+
"id": "evt1",
|
|
104
|
+
"claimId": "c1",
|
|
105
|
+
"status": "verified",
|
|
106
|
+
"actor": "agent",
|
|
107
|
+
"method": "workflow-check",
|
|
108
|
+
"evidenceIds": ["ev1"],
|
|
109
|
+
"createdAt": "2026-06-26T00:00:00Z"
|
|
110
|
+
}]
|
|
111
|
+
}
|
|
112
|
+
json.dump(bundle, open(sys.argv[1], 'w'))
|
|
113
|
+
PY
|
|
114
|
+
|
|
115
|
+
set +e
|
|
116
|
+
t1_out="$(FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
|
|
117
|
+
node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$T1_DIR\"}")"
|
|
118
|
+
t1_exit="$?"
|
|
119
|
+
# NOTE(review): this script starts with `set -uo pipefail` (no -e), so this line
# does not restore a prior state — it switches errexit ON from here until the
# next `set +e`. Confirm that is intended.
set -e
|
|
120
|
+
|
|
121
|
+
if [ "$t1_exit" -eq 2 ]; then
|
|
122
|
+
_pass "tampered declared-type bundle blocks (exit 2)"
|
|
123
|
+
else
|
|
124
|
+
_fail "tampered declared-type bundle did NOT block: exit=$t1_exit output=$t1_out"
|
|
125
|
+
fi
|
|
126
|
+
|
|
127
|
+
if echo "$t1_out" | grep -qE "stored status.*does not match recompute|possible tampered bundle"; then
|
|
128
|
+
_pass "tampered declared-type bundle emits tamper warning"
|
|
129
|
+
else
|
|
130
|
+
_fail "tampered declared-type bundle missing tamper warning: $t1_out"
|
|
131
|
+
fi
|
|
132
|
+
|
|
133
|
+
if echo "$t1_out" | grep -q "caught false-completion"; then
|
|
134
|
+
_pass "tampered declared-type bundle emits caught false-completion"
|
|
135
|
+
else
|
|
136
|
+
_fail "tampered declared-type bundle missing caught false-completion: $t1_out"
|
|
137
|
+
fi
|
|
138
|
+
|
|
139
|
+
if echo "$t1_out" | grep -q "builder.verify.tests"; then
|
|
140
|
+
_pass "tampered declared-type bundle warning names the declared claimType"
|
|
141
|
+
else
|
|
142
|
+
_fail "tampered declared-type bundle warning does not mention builder.verify.tests: $t1_out"
|
|
143
|
+
fi
|
|
144
|
+
|
|
145
|
+
# ─── Test 2: CLEAN declared-type bundle PASSES ───────────────────────────────
|
|
146
|
+
# Same session, same claimType, but passing evidence → re-derives to verified.
|
|
147
|
+
# Must NOT block.
|
|
148
|
+
echo ""
|
|
149
|
+
echo "Test 2: clean declared-type bundle (builder.verify.tests, passing evidence→verified) must ALLOW"
|
|
150
|
+
|
|
151
|
+
T2_DIR="$TMP/t2"
|
|
152
|
+
seed_repo "$T2_DIR" "declares-clean"
|
|
153
|
+
|
|
154
|
+
printf '%s' '{"artifact_dir":"declares-clean","active_flow_id":"builder.build","active_step_id":"verify"}' \
|
|
155
|
+
> "$T2_DIR/.flow-agents/current.json"
|
|
156
|
+
|
|
157
|
+
python3 - "$T2_DIR/.flow-agents/declares-clean/trust.bundle" << 'PY'
|
|
158
|
+
import json, sys
|
|
159
|
+
bundle = {
|
|
160
|
+
"schemaVersion": 3,
|
|
161
|
+
"source": "flow-agents/workflow-sidecar",
|
|
162
|
+
"claims": [{
|
|
163
|
+
"id": "c2",
|
|
164
|
+
"subjectId": "declares-clean/tests",
|
|
165
|
+
"subjectType": "flow-step",
|
|
166
|
+
"claimType": "builder.verify.tests",
|
|
167
|
+
"fieldOrBehavior": "build/verify tests",
|
|
168
|
+
"value": "pass",
|
|
169
|
+
"impactLevel": "high",
|
|
170
|
+
"status": "verified",
|
|
171
|
+
"createdAt": "2026-06-26T00:00:00Z",
|
|
172
|
+
"updatedAt": "2026-06-26T00:00:00Z"
|
|
173
|
+
}],
|
|
174
|
+
"evidence": [{
|
|
175
|
+
"id": "ev2",
|
|
176
|
+
"claimId": "c2",
|
|
177
|
+
"evidenceType": "test_output",
|
|
178
|
+
"method": "validation",
|
|
179
|
+
"sourceRef": "command-log.jsonl",
|
|
180
|
+
"excerptOrSummary": "npm test passed",
|
|
181
|
+
"observedAt": "2026-06-26T00:00:00Z",
|
|
182
|
+
"collectedBy": "harness",
|
|
183
|
+
"passing": True,
|
|
184
|
+
"blocking": False
|
|
185
|
+
}],
|
|
186
|
+
"policies": [],
|
|
187
|
+
"events": [{
|
|
188
|
+
"id": "evt2",
|
|
189
|
+
"claimId": "c2",
|
|
190
|
+
"status": "verified",
|
|
191
|
+
"actor": "agent",
|
|
192
|
+
"method": "workflow-check",
|
|
193
|
+
"evidenceIds": ["ev2"],
|
|
194
|
+
"createdAt": "2026-06-26T00:00:00Z"
|
|
195
|
+
}]
|
|
196
|
+
}
|
|
197
|
+
json.dump(bundle, open(sys.argv[1], 'w'))
|
|
198
|
+
PY
|
|
199
|
+
|
|
200
|
+
set +e
|
|
201
|
+
t2_out="$(FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
|
|
202
|
+
node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$T2_DIR\"}")"
|
|
203
|
+
t2_exit="$?"
|
|
204
|
+
set -e
|
|
205
|
+
|
|
206
|
+
if [ "$t2_exit" -ne 2 ]; then
|
|
207
|
+
_pass "clean declared-type bundle not blocked (exit $t2_exit)"
|
|
208
|
+
else
|
|
209
|
+
_fail "clean declared-type bundle false-blocked (exit 2): $t2_out"
|
|
210
|
+
fi
|
|
211
|
+
|
|
212
|
+
if echo "$t2_out" | grep -q "caught false-completion"; then
|
|
213
|
+
_fail "clean declared-type bundle incorrectly emits false-completion: $t2_out"
|
|
214
|
+
else
|
|
215
|
+
_pass "clean declared-type bundle does not emit false-completion"
|
|
216
|
+
fi
|
|
217
|
+
|
|
218
|
+
# ─── Test 3: NO-ACTIVE-FLOW bundle uses workflow.* fallback path ─────────────
|
|
219
|
+
# current.json has NO active_flow_id/active_step_id (or no current.json at all).
|
|
220
|
+
# The trust.bundle has workflow.check.command claims with stored "disputed".
|
|
221
|
+
# Must still BLOCK via the workflow.* path (no regression from #133).
|
|
222
|
+
echo ""
|
|
223
|
+
echo "Test 3: no-active-flow bundle must use workflow.* fallback and still BLOCK"
|
|
224
|
+
|
|
225
|
+
T3_DIR="$TMP/t3"
|
|
226
|
+
seed_repo "$T3_DIR" "no-flow"
|
|
227
|
+
|
|
228
|
+
# current.json carries only artifact_dir — no active_flow_id/active_step_id keys
|
|
229
|
+
printf '%s' '{"artifact_dir":"no-flow"}' \
|
|
230
|
+
> "$T3_DIR/.flow-agents/current.json"
|
|
231
|
+
|
|
232
|
+
python3 - "$T3_DIR/.flow-agents/no-flow/trust.bundle" << 'PY'
|
|
233
|
+
import json, sys
|
|
234
|
+
bundle = {
|
|
235
|
+
"schemaVersion": 3,
|
|
236
|
+
"source": "flow-agents/workflow-sidecar",
|
|
237
|
+
"claims": [{
|
|
238
|
+
"id": "c3",
|
|
239
|
+
"subjectId": "no-flow/unit-tests",
|
|
240
|
+
"subjectType": "workflow-check",
|
|
241
|
+
"claimType": "workflow.check.command",
|
|
242
|
+
"fieldOrBehavior": "unit tests",
|
|
243
|
+
"value": "fail",
|
|
244
|
+
"impactLevel": "high",
|
|
245
|
+
"status": "disputed", # stored as disputed (not tampered — correctly flagged)
|
|
246
|
+
"createdAt": "2026-06-26T00:00:00Z",
|
|
247
|
+
"updatedAt": "2026-06-26T00:00:00Z"
|
|
248
|
+
}],
|
|
249
|
+
"evidence": [],
|
|
250
|
+
"policies": [],
|
|
251
|
+
"events": []
|
|
252
|
+
}
|
|
253
|
+
json.dump(bundle, open(sys.argv[1], 'w'))
|
|
254
|
+
PY
|
|
255
|
+
|
|
256
|
+
set +e
|
|
257
|
+
t3_out="$(FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
|
|
258
|
+
node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$T3_DIR\"}")"
|
|
259
|
+
t3_exit="$?"
|
|
260
|
+
set -e
|
|
261
|
+
|
|
262
|
+
if [ "$t3_exit" -eq 2 ]; then
|
|
263
|
+
_pass "no-active-flow bundle still blocks via workflow.* fallback (exit 2)"
|
|
264
|
+
else
|
|
265
|
+
_fail "no-active-flow bundle did NOT block (exit $t3_exit): $t3_out"
|
|
266
|
+
fi
|
|
267
|
+
|
|
268
|
+
if echo "$t3_out" | grep -q "caught false-completion"; then
|
|
269
|
+
_pass "no-active-flow bundle emits caught false-completion"
|
|
270
|
+
else
|
|
271
|
+
_fail "no-active-flow bundle missing caught false-completion: $t3_out"
|
|
272
|
+
fi
|
|
273
|
+
|
|
274
|
+
# ─── Summary ─────────────────────────────────────────────────────────────────
|
|
275
|
+
echo ""
|
|
276
|
+
if [ "$errors" -eq 0 ]; then
|
|
277
|
+
echo "P-c enforcer expects-driven tests passed."
|
|
278
|
+
exit 0
|
|
279
|
+
fi
|
|
280
|
+
echo "P-c enforcer expects-driven tests FAILED: $errors issue(s)."
|
|
281
|
+
exit 1
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# test_evidence_capture_hook.sh — Capture-first evidence determinism contracts.
|
|
3
|
+
#
|
|
4
|
+
# Part A: evidence-capture.js deterministically records command executions to
|
|
5
|
+
# .flow-agents/<slug>/command-log.jsonl (machine-recorded, not model-claimed).
|
|
6
|
+
# Part B: stop-goal-fit.js cross-references evidence.json claimed-pass command
|
|
7
|
+
# checks against the capture log, and re-runs a TRUSTED backstop command
|
|
8
|
+
# only when the log has no execution for a claimed-pass command.
|
|
9
|
+
set -uo pipefail
|
|
10
|
+
|
|
11
|
+
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
12
|
+
CAPTURE="$ROOT/scripts/hooks/evidence-capture.js"
|
|
13
|
+
GATE="$ROOT/scripts/hooks/stop-goal-fit.js"
|
|
14
|
+
|
|
15
|
+
# Disable the block escape hatch so repeated independent assertions never trip it.
|
|
16
|
+
export FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000
|
|
17
|
+
|
|
18
|
+
TMP="$(mktemp -d)"
# Remove the scratch directory on every exit path so repeated runs do not
# leave temp dirs behind; `${TMP:?}` refuses to run against an empty path.
trap 'rm -rf -- "${TMP:?}"' EXIT
|
|
19
|
+
errors=0
|
|
20
|
+
# Print a success marker followed by the assertion description ($1).
_pass() {
  printf ' ✓ %s\n' "$1"
}
|
|
21
|
+
# Print a failure marker followed by the assertion description ($1) and
# increment the global `errors` counter that the summary reads.
_fail() {
  printf ' ✗ %s\n' "$1"
  errors=$(( errors + 1 ))
}
|
|
22
|
+
|
|
23
|
+
# ---- helpers -------------------------------------------------------------
|
|
24
|
+
seed_repo() { # $1 dir, $2 slug
|
|
25
|
+
local p="$1" slug="$2"
|
|
26
|
+
mkdir -p "$p/.flow-agents/$slug"
|
|
27
|
+
printf '# Repo\n' > "$p/AGENTS.md"
|
|
28
|
+
printf '%s' "{\"schema_version\":\"1.0\",\"task_slug\":\"$slug\",\"status\":\"delivered\",\"phase\":\"done\",\"updated_at\":\"2026-06-23T00:00:00Z\",\"next_action\":{\"status\":\"done\",\"summary\":\"done\"}}" > "$p/.flow-agents/$slug/state.json"
|
|
29
|
+
cat > "$p/.flow-agents/$slug/$slug--deliver.md" <<MD
|
|
30
|
+
# $slug
|
|
31
|
+
|
|
32
|
+
branch: main
|
|
33
|
+
status: delivered
|
|
34
|
+
type: deliver
|
|
35
|
+
|
|
36
|
+
## Definition Of Done
|
|
37
|
+
- [x] tests pass
|
|
38
|
+
|
|
39
|
+
## Goal Fit Gate
|
|
40
|
+
- [x] acceptance verified
|
|
41
|
+
|
|
42
|
+
### Verdict: PASS
|
|
43
|
+
MD
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
capture() { # stdin = payload json
|
|
47
|
+
node "$CAPTURE" >/dev/null 2>&1
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
# ============================================================================
|
|
51
|
+
# Part A — deterministic capture
|
|
52
|
+
# ============================================================================
|
|
53
|
+
A="$TMP/capture"; seed_repo "$A" t1
|
|
54
|
+
echo "Part A: deterministic capture"
|
|
55
|
+
|
|
56
|
+
printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"npm test"},"tool_response":{"exitCode":0,"stdout":"ok"}}' "$A" | capture
|
|
57
|
+
printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"npm run lint"},"error":"command failed"}' "$A" | capture
|
|
58
|
+
printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"make build"},"tool_response":{"exit_code":2}}' "$A" | capture
|
|
59
|
+
# A non-command tool (Write) must NOT be captured.
|
|
60
|
+
printf '{"hook_event_name":"PostToolUse","tool_name":"Write","cwd":"%s","tool_input":{"file_path":"/tmp/x"}}' "$A" | capture
|
|
61
|
+
|
|
62
|
+
LOG="$A/.flow-agents/t1/command-log.jsonl"
|
|
63
|
+
if [[ -f "$LOG" ]]; then _pass "capture writes command-log.jsonl"; else _fail "capture did not write command-log.jsonl"; fi
|
|
64
|
+
|
|
65
|
+
lines=$(wc -l < "$LOG" | tr -d ' ')
|
|
66
|
+
if [[ "$lines" == "3" ]]; then _pass "capture records 3 command executions (Write tool excluded)"; else _fail "expected 3 log lines, got $lines"; fi
|
|
67
|
+
|
|
68
|
+
if rg -q '"command":"npm test","observedResult":"pass","exitCode":0' "$LOG"; then
|
|
69
|
+
_pass "clean exit 0 recorded as observedResult:pass exitCode:0"
|
|
70
|
+
else _fail "passing command not recorded correctly: $(cat "$LOG")"; fi
|
|
71
|
+
|
|
72
|
+
if rg -q '"command":"npm run lint","observedResult":"fail","exitCode":null' "$LOG"; then
|
|
73
|
+
_pass "error field with no exit code recorded as fail exitCode:null"
|
|
74
|
+
else _fail "errored command not recorded correctly"; fi
|
|
75
|
+
|
|
76
|
+
if rg -q '"command":"make build","observedResult":"fail","exitCode":2' "$LOG"; then
|
|
77
|
+
_pass "non-zero exit recorded as fail with exitCode"
|
|
78
|
+
else _fail "non-zero-exit command not recorded correctly"; fi
|
|
79
|
+
|
|
80
|
+
if rg -q '"source":"postToolUse-capture"' "$LOG"; then _pass "records source:postToolUse-capture"; else _fail "missing source field"; fi
|
|
81
|
+
|
|
82
|
+
# Capture is non-blocking: it always exits 0 and echoes stdin.
|
|
83
|
+
out=$(printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"echo hi"},"error":"boom"}' "$A" | node "$CAPTURE"; echo "EXIT=$?")
|
|
84
|
+
if rg -q 'EXIT=0' <<<"$out" && rg -q 'echo hi' <<<"$out"; then
|
|
85
|
+
_pass "capture is non-blocking (exit 0, echoes stdin) even on a failing command"
|
|
86
|
+
else _fail "capture should be non-blocking and echo stdin"; fi
|
|
87
|
+
|
|
88
|
+
# ============================================================================
|
|
89
|
+
# Part B1 — gate cross-references log: claimed pass but log shows FAIL → block
|
|
90
|
+
# ============================================================================
|
|
91
|
+
echo "Part B1: log contradicts claimed pass → block"
|
|
92
|
+
B="$TMP/contradict"; seed_repo "$B" t1
|
|
93
|
+
printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"unit-tests","kind":"command","status":"pass","command":"npm test","summary":"tests passed"}]}' > "$B/.flow-agents/t1/evidence.json"
|
|
94
|
+
printf '%s\n' '{"command":"npm test","observedResult":"fail","exitCode":1,"capturedAt":"2026-06-23T00:00:00Z","source":"postToolUse-capture"}' > "$B/.flow-agents/t1/command-log.jsonl"
|
|
95
|
+
|
|
96
|
+
if FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip node "$GATE" >/dev/null 2>"$TMP/b1.err" <<JSON
|
|
97
|
+
{"hook_event_name":"Stop","cwd":"$B"}
|
|
98
|
+
JSON
|
|
99
|
+
then _fail "gate should BLOCK when capture log contradicts claimed pass"
|
|
100
|
+
else
|
|
101
|
+
status=$?
|
|
102
|
+
if [[ "$status" -eq 2 ]] && rg -q 'capture log CONTRADICTS claimed pass' "$TMP/b1.err" && rg -q 'caught false-completion' "$TMP/b1.err"; then
|
|
103
|
+
_pass "gate blocks (exit 2) caught false-completion via capture log"
|
|
104
|
+
else _fail "gate returned unexpected result: status=$status output=$(cat "$TMP/b1.err")"; fi
|
|
105
|
+
fi
|
|
106
|
+
|
|
107
|
+
# ============================================================================
|
|
108
|
+
# Part B2 — gate cross-references log: claimed pass and log shows PASS → no re-run
|
|
109
|
+
# ============================================================================
|
|
110
|
+
echo "Part B2: log confirms claimed pass → satisfied, no re-run"
|
|
111
|
+
C="$TMP/confirm"; seed_repo "$C" t1
|
|
112
|
+
printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"unit-tests","kind":"command","status":"pass","command":"npm test","summary":"tests passed"}]}' > "$C/.flow-agents/t1/evidence.json"
|
|
113
|
+
printf '%s\n' '{"command":"npm test","observedResult":"pass","exitCode":0,"capturedAt":"2026-06-23T00:00:00Z","source":"postToolUse-capture"}' > "$C/.flow-agents/t1/command-log.jsonl"
|
|
114
|
+
# A poisoned npm on PATH proves the gate does NOT re-run when the log confirms.
|
|
115
|
+
POISON="$TMP/poison"; mkdir -p "$POISON"
|
|
116
|
+
printf '#!/usr/bin/env bash\necho "npm should not run" >&2\nexit 99\n' > "$POISON/npm"; chmod +x "$POISON/npm"
|
|
117
|
+
PATH="$POISON:$PATH" FLOW_AGENTS_GOAL_FIT_MODE=block node "$GATE" >/dev/null 2>"$TMP/b2.err" <<JSON
|
|
118
|
+
{"hook_event_name":"Stop","cwd":"$C"}
|
|
119
|
+
JSON
|
|
120
|
+
if rg -q 'CONTRADICTS|backstop|npm should not run' "$TMP/b2.err"; then
|
|
121
|
+
_fail "gate should NOT re-run or warn when the capture log confirms the pass: $(cat "$TMP/b2.err")"
|
|
122
|
+
else _pass "gate trusts the log on a confirmed pass and does not re-run the backstop"; fi
|
|
123
|
+
|
|
124
|
+
# ============================================================================
|
|
125
|
+
# Part B3 — never-captured claimed-pass command → trusted backstop re-run (declared manifest target FAILS) → block
|
|
126
|
+
# ============================================================================
|
|
127
|
+
echo "Part B3: never-captured claim → trusted manifest backstop catches a fail"
|
|
128
|
+
D="$TMP/backstop"; seed_repo "$D" t1
|
|
129
|
+
printf '%s' '{"name":"x","scripts":{"test":"exit 7"}}' > "$D/package.json"
|
|
130
|
+
printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"unit-tests","kind":"command","status":"pass","command":"npm test","summary":"tests passed"}]}' > "$D/.flow-agents/t1/evidence.json"
|
|
131
|
+
# command-log.jsonl intentionally absent — the command was never actually run.
|
|
132
|
+
|
|
133
|
+
if FLOW_AGENTS_GOAL_FIT_MODE=block node "$GATE" >/dev/null 2>"$TMP/b3.err" <<JSON
|
|
134
|
+
{"hook_event_name":"Stop","cwd":"$D"}
|
|
135
|
+
JSON
|
|
136
|
+
then _fail "gate should BLOCK when trusted backstop re-run of declared manifest target fails"
|
|
137
|
+
else
|
|
138
|
+
status=$?
|
|
139
|
+
if [[ "$status" -eq 2 ]] && rg -q 'trusted backstop \(manifest\)' "$TMP/b3.err" && rg -q 'FAILED with exit 7' "$TMP/b3.err"; then
|
|
140
|
+
_pass "gate runs trusted declared manifest target as backstop and blocks on its failure"
|
|
141
|
+
else _fail "backstop did not catch declared-target failure: status=$status output=$(cat "$TMP/b3.err")"; fi
|
|
142
|
+
fi
|
|
143
|
+
|
|
144
|
+
# ============================================================================
|
|
145
|
+
# Part B4 — never-captured claim, no trusted command resolves → NOT_VERIFIED (never a silent pass)
|
|
146
|
+
# ============================================================================
|
|
147
|
+
echo "Part B4: never-captured claim, nothing trusted resolves → NOT_VERIFIED"
|
|
148
|
+
E="$TMP/notverified"; seed_repo "$E" t1
|
|
149
|
+
printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"custom","kind":"command","status":"pass","command":"./my-thing.sh","summary":"ran custom"}]}' > "$E/.flow-agents/t1/evidence.json"
|
|
150
|
+
|
|
151
|
+
if FLOW_AGENTS_GOAL_FIT_MODE=block node "$GATE" >/dev/null 2>"$TMP/b4.err" <<JSON
|
|
152
|
+
{"hook_event_name":"Stop","cwd":"$E"}
|
|
153
|
+
JSON
|
|
154
|
+
then _fail "gate should not silently pass an un-captured, un-verifiable claimed-pass command"
|
|
155
|
+
else
|
|
156
|
+
status=$?
|
|
157
|
+
if [[ "$status" -eq 2 ]] && rg -q 'NOT_VERIFIED' "$TMP/b4.err" && rg -q 'no trusted command' "$TMP/b4.err"; then
|
|
158
|
+
_pass "gate records NOT_VERIFIED (never a guess) when no trusted command resolves"
|
|
159
|
+
else _fail "NOT_VERIFIED path returned unexpected result: status=$status output=$(cat "$TMP/b4.err")"; fi
|
|
160
|
+
fi
|
|
161
|
+
|
|
162
|
+
# ============================================================================
|
|
163
|
+
# Part B5 — arbitrary model command is opt-in only (FLOW_AGENTS_GOAL_FIT_RECHECK)
|
|
164
|
+
# ============================================================================
|
|
165
|
+
echo "Part B5: free-form model command re-run is opt-in only"
|
|
166
|
+
F="$TMP/recheck"; seed_repo "$F" t1
|
|
167
|
+
printf '%s' '{"schema_version":"1.0","task_slug":"t1","verdict":"pass","checks":[{"id":"custom","kind":"command","status":"pass","command":"exit 5","summary":"ran custom"}]}' > "$F/.flow-agents/t1/evidence.json"
|
|
168
|
+
# Opt-in ON: the model's free-form "exit 5" is re-run and fails → block.
|
|
169
|
+
if FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_RECHECK=true node "$GATE" >/dev/null 2>"$TMP/b5.err" <<JSON
|
|
170
|
+
{"hook_event_name":"Stop","cwd":"$F"}
|
|
171
|
+
JSON
|
|
172
|
+
then _fail "with RECHECK=true the failing model command should block"
|
|
173
|
+
else
|
|
174
|
+
status=$?
|
|
175
|
+
if [[ "$status" -eq 2 ]] && rg -q 'FLOW_AGENTS_GOAL_FIT_RECHECK' "$TMP/b5.err"; then
|
|
176
|
+
_pass "FLOW_AGENTS_GOAL_FIT_RECHECK=true opts into re-running the model's free-form command"
|
|
177
|
+
else _fail "recheck opt-in path returned unexpected result: status=$status output=$(cat "$TMP/b5.err")"; fi
|
|
178
|
+
fi
|
|
179
|
+
|
|
180
|
+
if [[ "$errors" -eq 0 ]]; then
|
|
181
|
+
echo "Evidence capture hook integration passed."
|
|
182
|
+
exit 0
|
|
183
|
+
fi
|
|
184
|
+
echo "Evidence capture hook integration failed: $errors issue(s)."
|
|
185
|
+
exit 1
|
|
@@ -53,6 +53,7 @@ expect_fail() {
|
|
|
53
53
|
|
|
54
54
|
echo "=== Flow Kit Repository Fixture Checks ==="
|
|
55
55
|
expect_pass "valid-local-kit"
|
|
56
|
+
expect_pass "valid-unknown-extension"
|
|
56
57
|
expect_fail "invalid-schema-version" '\.schema_version must be "1\.0"'
|
|
57
58
|
expect_fail "invalid-missing-schema-version" '\.schema_version must be "1\.0"'
|
|
58
59
|
expect_fail "invalid-id" '\.id must be a kebab-case string'
|
|
@@ -63,6 +64,7 @@ expect_fail "invalid-absolute-path" 'flows\[0\]\.path must be relative'
|
|
|
63
64
|
expect_fail "invalid-traversal" "flows\\[0\\]\\.path must not contain"
|
|
64
65
|
expect_fail "invalid-malformed-json" 'invalid JSON'
|
|
65
66
|
expect_fail "invalid-asset-section" '\.docs must be a list'
|
|
67
|
+
expect_fail "invalid-missing-extension-asset" 'docs\[0\]\.path points at missing asset'
|
|
66
68
|
expect_fail "invalid-duplicate-flow" "flows\\[1\\]\\.path duplicates"
|
|
67
69
|
|
|
68
70
|
echo ""
|