@kontourai/flow-agents 1.4.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/CODEOWNERS +29 -0
- package/.github/actions/trust-verify/action.yml +145 -0
- package/.github/workflows/ci.yml +11 -4
- package/.github/workflows/kit-gates-demo.yml +2 -2
- package/.github/workflows/publish-npm.yml +10 -2
- package/.github/workflows/release-please.yml +1 -1
- package/.github/workflows/trust-reconcile.yml +113 -0
- package/AGENTS.md +13 -0
- package/CHANGELOG.md +95 -0
- package/CONTRIBUTING.md +4 -4
- package/README.md +1 -0
- package/agents/tool-planner.json +1 -1
- package/build/src/cli/init.js +242 -20
- package/build/src/cli/validate-workflow-artifacts.js +19 -2
- package/build/src/cli/verify.d.ts +1 -0
- package/build/src/cli/verify.js +90 -0
- package/build/src/cli/workflow-sidecar.d.ts +300 -8
- package/build/src/cli/workflow-sidecar.js +1934 -83
- package/build/src/cli.js +2 -3
- package/build/src/lib/flow-resolver.d.ts +82 -0
- package/build/src/lib/flow-resolver.js +237 -0
- package/build/src/tools/build-universal-bundles.js +34 -22
- package/build/src/tools/generate-context-map.js +3 -16
- package/build/src/tools/validate-source-tree.d.ts +1 -1
- package/build/src/tools/validate-source-tree.js +42 -162
- package/context/contracts/artifact-contract.md +10 -0
- package/context/contracts/delivery-contract.md +1 -0
- package/context/contracts/review-contract.md +1 -0
- package/context/contracts/verification-contract.md +2 -0
- package/context/gate-awareness.md +39 -0
- package/context/scripts/hooks/stop-goal-fit.js +632 -70
- package/docs/adr/0001-flow-agents-consumes-flow.md +1 -1
- package/docs/adr/0002-flow-kits-as-extension-unit.md +1 -1
- package/docs/adr/0004-gates-expect-surface-claims.md +2 -0
- package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +2 -0
- package/docs/adr/0007-skill-audit.md +1 -1
- package/docs/adr/0009-canonical-hook-core-kit-boundary.md +95 -0
- package/docs/adr/0010-workflow-trust-state-as-hachure-bundle.md +139 -0
- package/docs/adr/0011-mcp-posture.md +100 -0
- package/docs/adr/0012-agent-coordination-as-liveness-claims.md +119 -0
- package/docs/adr/0013-context-lifecycle.md +151 -0
- package/docs/adr/0014-core-vs-domain-kit-boundary.md +143 -0
- package/docs/adr/0015-flow-flow-agents-boundary-reconciliation.md +120 -0
- package/docs/adr/0016-three-hard-boundary-model.md +71 -0
- package/docs/adr/0017-anti-gaming-trust-security-model.md +155 -0
- package/docs/agent-system-guidebook.md +5 -12
- package/docs/context-map.md +4 -10
- package/docs/index.md +3 -2
- package/docs/integrations/framework-adapter.md +19 -6
- package/docs/integrations/index.md +2 -2
- package/docs/north-star.md +4 -4
- package/docs/operating-layers.md +3 -3
- package/docs/plans/adr-0010-phase2-gate-recompute.md +55 -0
- package/docs/repository-structure.md +2 -2
- package/docs/skills-map.md +1 -0
- package/docs/spec/runtime-hook-surface.md +62 -9
- package/docs/standards-register.md +3 -3
- package/docs/survey-utterance-check.md +1 -1
- package/docs/trust-anchor-adoption.md +197 -0
- package/docs/verifiable-trust.md +95 -0
- package/docs/veritas-integration.md +2 -2
- package/docs/workflow-usage-guide.md +69 -0
- package/evals/acceptance/DEMO-false-completion.md +144 -0
- package/evals/acceptance/demo-cast.sh +92 -0
- package/evals/acceptance/demo-false-completion.sh +72 -0
- package/evals/acceptance/demo-real-evidence.sh +104 -0
- package/evals/acceptance/demo.tape +29 -0
- package/evals/acceptance/prove-capture-teeth-declared.sh +335 -0
- package/evals/acceptance/prove-capture-teeth.sh +114 -0
- package/evals/acceptance/prove-teeth.sh +105 -0
- package/evals/ci/antigaming-suite.sh +54 -0
- package/evals/ci/run-baseline.sh +2 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/kit.json +20 -0
- package/evals/fixtures/flow-kit-repository/valid-unknown-extension/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/valid-unknown-extension/kit.json +18 -0
- package/evals/integration/test_builder_step_producers.sh +379 -0
- package/evals/integration/test_bundle_install.sh +35 -71
- package/evals/integration/test_bundle_lifecycle.sh +39 -2
- package/evals/integration/test_captured_fail_reconciliation.sh +820 -0
- package/evals/integration/test_checkpoint_signing.sh +489 -0
- package/evals/integration/test_claim_lookup.sh +352 -0
- package/evals/integration/test_command_log_integrity.sh +275 -0
- package/evals/integration/test_context_map.sh +0 -2
- package/evals/integration/test_dual_emit_flow_step.sh +278 -0
- package/evals/integration/test_enforcer_expects_driven.sh +281 -0
- package/evals/integration/test_evidence_capture_hook.sh +185 -0
- package/evals/integration/test_flow_kit_repository.sh +2 -0
- package/evals/integration/test_flowdef_session_activation.sh +273 -0
- package/evals/integration/test_flowdef_session_history_preservation.sh +250 -0
- package/evals/integration/test_gate_bypass_chain.sh +448 -0
- package/evals/integration/test_gate_lockdown.sh +1137 -0
- package/evals/integration/test_gate_review_inquiry_records.sh +399 -0
- package/evals/integration/test_goal_fit_escape_hatch.sh +73 -0
- package/evals/integration/test_goal_fit_hook.sh +69 -4
- package/evals/integration/test_goal_fit_rederive.sh +263 -0
- package/evals/integration/test_install_merge.sh +1176 -0
- package/evals/integration/test_mint_attestation.sh +373 -0
- package/evals/integration/test_phase_map_and_gate_claim.sh +365 -0
- package/evals/integration/test_publish_delivery.sh +269 -0
- package/evals/integration/test_reconcile_soundness.sh +528 -0
- package/evals/integration/test_resolvefirststep_security.sh +208 -0
- package/evals/integration/test_session_resume_roundtrip.sh +286 -0
- package/evals/integration/test_trust_checkpoint.sh +325 -0
- package/evals/integration/test_trust_reconcile.sh +293 -0
- package/evals/integration/test_verify_cli.sh +208 -0
- package/evals/integration/test_workflow_sidecar_writer.sh +549 -34
- package/evals/lib/node.sh +0 -6
- package/evals/run.sh +45 -0
- package/evals/static/test_workflow_skills.sh +6 -13
- package/install.sh +0 -7
- package/integrations/strands-ts/README.md +25 -15
- package/integrations/veritas/flow-agents.adapter.json +1 -2
- package/kits/builder/flows/build.flow.json +59 -12
- package/kits/builder/kit.json +85 -15
- package/kits/builder/skills/continue-work/SKILL.md +116 -0
- package/kits/builder/skills/deliver/SKILL.md +36 -6
- package/kits/builder/skills/design-probe/SKILL.md +28 -0
- package/kits/builder/skills/execute-plan/SKILL.md +9 -1
- package/kits/builder/skills/gate-review/SKILL.md +234 -0
- package/kits/builder/skills/learning-review/SKILL.md +30 -0
- package/kits/builder/skills/pickup-probe/SKILL.md +29 -0
- package/kits/builder/skills/plan-work/SKILL.md +13 -1
- package/kits/builder/skills/pull-work/SKILL.md +19 -0
- package/kits/knowledge/adapters/default-store/index.js +38 -0
- package/kits/knowledge/adapters/flow-runner/index.js +1620 -0
- package/kits/knowledge/adapters/obsidian-store/index.js +36 -6
- package/kits/knowledge/docs/store-contract.md +314 -0
- package/kits/knowledge/evals/audit-freshness/suite.test.js +368 -0
- package/kits/knowledge/evals/canonicalize-category/suite.test.js +383 -0
- package/kits/knowledge/evals/contract-suite/suite.test.js +111 -0
- package/kits/knowledge/evals/detect-contradictions/suite.test.js +324 -0
- package/kits/knowledge/evals/entities/suite.test.js +40 -0
- package/kits/knowledge/evals/glossary-sync/suite.test.js +416 -0
- package/kits/knowledge/evals/hygiene-review/suite.test.js +396 -0
- package/kits/knowledge/evals/retirement/suite.test.js +145 -0
- package/kits/knowledge/flows/audit-freshness.flow.json +44 -0
- package/kits/knowledge/flows/canonicalize-category.flow.json +44 -0
- package/kits/knowledge/flows/detect-contradictions.flow.json +44 -0
- package/kits/knowledge/flows/glossary-sync.flow.json +61 -0
- package/kits/knowledge/flows/hygiene-review.flow.json +43 -0
- package/kits/knowledge/kit.json +51 -1
- package/package.json +4 -4
- package/packaging/conformance/README.md +10 -2
- package/packaging/conformance/fixtures/evidence-capture--allow-records-command.json +29 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-bundle-disputed-claim.json +29 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-capture-contradicts-claimed-pass.json +30 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-mode.json +23 -0
- package/packaging/conformance/fixtures/stop-goal-fit--off-mode.json +24 -0
- package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +5 -2
- package/packaging/conformance/fixtures/stop-goal-fit--warn-no-bundle.json +23 -0
- package/packaging/conformance/fixtures/workflow-steering--reground-active-prompt.json +30 -0
- package/packaging/conformance/fixtures/workflow-steering--reground-session-start.json +30 -0
- package/packaging/conformance/run-conformance.js +1 -1
- package/scripts/README.md +2 -1
- package/scripts/build-universal-bundles.js +0 -1
- package/scripts/ci/mint-attestation.js +221 -0
- package/scripts/ci/trust-reconcile.js +545 -0
- package/scripts/hooks/config-protection.js +423 -1
- package/scripts/hooks/evidence-capture.js +348 -0
- package/scripts/hooks/lib/liveness-read.js +113 -0
- package/scripts/hooks/run-hook.js +6 -1
- package/scripts/hooks/stop-goal-fit.js +1471 -79
- package/scripts/hooks/workflow-steering.js +135 -5
- package/scripts/install-codex-home.sh +39 -0
- package/scripts/install-merge.js +330 -0
- package/src/cli/init.ts +218 -20
- package/src/cli/validate-workflow-artifacts.ts +18 -2
- package/src/cli/verify.ts +100 -0
- package/src/cli/workflow-sidecar.ts +2064 -77
- package/src/cli.ts +2 -3
- package/src/lib/flow-resolver.ts +284 -0
- package/src/tools/build-universal-bundles.ts +34 -21
- package/src/tools/generate-context-map.ts +3 -17
- package/src/tools/validate-source-tree.ts +44 -104
- package/build/src/tools/filter-installed-packs.d.ts +0 -2
- package/build/src/tools/filter-installed-packs.js +0 -135
- package/packaging/packs.json +0 -49
- package/scripts/filter-installed-packs.js +0 -2
- package/src/tools/filter-installed-packs.ts +0 -132
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# prove-capture-teeth-declared.sh — Permanent regression proof that the
|
|
3
|
+
# capture cross-reference gate BLOCKS declared-type false-completions.
|
|
4
|
+
#
|
|
5
|
+
# Bug closed: captureCrossReference previously called bundleClaimedPassCommandChecks
|
|
6
|
+
# WITHOUT declaredClaimTypes, so sessions with a FlowDefinition active (e.g.
|
|
7
|
+
# builder.build / verify step) could emit declared-type claims (builder.verify.tests)
|
|
8
|
+
# that the cross-reference was completely blind to. A command-log recording FAIL for
|
|
9
|
+
# "npm test" would NOT block even though the trust.bundle evidence claimed it passed.
|
|
10
|
+
# ADR 0016 P-c fix: captureCrossReference now accepts activeFlowStep and threads
|
|
11
|
+
# declaredClaimTypes into bundleClaimedPassCommandChecks, mirroring bundleEnforcement
|
|
12
|
+
# and sidecarGuidance.
|
|
13
|
+
#
|
|
14
|
+
# This eval:
|
|
15
|
+
# 1. Proves the fix BLOCKS (exit 2): declared-type evidence claims pass, command-log
|
|
16
|
+
# says FAIL → gate emits "caught false-completion".
|
|
17
|
+
# 2. Proves the control case SHIPS (exit 0): same fixture with a PASS log.
|
|
18
|
+
# 3. Proves the workflow.check.* path still BLOCKS (no regression on original case).
|
|
19
|
+
#
|
|
20
|
+
# Deterministic — no model spend, no bundle install required.
|
|
21
|
+
# Usage: bash evals/acceptance/prove-capture-teeth-declared.sh
|
|
22
|
+
# Strict-ish mode: unset variables are errors and pipelines report the first
# failing stage. errexit (-e) is not enabled here.
set -uo pipefail

# Repository root, resolved relative to this script (two directories up).
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# The Stop hook under test.
GATE="$ROOT/scripts/hooks/stop-goal-fit.js"

# NOTE(review): presumably lifts the gate's block cap so every run in this
# script can still block — confirm against stop-goal-fit.js.
export FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000

# Scratch area for all fixtures. Abort if it cannot be created: with an empty
# TMP every later fixture path would resolve against the filesystem root.
TMP="$(mktemp -d)" || { echo "prove-capture-teeth-declared: mktemp failed" >&2; exit 1; }
# Number of failed assertions; drives the final exit status.
errors=0
|
|
31
|
+
# Print a success line for one assertion.
#   $1 - description of the check that passed
_pass() {
  printf ' ✓ %s\n' "$1"
}
|
|
32
|
+
# Print a failure line for one assertion and bump the global error counter.
#   $1 - description of the check that failed
# Globals: errors (incremented)
_fail() {
  printf ' ✗ %s\n' "$1"
  errors=$((errors + 1))
}
|
|
33
|
+
|
|
34
|
+
# Remove the scratch directory tree named by $TMP.
cleanup() {
  rm -rf "$TMP"
}
|
|
35
|
+
trap cleanup EXIT
|
|
36
|
+
|
|
37
|
+
# ─── helper: seed a minimal delivered workflow artifact ───────────────────────
|
|
38
|
+
# Seed a minimal workflow artifact that is marked as delivered.
#   $1 - project directory to populate
#   $2 - task slug; names the artifact directory under .flow-agents/
# Writes AGENTS.md, <slug>/state.json and <slug>/<slug>--deliver.md.
seed_repo() { # $1=dir $2=slug
  local repo_dir="$1"
  local slug="$2"
  local artifact_dir="$repo_dir/.flow-agents/$slug"

  mkdir -p "$artifact_dir"
  printf '# Repo\n' > "$repo_dir/AGENTS.md"

  # state.json: status "delivered", phase "done", written without a trailing newline.
  printf '{"schema_version":"1.0","task_slug":"%s","status":"delivered","phase":"done","updated_at":"2026-06-27T00:00:00Z","next_action":{"status":"done","summary":"done"}}' "$slug" \
    > "$artifact_dir/state.json"

  # Delivery note: every checkbox ticked and the verdict set to PASS.
  # The heredoc delimiter is unquoted so that $slug expands in the title.
  cat > "$artifact_dir/$slug--deliver.md" << MD
# $slug

branch: main
status: delivered
type: deliver

## Definition Of Done
- [x] tests pass

## Goal Fit Gate
- [x] acceptance verified

### Verdict: PASS
MD
}
|
|
60
|
+
|
|
61
|
+
# ─── helper: write the declared-type trust.bundle ─────────────────────────────
|
|
62
|
+
# Evidence item has execution.label="npm test" linked to a builder.verify.tests claim
|
|
63
|
+
# that asserts pass. The cross-reference must catch the command-log contradiction.
|
|
64
|
+
# Write a trust.bundle whose single claim is of the declared type
# builder.verify.tests and asserts that "npm test" passed, backed by one
# agent-collected evidence item reporting exit code 0.
#   $1 - path of the bundle file to write
#   $2 - task slug used in the claim's subjectId (default: declared-false)
#   $3 - claim id (default: c1)
#   $4 - evidence id (default: ev1)
# The optional arguments let other fixtures reuse this helper instead of
# copying the bundle; with only $1 given the output is the same as before.
# Returns python3's exit status.
write_declared_bundle() { # $1=bundle-path [$2=slug] [$3=claim-id] [$4=evidence-id]
  python3 - "$1" "${2:-declared-false}" "${3:-c1}" "${4:-ev1}" << 'PY'
import json, sys

path, slug, claim_id, evidence_id = sys.argv[1:5]

bundle = {
    "schemaVersion": 3,
    "source": "flow-agents/workflow-sidecar",
    "claims": [{
        "id": claim_id,
        "subjectId": slug + "/tests",
        "subjectType": "flow-step",
        "claimType": "builder.verify.tests",
        "fieldOrBehavior": "npm test",
        "value": "pass",
        "impactLevel": "high",
        "status": "verified",
        "createdAt": "2026-06-27T00:00:00Z",
        "updatedAt": "2026-06-27T00:00:00Z"
    }],
    "evidence": [{
        "id": evidence_id,
        "claimId": claim_id,
        "evidenceType": "command_output",
        "method": "capture",
        "sourceRef": "command-log.jsonl",
        "excerptOrSummary": "npm test passed (agent claimed)",
        "observedAt": "2026-06-27T00:00:00Z",
        "collectedBy": "agent",
        "passing": True,
        "execution": {
            "label": "npm test",
            "exitCode": 0
        }
    }],
    "policies": [],
    "events": []
}

# Context manager so the file is flushed and closed before the script exits.
with open(path, "w") as fh:
    json.dump(bundle, fh)
PY
}
|
|
103
|
+
|
|
104
|
+
# Self-contained FlowDefinition fixture: flow "builder.build" with a single
# gate on the "verify" step that expects a builder.verify.tests trust.bundle
# claim. It is handed to the gate through FLOW_AGENTS_FLOW_DEFS_DIR, so the
# repository's kits/ directory is not needed.
FLOW_DEFS_DIR="$TMP/flows"
mkdir -p "$FLOW_DEFS_DIR"
cat << 'FLOWJSON' > "$FLOW_DEFS_DIR/builder.build.flow.json"
{
  "id": "builder.build",
  "version": "1.0",
  "gates": {
    "verify-gate": {
      "step": "verify",
      "expects": [
        {
          "id": "tests-evidence",
          "kind": "trust.bundle",
          "required": true,
          "bundle_claim": {
            "claimType": "builder.verify.tests",
            "subjectType": "flow-step",
            "accepted_statuses": ["trusted", "accepted"]
          }
        }
      ]
    }
  }
}
FLOWJSON
|
|
131
|
+
|
|
132
|
+
# ─── Test 1: declared-type false-completion MUST BLOCK ────────────────────────
# Fixture: the bundle claims "npm test" passed (builder.verify.tests) while the
# command log recorded the same command as a failure. Expected: the gate exits
# 2 and names the contradiction.
echo "Test 1: declared-type evidence claims pass, command-log records FAIL → must BLOCK"
echo " (This is the hole: pre-fix the gate was blind to builder.verify.tests claims)"

T1="$TMP/t1"
seed_repo "$T1" "declared-false"

# current.json: active FlowDefinition
printf '%s' '{"artifact_dir":"declared-false","active_flow_id":"builder.build","active_step_id":"verify"}' \
  > "$T1/.flow-agents/current.json"

write_declared_bundle "$T1/.flow-agents/declared-false/trust.bundle"

# command-log: npm test recorded as FAIL — the independent truth source says FAILED
printf '%s\n' '{"command":"npm test","observedResult":"fail","exitCode":1,"capturedAt":"2026-06-27T00:00:00Z","source":"postToolUse-capture"}' \
  > "$T1/.flow-agents/declared-false/command-log.jsonl"

# Capture the gate's status with `|| t1_exit=$?` instead of a
# `set +e` / `set -e` pair. This script starts without errexit, so the old
# trailing `set -e` switched it ON for everything that followed.
t1_exit=0
t1_out="$(FLOW_AGENTS_GOAL_FIT_MODE=block \
  FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
  FLOW_AGENTS_FLOW_DEFS_DIR="$FLOW_DEFS_DIR" \
  node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$T1\"}")" || t1_exit=$?

if [ "$t1_exit" -eq 2 ]; then
  _pass "declared-type false-completion BLOCKED (exit 2)"
else
  _fail "declared-type false-completion NOT blocked: exit=$t1_exit output=$t1_out"
fi

# Output checks read from a here-string rather than `echo … | grep -q`: under
# pipefail, grep -q may exit on the first match while echo is still writing,
# and the resulting SIGPIPE would fail the pipeline on large outputs.
if grep -qF -- "caught false-completion" <<< "$t1_out"; then
  _pass "emits 'caught false-completion' message"
else
  _fail "missing 'caught false-completion' in output: $t1_out"
fi

if grep -qF -- "capture log CONTRADICTS claimed pass" <<< "$t1_out"; then
  _pass "emits 'capture log CONTRADICTS claimed pass' message"
else
  _fail "missing contradicts message in output: $t1_out"
fi

if grep -qF -- "npm test" <<< "$t1_out"; then
  _pass "warning names the contradicted command (npm test)"
else
  _fail "warning does not name the command: $t1_out"
fi
|
|
180
|
+
|
|
181
|
+
# ─── Test 2: control — matching PASS log should SHIP (no false-block) ─────────
# Same claim shape as Test 1, but the command log agrees with the claim.
# Expected: the gate does not exit 2 and does not report a false-completion.
echo ""
echo "Test 2: same fixture but command-log records PASS → must SHIP (exit 0)"

T2="$TMP/t2"
seed_repo "$T2" "declared-pass"

printf '%s' '{"artifact_dir":"declared-pass","active_flow_id":"builder.build","active_step_id":"verify"}' \
  > "$T2/.flow-agents/current.json"

# Bundle with the same shape as Test 1's (declared-type claim asserting pass),
# written here with its own ids (c2 / ev2) and subjectId.
python3 - "$T2/.flow-agents/declared-pass/trust.bundle" << 'PY'
import json, sys

bundle = {
    "schemaVersion": 3,
    "source": "flow-agents/workflow-sidecar",
    "claims": [{
        "id": "c2",
        "subjectId": "declared-pass/tests",
        "subjectType": "flow-step",
        "claimType": "builder.verify.tests",
        "fieldOrBehavior": "npm test",
        "value": "pass",
        "impactLevel": "high",
        "status": "verified",
        "createdAt": "2026-06-27T00:00:00Z",
        "updatedAt": "2026-06-27T00:00:00Z"
    }],
    "evidence": [{
        "id": "ev2",
        "claimId": "c2",
        "evidenceType": "command_output",
        "method": "capture",
        "sourceRef": "command-log.jsonl",
        "excerptOrSummary": "npm test passed",
        "observedAt": "2026-06-27T00:00:00Z",
        "collectedBy": "agent",
        "passing": True,
        "execution": {
            "label": "npm test",
            "exitCode": 0
        }
    }],
    "policies": [],
    "events": []
}

# Context manager so the file is flushed and closed before the script exits.
with open(sys.argv[1], "w") as fh:
    json.dump(bundle, fh)
PY

# command-log: npm test recorded as PASS — confirming evidence
printf '%s\n' '{"command":"npm test","observedResult":"pass","exitCode":0,"capturedAt":"2026-06-27T00:00:00Z","source":"postToolUse-capture"}' \
  > "$T2/.flow-agents/declared-pass/command-log.jsonl"

# Capture the gate's status with `|| t2_exit=$?` instead of a
# `set +e` / `set -e` pair, which would leave errexit switched on in a script
# that starts without it.
t2_exit=0
t2_out="$(FLOW_AGENTS_GOAL_FIT_MODE=block \
  FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
  FLOW_AGENTS_FLOW_DEFS_DIR="$FLOW_DEFS_DIR" \
  node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$T2\"}")" || t2_exit=$?

# NOTE(review): the heading says "exit 0" but any status other than 2 is
# accepted here, so a crashing gate would also count as "not blocked" —
# confirm whether this should be tightened to -eq 0.
if [ "$t2_exit" -ne 2 ]; then
  _pass "confirming log clears the cross-reference (no false-block, exit $t2_exit)"
else
  _fail "confirming log incorrectly blocked (exit 2): $t2_out"
fi

# Here-string instead of `echo … | grep -q`, which can fail spuriously under
# pipefail when grep exits before echo has finished writing.
if grep -qF -- "caught false-completion" <<< "$t2_out"; then
  _fail "confirming log incorrectly emits false-completion: $t2_out"
else
  _pass "confirming log does not emit false-completion"
fi
|
|
253
|
+
|
|
254
|
+
# ─── Test 3: workflow.check.* path still BLOCKS (regression guard) ────────────
# Same contradiction as Test 1, but the claim uses the workflow.check.command
# type and current.json names no active flow or step.
echo ""
echo "Test 3: workflow.check.* false-completion still BLOCKS (no regression on original case)"

T3="$TMP/t3"
seed_repo "$T3" "wf-false"

# No current.json active flow → loadActiveFlowStep returns null → workflow.* fallback
printf '%s' '{"artifact_dir":"wf-false"}' \
  > "$T3/.flow-agents/current.json"

python3 - "$T3/.flow-agents/wf-false/trust.bundle" << 'PY'
import json, sys

bundle = {
    "schemaVersion": 3,
    "source": "flow-agents/workflow-sidecar",
    "claims": [{
        "id": "c3",
        "subjectId": "wf-false/unit-tests",
        "subjectType": "workflow-check",
        "claimType": "workflow.check.command",
        "fieldOrBehavior": "npm test",
        "value": "pass",
        "impactLevel": "high",
        "status": "verified",
        "createdAt": "2026-06-27T00:00:00Z",
        "updatedAt": "2026-06-27T00:00:00Z"
    }],
    "evidence": [{
        "id": "ev3",
        "claimId": "c3",
        "evidenceType": "command_output",
        "method": "capture",
        "sourceRef": "command-log.jsonl",
        "excerptOrSummary": "npm test passed (agent claimed)",
        "observedAt": "2026-06-27T00:00:00Z",
        "collectedBy": "agent",
        "passing": True,
        "execution": {
            "label": "npm test",
            "exitCode": 0
        }
    }],
    "policies": [],
    "events": []
}

# Context manager so the file is flushed and closed before the script exits.
with open(sys.argv[1], "w") as fh:
    json.dump(bundle, fh)
PY

# command-log: npm test recorded as FAIL
printf '%s\n' '{"command":"npm test","observedResult":"fail","exitCode":1,"capturedAt":"2026-06-27T00:00:00Z","source":"postToolUse-capture"}' \
  > "$T3/.flow-agents/wf-false/command-log.jsonl"

# Capture the gate's status with `|| t3_exit=$?` instead of a
# `set +e` / `set -e` pair, which would leave errexit switched on in a script
# that starts without it.
t3_exit=0
t3_out="$(FLOW_AGENTS_GOAL_FIT_MODE=block \
  FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
  FLOW_AGENTS_FLOW_DEFS_DIR="$FLOW_DEFS_DIR" \
  node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$T3\"}")" || t3_exit=$?

if [ "$t3_exit" -eq 2 ]; then
  _pass "workflow.check.* false-completion still BLOCKS (no regression)"
else
  _fail "workflow.check.* false-completion NOT blocked: exit=$t3_exit output=$t3_out"
fi

# Here-string instead of `echo … | grep -q`, which can fail spuriously under
# pipefail when grep exits before echo has finished writing.
if grep -qF -- "caught false-completion" <<< "$t3_out"; then
  _pass "workflow.check.* path still emits 'caught false-completion'"
else
  _fail "workflow.check.* path missing 'caught false-completion': $t3_out"
fi
|
|
326
|
+
|
|
327
|
+
# ─── Summary ──────────────────────────────────────────────────────────────────
# Exit 0 only when no assertion failed; otherwise report the count and exit 1.
echo ""
if ! [ "$errors" -eq 0 ]; then
  echo "prove-capture-teeth-declared: FAILED ($errors issue(s))."
  exit 1
fi
echo "prove-capture-teeth-declared: all tests passed."
echo "PROOF: declared-type false-completions are blocked; workflow.check.* path unaffected."
exit 0
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# prove-capture-teeth.sh — Deterministic proof (no model spend) that capture-first
|
|
3
|
+
# evidence determinism has teeth through the SHIPPED bundles: an agent claims a
|
|
4
|
+
# command passed, but the deterministically-captured command-log shows it actually
|
|
5
|
+
# FAILED → Stop is blocked. Also proves the trusted backstop catches a never-run
|
|
6
|
+
# claimed-pass command, and that a matching capture log lets Stop through.
|
|
7
|
+
#
|
|
8
|
+
# Mirrors prove-teeth.sh: installs each bundle and runs the installed hook commands
|
|
9
|
+
# with seeded .flow-agents state, exactly as the runtime would on PostToolUse / Stop.
|
|
10
|
+
#
|
|
11
|
+
# Usage: bash evals/acceptance/prove-capture-teeth.sh
|
|
12
|
+
# Unset variables are errors; pipelines report the first failing stage.
# errexit is not enabled.
set -u
set -o pipefail

# Repository root, resolved relative to this script (two directories up).
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Tallies of passed and failed assertions.
pass=0
fail=0
|
|
16
|
+
# Report a passing assertion and count it.
#   $1 - description of the check
# Globals: pass (incremented)
_p() {
  printf ' ✓ %s\n' "$1"
  pass=$((pass + 1))
}
|
|
17
|
+
# Report a failing assertion and count it.
#   $1 - description of the check
# Globals: fail (incremented)
_f() {
  printf ' ✗ %s\n' "$1"
  fail=$((fail + 1))
}
|
|
18
|
+
|
|
19
|
+
# Build the bundles from the repository root. Build output is discarded;
# a failed build ends the script with status 1.
echo "Building bundles..."
if ! (cd "$ROOT" && npm run build:bundles >/dev/null 2>&1); then
  echo "build failed"
  exit 1
fi
|
|
21
|
+
|
|
22
|
+
# Print the first hook command registered for an event whose command string
# contains a given substring; print nothing when there is no match.
#   $1 - path to the settings/hooks JSON file
#   $2 - event name (key under the top-level "hooks" object)
#   $3 - substring to look for in each hook's "command"
# Hook groups without a "hooks" list and hook entries without a string
# "command" are skipped, so one such entry cannot hide a later match.
hook_cmd(){ # $1 settings/hooks json, $2 event, $3 script needle
  python3 - "$1" "$2" "$3" <<'PY'
import json, sys

config_path, event, needle = sys.argv[1:4]

with open(config_path) as fh:
    settings = json.load(fh)

for group in settings.get("hooks", {}).get(event, []):
    for hook in group.get("hooks", []):
        command = hook.get("command")
        if isinstance(command, str) and needle in command:
            print(command)
            sys.exit(0)

# No match: empty output, still a zero exit status.
sys.exit(0)
PY
}
|
|
33
|
+
|
|
34
|
+
# Seed: model CLAIMS the command passed (evidence.json) but the deterministic
|
|
35
|
+
# capture log recorded it as FAIL — a false-completion the gate must catch.
|
|
36
|
+
# Seed the "cap-false" task: evidence.json claims that "npm test" passed while
# command-log.jsonl records the same command as a failure (exit code 1).
#   $1 - project directory to populate
# AGENTS.md is created only when it does not exist yet.
seed_capture_false_pass(){ # $1 project dir
  local project="$1"
  local task_dir="$project/.flow-agents/cap-false"

  mkdir -p "$task_dir"
  if [ ! -f "$project/AGENTS.md" ]; then
    printf '# Repo\n' > "$project/AGENTS.md"
  fi

  # Workflow state: delivered / done.
  printf '%s' '{"schema_version":"1.0","task_slug":"cap-false","status":"delivered","phase":"done","updated_at":"2026-06-23T00:00:00Z","next_action":{"status":"done","summary":"done"}}' > "$task_dir/state.json"
  # The claim: the npm test check is recorded with status "pass".
  printf '%s' '{"schema_version":"1.0","task_slug":"cap-false","verdict":"pass","checks":[{"id":"unit-tests","kind":"command","status":"pass","command":"npm test","summary":"tests passed"}]}' > "$task_dir/evidence.json"
  # The captured record: the same command with observedResult "fail".
  printf '%s\n' '{"command":"npm test","observedResult":"fail","exitCode":1,"capturedAt":"2026-06-23T00:00:00Z","source":"postToolUse-capture"}' > "$task_dir/command-log.jsonl"

  cat > "$task_dir/cap-false--deliver.md" <<'MD'
# Cap False

branch: main
status: delivered
type: deliver

## Definition Of Done
- [x] all unit tests pass

## Goal Fit Gate
- [x] acceptance criteria verified

### Verdict: PASS
MD
}
|
|
58
|
+
|
|
59
|
+
# Succeed when the hook output on stdin contains a "decision" of "block".
# Whitespace around the colon is tolerated, so both compact and
# pretty-printed JSON match.
is_block(){ grep -Eq '"decision"[[:space:]]*:[[:space:]]*"block"'; }
|
|
60
|
+
|
|
61
|
+
# Checks for one installed bundle; called by run_bundle, which owns the two
# temporary directories.
#   $1 label, $2 install.sh, $3 settings-json-rel, $4 home-env-name,
#   $5 install home (temp dir), $6 project dir (temp dir)
# Results are reported through _p / _f; returns early after the first
# failure that makes the remaining checks impossible.
_run_bundle_checks(){
  local label="$1" installer="$2" cfgrel="$3" homevar="$4" home="$5" proj="$6"

  bash "$installer" "$home" >/dev/null 2>&1 || { _f "$label install.sh failed"; return; }
  local cfg="$home/$cfgrel"
  [ -f "$cfg" ] || { _f "$label config not found at $cfgrel after install"; return; }
  [ -f "$home/scripts/hooks/evidence-capture.js" ] || { _f "$label bundle missing evidence-capture.js after install"; return; }

  # --- Capture hook is wired on PostToolUse in the shipped config ---
  local capcmd; capcmd="$(hook_cmd "$cfg" PostToolUse evidence-capture)"
  [ -n "$capcmd" ] || { _f "$label: no PostToolUse evidence-capture hook in shipped config"; return; }
  _p "$label ships evidence-capture on PostToolUse"

  # The capture hook deterministically records a real command result through the
  # installed adapter path.
  mkdir -p "$proj/.flow-agents/live-cap"
  [ -f "$proj/AGENTS.md" ] || printf '# Repo\n' > "$proj/AGENTS.md"
  printf '%s' '{"schema_version":"1.0","task_slug":"live-cap","status":"in_progress","phase":"verification","updated_at":"2026-06-23T00:00:00Z"}' > "$proj/.flow-agents/live-cap/state.json"
  # The hook's own exit status is ignored on purpose; only the log it writes
  # is checked below.
  printf '{"hook_event_name":"PostToolUse","tool_name":"Bash","cwd":"%s","tool_input":{"command":"npm run lint"},"error":"command failed"}' "$proj" \
    | env "$homevar=$home" CLAUDE_PROJECT_DIR="$home" bash -c "$capcmd" >/dev/null 2>&1 || true
  # grep -F rather than rg: ripgrep is not guaranteed to be installed, and
  # with stderr silenced a missing rg would read as "nothing recorded".
  if grep -qF -- '"command":"npm run lint","observedResult":"fail"' "$proj/.flow-agents/live-cap/command-log.jsonl" 2>/dev/null; then
    _p "$label capture hook records a real FAIL to command-log.jsonl through the installed adapter"
  else
    _f "$label capture hook did not record the command result: $(cat "$proj/.flow-agents/live-cap/command-log.jsonl" 2>/dev/null)"
  fi

  # --- Teeth: claims-pass-but-log-shows-fail → Stop is BLOCKED ---
  seed_capture_false_pass "$proj"
  local stopcmd; stopcmd="$(hook_cmd "$cfg" Stop stop-goal-fit)"
  [ -n "$stopcmd" ] || { _f "$label: no Stop stop-goal-fit hook in shipped config"; return; }
  local blk; blk="$(printf '{"hook_event_name":"Stop","cwd":"%s"}' "$proj" | env "$homevar=$home" CLAUDE_PROJECT_DIR="$home" FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip bash -c "$stopcmd" 2>/dev/null)"
  # if/else instead of `a && b || c`, and a here-string instead of
  # `echo | grep -q`, which can fail spuriously under pipefail.
  if is_block <<< "$blk"; then
    _p "$label BLOCKS a claimed-pass command that the capture log recorded as FAIL"
  else
    _f "$label did NOT block the captured false-completion: $blk"
  fi

  # control: a matching capture log (pass) lets Stop through on the capture axis.
  printf '%s\n' '{"command":"npm test","observedResult":"pass","exitCode":0,"capturedAt":"2026-06-23T00:00:00Z","source":"postToolUse-capture"}' > "$proj/.flow-agents/cap-false/command-log.jsonl"
  local okblk; okblk="$(printf '{"hook_event_name":"Stop","cwd":"%s"}' "$proj" | env "$homevar=$home" CLAUDE_PROJECT_DIR="$home" FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip bash -c "$stopcmd" 2>&1)"
  if grep -qF -- 'caught false-completion' <<< "$okblk"; then
    _f "$label control: a confirming capture log should not raise a false-completion"
  else
    _p "$label control: a confirming capture log clears the false-completion (no re-run)"
  fi
}

# Install one shipped bundle into a throwaway home, then run its installed
# PostToolUse and Stop hook commands against a throwaway project.
#   $1 - label used in the report lines
#   $2 - path to the bundle's install.sh
#   $3 - path of the hook config, relative to the install home
#   $4 - name of the environment variable that carries the install home
# Both temporary directories are removed before returning.
run_bundle(){ # $1 label, $2 install.sh, $3 settings-json-rel, $4 home-env-name
  local label="$1" installer="$2" cfgrel="$3" homevar="$4"
  echo ""
  echo "── $label: shipped bundle install ──"
  local home proj
  home="$(mktemp -d)" || { _f "$label: mktemp failed"; return; }
  proj="$(mktemp -d)" || { rm -rf -- "$home"; _f "$label: mktemp failed"; return; }

  _run_bundle_checks "$label" "$installer" "$cfgrel" "$homevar" "$home" "$proj"

  rm -rf -- "$home" "$proj"
}
|
|
106
|
+
|
|
107
|
+
# Exercise both shipped bundles, then report the tallies. The exit status is
# 0 only when no check failed.
run_bundle "Claude Code" "$ROOT/dist/claude-code/install.sh" ".claude/settings.json" "CLAUDE_PROJECT_DIR"
run_bundle "Codex" "$ROOT/dist/codex/install.sh" ".codex/hooks.json" "CODEX_HOME"

echo ""
echo "──────────────────────────────────"
echo "prove-capture-teeth: $pass passed, $fail failed"
if [ "$fail" -eq 0 ]; then
  echo "PROOF: shipped bundles capture real command results and BLOCK claimed-pass-but-actually-failed completions."
  exit 0
fi
exit 1
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# prove-teeth.sh — End-to-end proof that the SHIPPED bundles enforce goal fit
|
|
3
|
+
# (block mode) and re-ground active goals (SessionStart re-injection), through
|
|
4
|
+
# the real install + adapter path, for Claude Code and Codex.
|
|
5
|
+
#
|
|
6
|
+
# This is deterministic (no live model spend): it installs each bundle and runs
|
|
7
|
+
# the installed hook commands with seeded .flow-agents state, exactly as the
|
|
8
|
+
# runtime would on a Stop / SessionStart event.
|
|
9
|
+
#
|
|
10
|
+
# Usage: bash evals/acceptance/prove-teeth.sh
|
|
11
|
+
set -uo pipefail
|
|
12
|
+
|
|
13
|
+
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
14
|
+
pass=0; fail=0
|
|
15
|
+
# Record a passing check: print a check-marked line for $1 and bump the
# global `pass` counter.
_p() {
  printf '%s\n' " ✓ $1"
  pass=$((pass + 1))
}
|
|
16
|
+
# Record a failing check: print a cross-marked line for $1 and bump the
# global `fail` counter.
_f() {
  printf '%s\n' " ✗ $1"
  fail=$((fail + 1))
}
|
|
17
|
+
|
|
18
|
+
# Build the distributable bundles before exercising them. The build stays quiet
# on success; on failure its captured output is replayed on stderr so the cause
# of "build failed" is not lost.
echo "Building bundles..."
if ! build_log="$(cd "$ROOT" && npm run build:bundles 2>&1)"; then
  printf '%s\n' "$build_log" >&2
  echo "build failed"
  exit 1
fi
unset build_log
|
|
20
|
+
|
|
21
|
+
# Extract an installed hook command by event + script-name substring.
# Arguments: $1 - path to the settings/hooks JSON file
#            $2 - hook event name (e.g. Stop, SessionStart)
#            $3 - substring that must occur in the hook's command
# Outputs:   the first matching command on stdout; nothing when no hook matches.
# Groups without a "hooks" list and entries without a string "command" are
# skipped instead of aborting the scan, so a later matching hook is still found.
hook_cmd(){ # $1 settings/hooks json, $2 event, $3 script needle
  python3 - "$1" "$2" "$3" <<'PY'
import json,sys
path,event,needle=sys.argv[1:4]
with open(path) as fh:
    cfg=json.load(fh)
hooks=cfg.get("hooks") if isinstance(cfg,dict) else None
groups=hooks.get(event) if isinstance(hooks,dict) else None
for g in (groups if isinstance(groups,list) else []):
    entries=g.get("hooks") if isinstance(g,dict) else None
    for h in (entries if isinstance(entries,list) else []):
        cmd=h.get("command") if isinstance(h,dict) else None
        if isinstance(cmd,str) and needle in cmd:
            print(cmd); sys.exit(0)
sys.exit(0)
PY
}
|
|
33
|
+
|
|
34
|
+
# Seed a contradictory "false-done" task under <project>/.flow-agents:
# evidence.json carries verdict "fail" while the deliver markdown ticks every
# box and declares "Verdict: PASS".
# Arguments: $1 - project directory to seed (AGENTS.md is created only if absent)
seed_false_completion(){
  local project="$1"
  local task_dir="$project/.flow-agents/false-done"
  local state_json='{"schema_version":"1.0","task_slug":"false-done","status":"in_progress","phase":"verification","updated_at":"2026-06-18T00:00:00Z","next_action":{"status":"continue","summary":"Make the failing tests pass."}}'
  local evidence_json='{"schema_version":"1.0","task_slug":"false-done","verdict":"fail","checks":[{"id":"unit-tests","kind":"test","status":"fail","summary":"3 unit tests still failing"}]}'

  mkdir -p "$task_dir"
  if [ ! -f "$project/AGENTS.md" ]; then
    printf '# Repo\n' > "$project/AGENTS.md"
  fi
  printf '%s' "$state_json" > "$task_dir/state.json"
  printf '%s' "$evidence_json" > "$task_dir/evidence.json"
  cat > "$task_dir/false-done--deliver.md" <<'MD'
# False Done

branch: main
status: executing
type: deliver

## Definition Of Done
- [x] all unit tests pass

## Goal Fit Gate
- [x] acceptance criteria verified

### Verdict: PASS
MD
}
|
|
55
|
+
|
|
56
|
+
# Seed an active, in-progress "resume-task" under <project>/.flow-agents whose
# state.json names a concrete next step (create RESUMED.txt).
# Arguments: $1 - project directory to seed (AGENTS.md is created only if absent)
seed_active_resume(){
  local project="$1"
  local task_dir="$project/.flow-agents/resume-task"
  local state_json='{"schema_version":"1.0","task_slug":"resume-task","status":"in_progress","phase":"execution","updated_at":"2026-06-18T00:00:00Z","next_action":{"status":"continue","summary":"Create a file named RESUMED.txt containing the word resumed.","target_phase":"verification"}}'

  mkdir -p "$task_dir"
  if [ ! -f "$project/AGENTS.md" ]; then
    printf '# Repo\n' > "$project/AGENTS.md"
  fi
  printf '%s' "$state_json" > "$task_dir/state.json"
}
|
|
61
|
+
|
|
62
|
+
# Succeeds (exit 0) when the hook JSON on stdin carries a "block" decision.
# Whitespace around the colon is tolerated, so pretty-printed output is
# recognised as well as the compact form.
is_block() {
  grep -Eq '"decision"[[:space:]]*:[[:space:]]*"block"'
}
|
|
63
|
+
# stdin = adapter json; assert additionalContext re-grounds the goal.
# Returns 0 only when hookSpecificOutput.additionalContext is a string that
# contains 'STATE:', 'resume-task' and 'RESUMED.txt'. Empty, non-JSON or
# differently shaped input returns 1 quietly instead of dumping a Python
# traceback into the report.
has_reground(){
  python3 -c '
import json, sys
try:
    d = json.load(sys.stdin)
except ValueError:
    sys.exit(1)
hso = d.get("hookSpecificOutput") if isinstance(d, dict) else None
ctx = hso.get("additionalContext") if isinstance(hso, dict) else None
if not isinstance(ctx, str):
    sys.exit(1)
sys.exit(0 if all(m in ctx for m in ("STATE:", "resume-task", "RESUMED.txt")) else 1)
'
}
|
|
68
|
+
|
|
69
|
+
# Run every check for one installed bundle. Split out of run_bundle so that
# run_bundle can remove the temp directories on every exit path.
# Arguments: $1 label, $2 install.sh, $3 settings-json-rel, $4 home-env-name,
#            $5 temp home to install into, $6 temp project dir to seed
_run_bundle_checks(){
  local label="$1" installer="$2" cfgrel="$3" homevar="$4" home="$5" proj="$6"

  bash "$installer" "$home" >/dev/null 2>&1 || { _f "$label install.sh failed"; return; }
  local cfg="$home/$cfgrel"
  [ -f "$cfg" ] || { _f "$label config not found at $cfgrel after install"; return; }
  [ -f "$home/scripts/hooks/stop-goal-fit.js" ] || { _f "$label bundle missing scripts/hooks after install"; return; }

  # --- Teeth 1: false-completion block ---
  seed_false_completion "$proj"
  local stopcmd; stopcmd="$(hook_cmd "$cfg" Stop stop-goal-fit)"
  [ -n "$stopcmd" ] || { _f "$label: no Stop stop-goal-fit hook in shipped config"; return; }
  local stop_event; stop_event="$(printf '{"hook_event_name":"Stop","cwd":"%s"}' "$proj")"
  # CLAUDE_PROJECT_DIR is exported for every bundle in addition to $homevar;
  # presumably the installed hook command expands it — confirm against the bundle config.
  local blk; blk="$(printf '%s' "$stop_event" | env "$homevar=$home" CLAUDE_PROJECT_DIR="$home" bash -c "$stopcmd" 2>/dev/null)"
  if printf '%s\n' "$blk" | is_block; then
    _p "$label BLOCKS false completion by default (evidence=fail vs markdown PASS)"
  else
    _f "$label did NOT block: $blk"
  fi
  # control: warn mode must pass through
  local wrn; wrn="$(printf '%s' "$stop_event" | env "$homevar=$home" CLAUDE_PROJECT_DIR="$home" FLOW_AGENTS_GOAL_FIT_MODE=warn bash -c "$stopcmd" 2>/dev/null)"
  if printf '%s\n' "$wrn" | is_block; then
    _f "$label warn-mode override should NOT block"
  else
    _p "$label warn-mode override passes through (control)"
  fi

  # --- Teeth 2: re-ground active goal on SessionStart ---
  local sscmd; sscmd="$(hook_cmd "$cfg" SessionStart workflow-steering)"
  [ -n "$sscmd" ] || { _f "$label: no SessionStart workflow-steering hook in shipped config"; return; }
  seed_active_resume "$proj"
  local rg; rg="$(printf '{"hook_event_name":"SessionStart","cwd":"%s","source":"compact"}' "$proj" | env "$homevar=$home" CLAUDE_PROJECT_DIR="$home" bash -c "$sscmd" 2>/dev/null)"
  if printf '%s\n' "$rg" | has_reground; then
    _p "$label RE-GROUNDS the active goal on SessionStart (goal + next step survive compaction)"
  else
    _f "$label SessionStart did not re-ground: $rg"
  fi
}

# Install one shipped bundle into a throwaway home and run the installed Stop
# and SessionStart hook commands against a seeded throwaway project, recording
# each outcome through _p/_f.
# Arguments: $1 label, $2 install.sh, $3 settings-json-rel, $4 home-env-name
# The temp home and project are always removed afterwards, including when a
# check returns early.
run_bundle(){ # $1 label, $2 install.sh, $3 settings-json-rel, $4 home-env-name
  local label="$1" installer="$2" cfgrel="$3" homevar="$4"
  echo ""
  echo "── $label: shipped bundle install ──"
  local home proj
  home="$(mktemp -d)" || { _f "$label: could not create temp home"; return; }
  proj="$(mktemp -d)" || { rm -rf -- "${home:?}"; _f "$label: could not create temp project"; return; }
  _run_bundle_checks "$label" "$installer" "$cfgrel" "$homevar" "$home" "$proj"
  rm -rf -- "${home:?}" "${proj:?}"
}
|
|
97
|
+
|
|
98
|
+
# Drive the proof against both shipped bundles, then report the tally.
# The script's exit status is 0 only when no check failed.
run_bundle "Claude Code" "$ROOT/dist/claude-code/install.sh" ".claude/settings.json" "CLAUDE_PROJECT_DIR"
run_bundle "Codex" "$ROOT/dist/codex/install.sh" ".codex/hooks.json" "CODEX_HOME"

printf '\n'
printf '%s\n' "──────────────────────────────────"
printf '%s\n' "prove-teeth: $pass passed, $fail failed"
if [ "$fail" -eq 0 ]; then
  printf '%s\n' "PROOF: shipped Claude Code + Codex bundles enforce goal-fit and re-ground on compaction."
  exit 0
fi
exit 1
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Anti-gaming + trust regression suite — the tests that prove the gate, the CI
|
|
3
|
+
# anchor, and their protections actually work. These MUST run in a required CI
|
|
4
|
+
# lane so that weakening the gate code (stop-goal-fit.js, trust-reconcile.js,
|
|
5
|
+
# config-protection.js, evidence-capture.js, flow-resolver.ts, the producer) is
|
|
6
|
+
# caught: if any of these fail, the lane goes red and the change cannot merge.
|
|
7
|
+
#
|
|
8
|
+
# Convergence red-team (Round 5/7) found these ran in NO required check, so an
|
|
9
|
+
# agent could edit the gate and the proof-tests never ran. This closes that.
|
|
10
|
+
set -uo pipefail
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# Every path in TESTS is relative to the repo root. Fail loudly if the root
# cannot be entered, rather than reporting each test as "MISSING".
if [[ -z "$ROOT_DIR" ]] || ! cd "$ROOT_DIR"; then
  echo "ANTI-GAMING SUITE FAILED — cannot enter repo root: ${ROOT_DIR:-<unresolved>}"
  exit 1
fi

# The regression tests this suite runs. A listed file that is absent counts as
# a failure, not a skip.
TESTS=(
  "evals/acceptance/prove-capture-teeth.sh"
  "evals/acceptance/prove-capture-teeth-declared.sh"
  "evals/integration/test_gate_lockdown.sh"
  "evals/integration/test_gate_bypass_chain.sh"
  "evals/integration/test_reconcile_soundness.sh"
  "evals/integration/test_captured_fail_reconciliation.sh"
  "evals/integration/test_command_log_integrity.sh"
  "evals/integration/test_resolvefirststep_security.sh"
  "evals/integration/test_enforcer_expects_driven.sh"
  "evals/integration/test_goal_fit_rederive.sh"
  "evals/integration/test_flowdef_session_activation.sh"
  "evals/integration/test_trust_reconcile.sh"
  "evals/integration/test_trust_checkpoint.sh"
  "evals/integration/test_checkpoint_signing.sh"
  "evals/integration/test_mint_attestation.sh"
  "evals/integration/test_publish_delivery.sh"
  "evals/integration/test_phase_map_and_gate_claim.sh"
)

fail=0
failed=()   # names of tests that failed or were missing, for the final summary
for t in "${TESTS[@]}"; do
  if [[ ! -f "$t" ]]; then
    echo "MISSING anti-gaming test: $t — refusing to pass (a removed regression test is a red flag)"
    fail=1
    failed+=("$t (missing)")
    continue
  fi
  echo "=== anti-gaming: $t ==="
  if bash "$t"; then
    echo " PASS: $t"
  else
    echo " FAIL: $t"
    fail=1
    failed+=("$t")
  fi
done

if [[ "$fail" -ne 0 ]]; then
  # failed[] is non-empty whenever fail is set, so the expansion is safe under set -u.
  printf ' failed: %s\n' "${failed[@]}"
  echo "ANTI-GAMING SUITE FAILED — the gate / CI anchor / protections regressed or a regression test was removed."
  exit 1
fi
echo "ANTI-GAMING SUITE PASSED (${#TESTS[@]} tests)."
|
package/evals/ci/run-baseline.sh
CHANGED
|
@@ -39,6 +39,7 @@ CHECKS=(
|
|
|
39
39
|
"Telemetry doctor integration|bash evals/integration/test_telemetry_doctor.sh"
|
|
40
40
|
"Utterance check integration|bash evals/integration/test_utterance_check.sh"
|
|
41
41
|
"Pull work provider integration|bash evals/integration/test_pull_work_provider.sh"
|
|
42
|
+
"Anti-gaming and trust suite|bash evals/ci/antigaming-suite.sh"
|
|
42
43
|
"Usage feedback import integration|bash evals/integration/test_usage_feedback_import.sh"
|
|
43
44
|
"Usage feedback outcomes integration|bash evals/integration/test_usage_feedback_outcomes.sh"
|
|
44
45
|
"Usage feedback report integration|bash evals/integration/test_usage_feedback_report.sh"
|
|
@@ -82,6 +83,7 @@ LANE_RUNTIME_AND_KIT=(
|
|
|
82
83
|
"Telemetry doctor integration"
|
|
83
84
|
"Utterance check integration"
|
|
84
85
|
"Pull work provider integration"
|
|
86
|
+
"Anti-gaming and trust suite"
|
|
85
87
|
)
|
|
86
88
|
|
|
87
89
|
LANE_USAGE_FEEDBACK=(
|
package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/flows/review.flow.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "missing.extension.asset.review",
|
|
3
|
+
"version": "1.0",
|
|
4
|
+
"steps": [
|
|
5
|
+
{ "id": "review", "next": "done" },
|
|
6
|
+
{ "id": "done", "next": null }
|
|
7
|
+
],
|
|
8
|
+
"gates": {
|
|
9
|
+
"review-gate": {
|
|
10
|
+
"step": "review",
|
|
11
|
+
"expects": [
|
|
12
|
+
{
|
|
13
|
+
"id": "review-evidence",
|
|
14
|
+
"kind": "trust.bundle",
|
|
15
|
+
"required": true,
|
|
16
|
+
"description": "Review evidence has been recorded.",
|
|
17
|
+
"bundle_claim": {
|
|
18
|
+
"claimType": "example.review.evidence",
|
|
19
|
+
"subjectType": "artifact",
|
|
20
|
+
"accepted_statuses": ["trusted", "accepted"]
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
]
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|