@kontourai/flow-agents 1.4.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/CODEOWNERS +29 -0
- package/.github/actions/trust-verify/action.yml +145 -0
- package/.github/workflows/ci.yml +11 -4
- package/.github/workflows/kit-gates-demo.yml +2 -2
- package/.github/workflows/publish-npm.yml +10 -2
- package/.github/workflows/release-please.yml +1 -1
- package/.github/workflows/runtime-compat.yml +1 -1
- package/.github/workflows/trust-reconcile.yml +113 -0
- package/AGENTS.md +13 -0
- package/CHANGELOG.md +103 -0
- package/CONTRIBUTING.md +4 -4
- package/README.md +1 -0
- package/agents/tool-planner.json +1 -1
- package/build/src/cli/init.js +242 -20
- package/build/src/cli/validate-workflow-artifacts.js +19 -2
- package/build/src/cli/verify.d.ts +1 -0
- package/build/src/cli/verify.js +90 -0
- package/build/src/cli/workflow-sidecar.d.ts +316 -8
- package/build/src/cli/workflow-sidecar.js +1996 -91
- package/build/src/cli.js +2 -3
- package/build/src/lib/flow-resolver.d.ts +111 -0
- package/build/src/lib/flow-resolver.js +308 -0
- package/build/src/tools/build-universal-bundles.js +34 -22
- package/build/src/tools/generate-context-map.js +3 -16
- package/build/src/tools/validate-source-tree.d.ts +1 -1
- package/build/src/tools/validate-source-tree.js +42 -162
- package/context/contracts/artifact-contract.md +10 -0
- package/context/contracts/delivery-contract.md +1 -0
- package/context/contracts/review-contract.md +1 -0
- package/context/contracts/verification-contract.md +2 -0
- package/context/gate-awareness.md +39 -0
- package/context/scripts/hooks/stop-goal-fit.js +632 -70
- package/docs/adr/0001-flow-agents-consumes-flow.md +1 -1
- package/docs/adr/0002-flow-kits-as-extension-unit.md +1 -1
- package/docs/adr/0004-gates-expect-surface-claims.md +2 -0
- package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +2 -0
- package/docs/adr/0007-skill-audit.md +1 -1
- package/docs/adr/0009-canonical-hook-core-kit-boundary.md +95 -0
- package/docs/adr/0010-workflow-trust-state-as-hachure-bundle.md +139 -0
- package/docs/adr/0011-mcp-posture.md +100 -0
- package/docs/adr/0012-agent-coordination-as-liveness-claims.md +119 -0
- package/docs/adr/0013-context-lifecycle.md +151 -0
- package/docs/adr/0014-core-vs-domain-kit-boundary.md +143 -0
- package/docs/adr/0015-flow-flow-agents-boundary-reconciliation.md +120 -0
- package/docs/adr/0016-three-hard-boundary-model.md +71 -0
- package/docs/adr/0017-anti-gaming-trust-security-model.md +155 -0
- package/docs/agent-system-guidebook.md +5 -12
- package/docs/context-map.md +4 -10
- package/docs/index.md +3 -2
- package/docs/integrations/framework-adapter.md +19 -6
- package/docs/integrations/index.md +2 -2
- package/docs/north-star.md +4 -4
- package/docs/operating-layers.md +3 -3
- package/docs/plans/adr-0010-phase2-gate-recompute.md +55 -0
- package/docs/repository-structure.md +2 -2
- package/docs/skills-map.md +1 -0
- package/docs/spec/runtime-hook-surface.md +62 -9
- package/docs/standards-register.md +3 -3
- package/docs/survey-utterance-check.md +1 -1
- package/docs/trust-anchor-adoption.md +197 -0
- package/docs/verifiable-trust.md +95 -0
- package/docs/veritas-integration.md +2 -2
- package/docs/workflow-usage-guide.md +69 -0
- package/evals/acceptance/DEMO-false-completion.md +144 -0
- package/evals/acceptance/demo-cast.sh +92 -0
- package/evals/acceptance/demo-false-completion.sh +72 -0
- package/evals/acceptance/demo-real-evidence.sh +104 -0
- package/evals/acceptance/demo.tape +29 -0
- package/evals/acceptance/prove-capture-teeth-declared.sh +335 -0
- package/evals/acceptance/prove-capture-teeth.sh +114 -0
- package/evals/acceptance/prove-teeth.sh +105 -0
- package/evals/ci/antigaming-suite.sh +55 -0
- package/evals/ci/run-baseline.sh +2 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/kit.json +20 -0
- package/evals/fixtures/flow-kit-repository/valid-unknown-extension/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/valid-unknown-extension/kit.json +18 -0
- package/evals/integration/test_builder_step_producers.sh +379 -0
- package/evals/integration/test_bundle_install.sh +35 -71
- package/evals/integration/test_bundle_lifecycle.sh +39 -2
- package/evals/integration/test_captured_fail_reconciliation.sh +820 -0
- package/evals/integration/test_checkpoint_signing.sh +489 -0
- package/evals/integration/test_claim_lookup.sh +352 -0
- package/evals/integration/test_command_log_fork_classification.sh +134 -0
- package/evals/integration/test_command_log_integrity.sh +275 -0
- package/evals/integration/test_context_map.sh +0 -2
- package/evals/integration/test_dual_emit_flow_step.sh +278 -0
- package/evals/integration/test_enforcer_expects_driven.sh +281 -0
- package/evals/integration/test_evidence_capture_hook.sh +185 -0
- package/evals/integration/test_flow_kit_repository.sh +2 -0
- package/evals/integration/test_flowdef_session_activation.sh +273 -0
- package/evals/integration/test_flowdef_session_history_preservation.sh +250 -0
- package/evals/integration/test_gate_bypass_chain.sh +448 -0
- package/evals/integration/test_gate_lockdown.sh +1137 -0
- package/evals/integration/test_gate_review_inquiry_records.sh +399 -0
- package/evals/integration/test_goal_fit_escape_hatch.sh +73 -0
- package/evals/integration/test_goal_fit_hook.sh +69 -4
- package/evals/integration/test_goal_fit_rederive.sh +263 -0
- package/evals/integration/test_install_merge.sh +1176 -0
- package/evals/integration/test_kit_identity_trust.sh +393 -0
- package/evals/integration/test_mint_attestation.sh +373 -0
- package/evals/integration/test_phase_map_and_gate_claim.sh +365 -0
- package/evals/integration/test_publish_delivery.sh +269 -0
- package/evals/integration/test_reconcile_soundness.sh +528 -0
- package/evals/integration/test_resolvefirststep_security.sh +208 -0
- package/evals/integration/test_session_resume_roundtrip.sh +286 -0
- package/evals/integration/test_trust_checkpoint.sh +325 -0
- package/evals/integration/test_trust_reconcile.sh +293 -0
- package/evals/integration/test_verify_cli.sh +208 -0
- package/evals/integration/test_workflow_sidecar_writer.sh +549 -34
- package/evals/lib/node.sh +0 -6
- package/evals/run.sh +47 -0
- package/evals/static/test_workflow_skills.sh +6 -13
- package/install.sh +0 -7
- package/integrations/strands-ts/README.md +25 -15
- package/integrations/veritas/flow-agents.adapter.json +1 -2
- package/kits/builder/flows/build.flow.json +59 -12
- package/kits/builder/kit.json +85 -15
- package/kits/builder/skills/continue-work/SKILL.md +116 -0
- package/kits/builder/skills/deliver/SKILL.md +36 -6
- package/kits/builder/skills/design-probe/SKILL.md +28 -0
- package/kits/builder/skills/execute-plan/SKILL.md +9 -1
- package/kits/builder/skills/gate-review/SKILL.md +234 -0
- package/kits/builder/skills/learning-review/SKILL.md +30 -0
- package/kits/builder/skills/pickup-probe/SKILL.md +29 -0
- package/kits/builder/skills/plan-work/SKILL.md +13 -1
- package/kits/builder/skills/pull-work/SKILL.md +19 -0
- package/kits/knowledge/adapters/default-store/index.js +38 -0
- package/kits/knowledge/adapters/flow-runner/index.js +1620 -0
- package/kits/knowledge/adapters/obsidian-store/index.js +36 -6
- package/kits/knowledge/docs/store-contract.md +314 -0
- package/kits/knowledge/evals/audit-freshness/suite.test.js +368 -0
- package/kits/knowledge/evals/canonicalize-category/suite.test.js +383 -0
- package/kits/knowledge/evals/contract-suite/suite.test.js +111 -0
- package/kits/knowledge/evals/detect-contradictions/suite.test.js +324 -0
- package/kits/knowledge/evals/entities/suite.test.js +40 -0
- package/kits/knowledge/evals/glossary-sync/suite.test.js +416 -0
- package/kits/knowledge/evals/hygiene-review/suite.test.js +396 -0
- package/kits/knowledge/evals/retirement/suite.test.js +145 -0
- package/kits/knowledge/flows/audit-freshness.flow.json +44 -0
- package/kits/knowledge/flows/canonicalize-category.flow.json +44 -0
- package/kits/knowledge/flows/detect-contradictions.flow.json +44 -0
- package/kits/knowledge/flows/glossary-sync.flow.json +61 -0
- package/kits/knowledge/flows/hygiene-review.flow.json +43 -0
- package/kits/knowledge/kit.json +51 -1
- package/package.json +6 -6
- package/packaging/conformance/README.md +10 -2
- package/packaging/conformance/fixtures/evidence-capture--allow-records-command.json +29 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-bundle-disputed-claim.json +29 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-capture-contradicts-claimed-pass.json +30 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-mode.json +23 -0
- package/packaging/conformance/fixtures/stop-goal-fit--off-mode.json +24 -0
- package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +5 -2
- package/packaging/conformance/fixtures/stop-goal-fit--warn-no-bundle.json +23 -0
- package/packaging/conformance/fixtures/workflow-steering--reground-active-prompt.json +30 -0
- package/packaging/conformance/fixtures/workflow-steering--reground-session-start.json +30 -0
- package/packaging/conformance/run-conformance.js +1 -1
- package/scripts/README.md +2 -1
- package/scripts/build-universal-bundles.js +0 -1
- package/scripts/ci/mint-attestation.js +221 -0
- package/scripts/ci/trust-reconcile.js +545 -0
- package/scripts/hooks/config-protection.js +423 -1
- package/scripts/hooks/evidence-capture.js +348 -0
- package/scripts/hooks/lib/liveness-read.js +113 -0
- package/scripts/hooks/run-hook.js +6 -1
- package/scripts/hooks/stop-goal-fit.js +1524 -79
- package/scripts/hooks/workflow-steering.js +135 -5
- package/scripts/install-codex-home.sh +39 -0
- package/scripts/install-merge.js +330 -0
- package/scripts/repair-command-log.js +115 -0
- package/src/cli/init.ts +218 -20
- package/src/cli/validate-workflow-artifacts.ts +18 -2
- package/src/cli/verify.ts +100 -0
- package/src/cli/workflow-sidecar.ts +2127 -84
- package/src/cli.ts +2 -3
- package/src/lib/flow-resolver.ts +369 -0
- package/src/tools/build-universal-bundles.ts +34 -21
- package/src/tools/generate-context-map.ts +3 -17
- package/src/tools/validate-source-tree.ts +44 -104
- package/build/src/tools/filter-installed-packs.d.ts +0 -2
- package/build/src/tools/filter-installed-packs.js +0 -135
- package/packaging/packs.json +0 -49
- package/scripts/filter-installed-packs.js +0 -2
- package/src/tools/filter-installed-packs.ts +0 -132
|
@@ -0,0 +1,820 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# test_captured_fail_reconciliation.sh — Security regression proof for the
# namespace-agnostic captured-FAIL reconciliation (AC1) and related fixes.
#
# Closes the critical allowlist false-completion bypass:
# A kit-typed claim (e.g. builder.verify.tests) asserting a command passed, while
# command-log.jsonl recorded that command as FAIL, used to SHIP (exit 0) whenever
# the gate didn't resolve the exact flow declaring that exact claimType — including
# the default no-active-flow case (current.json has no active_flow_id) and any
# agent-chosen non-declared type.
#
# Tests:
#   1. BYPASS CLOSED: kit-typed claim (builder.verify.tests) + command-log FAIL,
#      NO active flow → PRE-FIX exit 0, POST-FIX exit 2 (namespace-agnostic false-completion).
#   2. BYPASS CLOSED: agent-chosen non-declared type + active flow + command-log FAIL
#      → POST-FIX exit 2.
#   3. NO OVER-BLOCK (a): clean session, no captured fails → not blocked.
#   4. NO OVER-BLOCK (b): fail-then-re-run-to-pass (latest capture PASS) → not blocked.
#   5. NO OVER-BLOCK (c): acknowledged failure (evidence marks command disputed/failed) → not blocked.
#   6. NO OVER-BLOCK (d): no-command doc/policy session (NO evidence.execution.label,
#      no command-log) → NOT blocked (fixes #216 over-block).
#   7. AC3 empty-expects regression: declared-only bundle + fake flow with expects:[]
#      → gate misconfiguration HARD_BLOCK (two-part dependency: union form + empty-expects guard).
#
# Deterministic, no model spend, self-cleaning.
# Usage: bash evals/integration/test_captured_fail_reconciliation.sh

# errexit is not enabled here; assertion failures are tallied in $errors by _fail.
set -uo pipefail

# Repository root is two directories above this script; GATE is the hook under test.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
GATE="$ROOT/scripts/hooks/stop-goal-fit.js"

export FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000

# Every fixture lives under $TMP; abort early if the scratch directory cannot be made,
# otherwise later paths would be built from an empty prefix.
TMP="$(mktemp -d)" || { echo "FATAL: mktemp -d failed" >&2; exit 1; }
errors=0

# _pass MSG — print a passing assertion line.
_pass() { echo " PASS: $1"; }
# _fail MSG — print a failing assertion line and bump the global failure counter.
_fail() { echo " FAIL: $1"; errors=$((errors + 1)); }

# Remove the scratch directory on any exit path. `--` stops option parsing and
# `:?` refuses to run rm -rf with an empty/unset path.
cleanup() { rm -rf -- "${TMP:?}"; }
trap cleanup EXIT
|
|
41
|
+
|
|
42
|
+
# ─── Helper: seed a delivered (terminal) workflow artifact ────────────────────
# Arguments: $1 - project directory to populate
#            $2 - task slug (used for the artifact directory and file names)
# Writes:    $1/AGENTS.md
#            $1/.flow-agents/$2/state.json      (status=delivered, phase=done)
#            $1/.flow-agents/$2/$2--deliver.md  (checked DoD + "Verdict: PASS")
# Returns:   non-zero if the artifact directory cannot be created.
seed_delivered() { # $1=dir $2=slug
  local p="$1" slug="$2"
  local art="$p/.flow-agents/$slug"

  # Bail out instead of letting every following redirect fail on a missing dir.
  mkdir -p "$art" || return 1
  printf '# Repo\n' > "$p/AGENTS.md"

  # No trailing newline: the JSON is written with '%s' exactly as given.
  printf '%s' "{\"schema_version\":\"1.0\",\"task_slug\":\"$slug\",\"status\":\"delivered\",\"phase\":\"done\",\"updated_at\":\"2026-06-27T00:00:00Z\",\"next_action\":{\"status\":\"done\",\"summary\":\"done\"}}" \
    > "$art/state.json"

  # Unquoted delimiter: $slug must expand inside the document heading.
  cat > "$art/$slug--deliver.md" << MD
# $slug

branch: main
status: delivered
type: deliver

## Definition Of Done
- [x] tests pass

## Goal Fit Gate
- [x] acceptance verified

### Verdict: PASS
MD
}
|
|
65
|
+
|
|
66
|
+
# ─── Helper: write a bundle with kit-typed claim (builder.verify.tests) asserting pass ──
# Evidence item has execution.label="npm test" (the critical scenario).
# Arguments: $1 - path of the bundle file to write
#            $2 - task slug (claim subjectId becomes "<slug>/tests")
#            $3 - claim value (optional, defaults to "pass")
# Requires:  python3 on PATH (the bundle is serialised with json.dump).
write_kit_pass_bundle() { # $1=bundle_path $2=slug $3=claim_value(opt)
  local claim_val="${3:-pass}"
  # Quoted delimiter: the Python source is passed through literally.
  python3 - "$1" "$2" "$claim_val" << 'PY'
import json, sys
bundle_path, slug, claim_val = sys.argv[1], sys.argv[2], sys.argv[3]
bundle = {
    "schemaVersion": 3, "source": "flow-agents/workflow-sidecar",
    "claims": [{
        "id": "c1", "subjectId": slug + "/tests", "subjectType": "flow-step",
        "claimType": "builder.verify.tests",
        "fieldOrBehavior": "npm test",
        "value": claim_val, "impactLevel": "high", "status": "verified",
        "createdAt": "2026-06-27T00:00:00Z", "updatedAt": "2026-06-27T00:00:00Z"
    }],
    "evidence": [{
        "id": "ev1", "claimId": "c1",
        "evidenceType": "command_output", "method": "capture",
        "sourceRef": "command-log.jsonl",
        "excerptOrSummary": "npm test passed (agent claimed)",
        "observedAt": "2026-06-27T00:00:00Z", "collectedBy": "agent",
        "passing": True,
        "execution": {"label": "npm test", "exitCode": 0}
    }],
    "policies": [], "events": []
}
# Context manager guarantees the file is flushed and closed before python exits.
with open(bundle_path, 'w') as fh:
    json.dump(bundle, fh)
PY
}
|
|
96
|
+
|
|
97
|
+
# ─── Helper: write a single-record command-log for `npm test` ────────────────
# Shared writer behind write_fail_log / write_pass_log so the record shape is
# defined in exactly one place.
# Arguments: $1 - log path (truncated and rewritten)
#            $2 - observedResult string ("fail" or "pass")
#            $3 - numeric exitCode recorded in the entry
_write_npm_test_log() {
  printf '{"command":"npm test","observedResult":"%s","exitCode":%s,"capturedAt":"2026-06-27T00:00:00Z","source":"postToolUse-capture"}\n' \
    "$2" "$3" > "$1"
}

# ─── Helper: write a command-log with npm test FAIL ──────────────────────────
write_fail_log() { # $1=log_path
  _write_npm_test_log "$1" fail 1
}

# ─── Helper: write a command-log with npm test PASS ──────────────────────────
write_pass_log() { # $1=log_path
  _write_npm_test_log "$1" pass 0
}
|
|
106
|
+
|
|
107
|
+
# ─── Helper: run gate in block mode ──────────────────────────────────────────
# Feeds a Stop hook event for the given working directory to the gate script
# ($GATE) on stdin, with block mode, backstop=skip and a high max-blocks cap.
# Arguments: $1 - cwd placed in the hook payload
# Outputs:   the gate's stdout and stderr, merged, on stdout
# Returns:   the gate's exit code
run_gate() { # $1=cwd, returns exit code; output on stdout
  # JSON-escape backslashes first, then double quotes, so a path containing
  # either still produces a valid payload.
  local cwd_json
  cwd_json=${1//\\/\\\\}
  cwd_json=${cwd_json//\"/\\\"}

  FLOW_AGENTS_GOAL_FIT_MODE=block \
  FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
  FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000 \
    node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$cwd_json\"}"
}
|
|
114
|
+
|
|
115
|
+
# Suite banner printed once before the individual test sections.
echo ""
echo "================================================================="
echo " Namespace-Agnostic Captured-FAIL Reconciliation"
echo " (AC1 allowlist bypass closure + AC2 no-over-block)"
echo "================================================================="
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 1: BYPASS CLOSED — kit-typed claim + command-log FAIL, NO active flow
# ─────────────────────────────────────────────────────────────────────────────
echo ""
echo "=== 1. BYPASS CLOSED: kit-typed claim (builder.verify.tests) + command-log FAIL, NO active_flow_id ==="
echo " PRE-FIX: gate was blind to builder.verify.tests (not workflow.* and no active flow)"
echo " POST-FIX: capturedFailReconciliation catches it namespace-agnostically"

T1="$TMP/t1-bypass"
seed_delivered "$T1" "bypass-kit"

# NO active_flow_id in current.json
printf '%s' '{"artifact_dir":"bypass-kit"}' > "$T1/.flow-agents/current.json"

# Bundle claims `npm test` passed; the capture log says it failed.
write_kit_pass_bundle "$T1/.flow-agents/bypass-kit/trust.bundle" "bypass-kit"
write_fail_log "$T1/.flow-agents/bypass-kit/command-log.jsonl"

echo ""
echo "--- 1a. PRE-FIX simulation: show the gate was blind ---"
# Self-contained re-statement of the old allowlist predicate; it does not call
# the gate. An explicit if/else is used so the failure branch runs only when
# node itself fails.
if node -e "
// PRE-FIX: captureCrossReference only checked workflow.* OR declared types.
// No active_flow_id → declaredClaimTypes = null → only workflow.* selected.
// builder.verify.tests does NOT start with 'workflow.' → NOT selected → missed.
const claimType = 'builder.verify.tests';
const declaredClaimTypes = null; // no active flow

// Old code: bundleClaimedPassCommandChecks only included claims in the allowlist
const inAllowlist = claimType.startsWith('workflow.')
|| (declaredClaimTypes != null && declaredClaimTypes.has(claimType));
console.log(' builder.verify.tests in allowlist (pre-fix):', inAllowlist);
console.log(' PRE-FIX: 0 claimed-pass checks → no cross-reference → exit 0 (BYPASS)');
if (inAllowlist) { console.error('ERROR: pre-fix simulation incorrect'); process.exit(1); }
" 2>&1; then
  _pass "PRE-FIX: builder.verify.tests NOT in allowlist → captureCrossReference blind (exit 0)"
else
  _fail "PRE-FIX simulation error"
fi

echo ""
echo "--- 1b. POST-FIX: capturedFailReconciliation blocks namespace-agnostically ---"
# NOTE(review): the script header only sets -u and pipefail, so the `set -e`
# below leaves errexit ON for everything that follows — confirm this is intended.
set +e
t1_out="$(run_gate "$T1")"
t1_exit=$?
set -e

echo " POST-FIX exit code: $t1_exit (expected 2)"
if [ "$t1_exit" -eq 2 ]; then
  _pass "POST-FIX: kit-typed false-completion BLOCKED (exit 2)"
else
  _fail "POST-FIX: expected exit 2, got $t1_exit. output: ${t1_out:0:300}"
fi

# Here-strings replace `echo … | grep -q`: under pipefail that pipeline can
# report failure when grep exits on the first match and echo gets SIGPIPE.
# -F matches the needles literally (the '.' in the claimType is not a wildcard).
if grep -qF -- "caught false-completion" <<< "$t1_out"; then
  _pass "POST-FIX: emits 'caught false-completion' (namespace-agnostic)"
else
  _fail "POST-FIX: missing 'caught false-completion'. output: ${t1_out:0:300}"
fi

if grep -qF -- "npm test" <<< "$t1_out"; then
  _pass "POST-FIX: warning names the contradicted command (npm test)"
else
  _fail "POST-FIX: warning does not name the command. output: ${t1_out:0:300}"
fi

if grep -qF -- "builder.verify.tests" <<< "$t1_out"; then
  _pass "POST-FIX: warning names the claimType (builder.verify.tests)"
else
  _fail "POST-FIX: warning does not name the claimType. output: ${t1_out:0:300}"
fi

echo ""
echo "--- 1c. Exit code summary ---"
echo " PRE-FIX exit code (simulated): 0 — builder.verify.tests not in allowlist → gate blind"
echo " POST-FIX exit code (actual): $t1_exit — capturedFailReconciliation blocks regardless of namespace"
if [ "$t1_exit" -eq 2 ]; then
  echo " Result: BYPASS CLOSED (pre=0, post=2)"
else
  echo " Result: BYPASS STILL OPEN"
fi
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 2: BYPASS CLOSED — agent-chosen non-declared type + active flow + FAIL
# ─────────────────────────────────────────────────────────────────────────────
echo ""
echo "=== 2. BYPASS CLOSED: agent-chosen non-declared type + active flow + command-log FAIL ==="

T2="$TMP/t2-nondeclared"
seed_delivered "$T2" "nondeclared"

# current.json: active flow (builder.build/verify)
printf '%s' '{"artifact_dir":"nondeclared","active_flow_id":"builder.build","active_step_id":"verify"}' \
  > "$T2/.flow-agents/current.json"

# Fake flow defs dir (safe, not agent-writable)
FLOW_DEFS_DIR="$TMP/flows"
mkdir -p "$FLOW_DEFS_DIR"
cat > "$FLOW_DEFS_DIR/builder.build.flow.json" << 'FLOWJSON'
{
  "id": "builder.build",
  "version": "1.0",
  "gates": {
    "verify-gate": {
      "step": "verify",
      "expects": [
        {
          "id": "tests-evidence",
          "kind": "trust.bundle",
          "required": true,
          "bundle_claim": {
            "claimType": "builder.verify.tests",
            "subjectType": "flow-step",
            "accepted_statuses": ["trusted", "accepted"]
          }
        }
      ]
    }
  }
}
FLOWJSON

# Bundle: agent-chosen NON-declared claimType (e.g. "acme.custom.verify") claiming npm test passed
python3 - "$T2/.flow-agents/nondeclared/trust.bundle" "nondeclared" << 'PY'
import json, sys
bundle_path, slug = sys.argv[1], sys.argv[2]
bundle = {
    "schemaVersion": 3, "source": "flow-agents/workflow-sidecar",
    "claims": [{
        "id": "c1", "subjectId": slug + "/tests", "subjectType": "custom",
        "claimType": "acme.custom.verify",  # neither workflow.* NOR declared by the flow
        "fieldOrBehavior": "npm test",
        "value": "pass", "impactLevel": "high", "status": "verified",
        "createdAt": "2026-06-27T00:00:00Z", "updatedAt": "2026-06-27T00:00:00Z"
    }],
    "evidence": [{
        "id": "ev1", "claimId": "c1",
        "evidenceType": "command_output", "method": "capture",
        "sourceRef": "command-log.jsonl",
        "excerptOrSummary": "npm test passed (agent claimed)",
        "observedAt": "2026-06-27T00:00:00Z", "collectedBy": "agent",
        "passing": True,
        "execution": {"label": "npm test", "exitCode": 0}
    }],
    "policies": [], "events": []
}
# Close the file deterministically before python exits.
with open(bundle_path, 'w') as fh:
    json.dump(bundle, fh)
PY
write_fail_log "$T2/.flow-agents/nondeclared/command-log.jsonl"

# Same invocation as every other test (run_gate), with the flow-definitions
# directory added to the gate's environment for this one call.
set +e
t2_out="$(FLOW_AGENTS_FLOW_DEFS_DIR="$FLOW_DEFS_DIR" run_gate "$T2")"
t2_exit=$?
set -e

echo " Non-declared type (acme.custom.verify) + active flow + FAIL: exit=$t2_exit (expected 2)"
if [ "$t2_exit" -eq 2 ]; then
  _pass "Non-declared type with FAIL: BLOCKED (exit 2)"
else
  _fail "Non-declared type with FAIL: NOT blocked (exit $t2_exit). output: ${t2_out:0:300}"
fi
# Two literal needles via -F/-e instead of the non-POSIX BRE `\|`; the
# here-string avoids an echo|grep -q pipeline that pipefail can mis-report.
if grep -qF -e "caught false-completion" -e "unaccounted at completion" <<< "$t2_out"; then
  _pass "Non-declared type: 'caught false-completion' or 'unaccounted' emitted"
else
  _fail "Non-declared type: expected blocking message not found. output: ${t2_out:0:300}"
fi
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 3: NO OVER-BLOCK (a) — clean session, no captured fails
# ─────────────────────────────────────────────────────────────────────────────
echo ""
echo "=== 3. NO OVER-BLOCK (a): clean session, no captured fails ==="

T3="$TMP/t3-clean"
seed_delivered "$T3" "clean-sess"
printf '%s' '{"artifact_dir":"clean-sess"}' > "$T3/.flow-agents/current.json"
# Claimed pass and captured pass agree.
write_kit_pass_bundle "$T3/.flow-agents/clean-sess/trust.bundle" "clean-sess"
write_pass_log "$T3/.flow-agents/clean-sess/command-log.jsonl"

set +e
t3_out="$(run_gate "$T3")"
t3_exit=$?
set -e

# Count output lines carrying either reconciliation message. grep -c prints 0
# and exits 1 when nothing matches, hence the `|| true`. Literal needles (-F)
# with -e replace the non-POSIX `\|`; the here-string replaces echo|grep.
blocked_new="$(grep -cF -e "unaccounted at completion" -e "namespace-agnostic caught false-completion" <<< "$t3_out" || true)"
echo " Clean session (latest=PASS): exit=$t3_exit, new_logic_blocks=$blocked_new"
if [ "$blocked_new" -eq 0 ]; then
  _pass "Clean session NOT blocked by new reconciliation logic"
else
  _fail "Clean session INCORRECTLY blocked by new logic. output: ${t3_out:0:300}"
fi
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 4: NO OVER-BLOCK (b) — fail-then-re-run-to-pass (latest=PASS)
# ─────────────────────────────────────────────────────────────────────────────
echo ""
echo "=== 4. NO OVER-BLOCK (b): fail-then-re-run-to-pass (latest capture PASS) ==="

T4="$TMP/t4-rerun"
seed_delivered "$T4" "rerun-pass"
printf '%s' '{"artifact_dir":"rerun-pass"}' > "$T4/.flow-agents/current.json"
write_kit_pass_bundle "$T4/.flow-agents/rerun-pass/trust.bundle" "rerun-pass"
# FAIL first, then PASS (re-run fixed it — latest is PASS)
{
  printf '%s\n' '{"command":"npm test","observedResult":"fail","exitCode":1,"capturedAt":"2026-06-27T00:00:00Z","source":"test"}'
  printf '%s\n' '{"command":"npm test","observedResult":"pass","exitCode":0,"capturedAt":"2026-06-27T00:00:01Z","source":"test"}'
} > "$T4/.flow-agents/rerun-pass/command-log.jsonl"

set +e
t4_out="$(run_gate "$T4")"
t4_exit=$?
set -e

# grep -c exits 1 on zero matches (still printing 0), hence `|| true`.
# Literal needles with -F/-e instead of `\|`; here-string instead of echo|grep.
blocked_new="$(grep -cF -e "unaccounted at completion" -e "namespace-agnostic caught false-completion" <<< "$t4_out" || true)"
echo " Fail-then-re-run-to-pass (latest=PASS): exit=$t4_exit, new_logic_blocks=$blocked_new"
if [ "$blocked_new" -eq 0 ]; then
  _pass "Fail-then-re-run-to-pass NOT blocked (latest capture PASS clears it)"
else
  _fail "Fail-then-re-run-to-pass INCORRECTLY blocked. output: ${t4_out:0:300}"
fi
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 5: NO OVER-BLOCK (c) — acknowledged failure (evidence disputed/failed)
# ─────────────────────────────────────────────────────────────────────────────
echo ""
echo "=== 5. NO OVER-BLOCK (c): acknowledged failure (evidence marks command disputed/failed) ==="

T5="$TMP/t5-ack"
seed_delivered "$T5" "ack-fail"
printf '%s' '{"artifact_dir":"ack-fail"}' > "$T5/.flow-agents/current.json"

# Bundle: claim acknowledges failure (status=disputed, value=fail)
python3 - "$T5/.flow-agents/ack-fail/trust.bundle" "ack-fail" << 'PY'
import json, sys
bundle_path, slug = sys.argv[1], sys.argv[2]
bundle = {
    "schemaVersion": 3, "source": "flow-agents/workflow-sidecar",
    "claims": [{
        "id": "c1", "subjectId": slug + "/tests", "subjectType": "flow-step",
        "claimType": "builder.verify.tests",
        "fieldOrBehavior": "npm test",
        "value": "fail",       # acknowledges failure
        "impactLevel": "low",  # low-impact avoids surface-unavailable block
        "status": "disputed",  # acknowledges failure
        "createdAt": "2026-06-27T00:00:00Z", "updatedAt": "2026-06-27T00:00:00Z"
    }],
    "evidence": [{
        "id": "ev1", "claimId": "c1",
        "evidenceType": "command_output", "method": "capture",
        "sourceRef": "command-log.jsonl",
        "excerptOrSummary": "npm test failed (acknowledged in evidence)",
        "observedAt": "2026-06-27T00:00:00Z", "collectedBy": "agent",
        "passing": False,
        "execution": {"label": "npm test", "exitCode": 1}
    }],
    "policies": [], "events": []
}
# Close the file deterministically before python exits.
with open(bundle_path, 'w') as fh:
    json.dump(bundle, fh)
PY
write_fail_log "$T5/.flow-agents/ack-fail/command-log.jsonl"

set +e
t5_out="$(run_gate "$T5")"
t5_exit=$?
set -e

# grep -c exits 1 on zero matches (still printing 0), hence `|| true`.
# Literal needles with -F/-e instead of `\|`; here-string instead of echo|grep.
blocked_new="$(grep -cF -e "unaccounted at completion" -e "namespace-agnostic caught false-completion" <<< "$t5_out" || true)"
echo " Acknowledged failure (status=disputed, value=fail): exit=$t5_exit, new_logic_blocks=$blocked_new"
if [ "$blocked_new" -eq 0 ]; then
  _pass "Acknowledged failure NOT blocked (agent owns the failure in evidence)"
else
  _fail "Acknowledged failure INCORRECTLY blocked. output: ${t5_out:0:300}"
fi
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 6: NO OVER-BLOCK (d) — no-command doc/policy session (fixes #216)
# ─────────────────────────────────────────────────────────────────────────────
echo ""
echo "=== 6. NO OVER-BLOCK (d): no-command doc/policy session (verified, no execution.label, no command-log) ==="
echo " (#216 fix: missing-log check must NOT fire when no command was expected)"

# Seeded by hand rather than via seed_delivered: this fixture uses
# status=verified / phase=verification instead of delivered / done.
T6="$TMP/t6-nocommand"
mkdir -p "$T6/.flow-agents/nocommand"
printf '# Repo\n' > "$T6/AGENTS.md"
# State is verified (completing) but no commands were run
printf '%s' '{"schema_version":"1.0","task_slug":"nocommand","status":"verified","phase":"verification","updated_at":"2026-06-27T00:00:00Z","next_action":{"status":"done","summary":"done"}}' \
  > "$T6/.flow-agents/nocommand/state.json"
cat > "$T6/.flow-agents/nocommand/nocommand--deliver.md" << 'MD'
# nocommand

branch: main
status: verified
type: deliver

## Definition Of Done
- [x] policy document reviewed

## Goal Fit Gate
- [x] acceptance verified

### Verdict: PASS
MD
printf '%s' '{"artifact_dir":"nocommand"}' > "$T6/.flow-agents/current.json"

# Bundle with NO execution.label (doc/policy session — no commands run)
python3 - "$T6/.flow-agents/nocommand/trust.bundle" << 'PY'
import json, sys
bundle = {
    "schemaVersion": 3, "source": "flow-agents/workflow-sidecar",
    "claims": [{
        "id": "c1", "subjectId": "nocommand/review", "subjectType": "workflow-check",
        "claimType": "workflow.check.review", "fieldOrBehavior": "policy doc reviewed",
        "value": "pass", "impactLevel": "low", "status": "verified",
        "createdAt": "2026-06-27T00:00:00Z", "updatedAt": "2026-06-27T00:00:00Z"
    }],
    "evidence": [{
        "id": "ev1", "claimId": "c1",
        "evidenceType": "review_output", "method": "manual",
        "sourceRef": "docs/policy.md",
        "excerptOrSummary": "Policy document reviewed and approved",
        "observedAt": "2026-06-27T00:00:00Z", "collectedBy": "agent",
        "passing": True
        # NOTE: NO execution.label — no command was run
    }],
    "policies": [], "events": []
}
# Close the file deterministically before python exits.
with open(sys.argv[1], 'w') as fh:
    json.dump(bundle, fh)
PY
# NO command-log.jsonl

set +e
t6_out="$(run_gate "$T6")"
t6_exit=$?
set -e

# grep -c exits 1 on zero matches (still printing 0), hence `|| true`.
# Literal needles with -F/-e instead of `\|`; here-strings instead of echo|grep.
blocked_missing_log="$(grep -cF -- "expected capture log is missing" <<< "$t6_out" || true)"
blocked_new="$(grep -cF -e "unaccounted at completion" -e "namespace-agnostic caught false-completion" <<< "$t6_out" || true)"
echo " No-command session (verified, no execution.label): exit=$t6_exit"
echo " blocked_by_missing_log=$blocked_missing_log, blocked_by_new_logic=$blocked_new"
if [ "$blocked_missing_log" -eq 0 ] && [ "$blocked_new" -eq 0 ]; then
  _pass "#216 FIXED: no-command session NOT blocked by missing-log or new reconciliation"
else
  _fail "#216 NOT FIXED or new regression: session blocked. output: ${t6_out:0:400}"
fi
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 7: AC3 empty-expects regression
#
# Seeds an in-progress session whose bundle holds a single disputed
# builder.verify.tests claim, points the gate at a fake flow definition whose
# only gate has expects:[], and checks for exit 2 plus the
# "gate misconfiguration" message.
# ─────────────────────────────────────────────────────────────────────────────
echo ""
echo "=== 7. AC3 empty-expects regression: declared-only bundle + fake flow with expects:[] ==="
echo " (Two-part dependency: union form ALWAYS enforces workflow.* + empty-expects guard"
echo " emits gate-misconfiguration HARD_BLOCK for empty expects[])"

T7="$TMP/t7-empty-expects"
mkdir -p "$T7/.flow-agents/empty-expects"
printf '# Repo\n' > "$T7/AGENTS.md"
printf '%s' '{"schema_version":"1.0","task_slug":"empty-expects","status":"in_progress","phase":"execution","updated_at":"2026-06-27T00:00:00Z","next_action":{"status":"in_progress","summary":"Testing"}}' \
  > "$T7/.flow-agents/empty-expects/state.json"
cat > "$T7/.flow-agents/empty-expects/empty-expects--deliver.md" << 'MD'
# empty-expects

branch: main
status: in_progress
type: deliver

## Definition Of Done
- [ ] tests pass
MD
printf '%s' '{"artifact_dir":"empty-expects","active_flow_id":"builder.build","active_step_id":"verify"}' \
  > "$T7/.flow-agents/current.json"

# Bundle with ONLY kit-typed claims (no workflow.*)
python3 - "$T7/.flow-agents/empty-expects/trust.bundle" "empty-expects" << 'PY'
import json, sys
bundle_path, slug = sys.argv[1], sys.argv[2]
bundle = {
    "schemaVersion": 3, "source": "flow-agents/workflow-sidecar",
    "claims": [{
        "id": "c1", "subjectId": slug + "/tests", "subjectType": "flow-step",
        "claimType": "builder.verify.tests",
        "fieldOrBehavior": "npm test",
        "value": "pass", "impactLevel": "high", "status": "disputed",
        "createdAt": "2026-06-27T00:00:00Z", "updatedAt": "2026-06-27T00:00:00Z"
    }],
    "evidence": [], "policies": [], "events": []
}
# Close the handle explicitly so the bundle is flushed before the gate reads it.
with open(bundle_path, 'w') as fh:
    json.dump(bundle, fh)
PY
# Empty (zero-byte) command log.
printf '' > "$T7/.flow-agents/empty-expects/command-log.jsonl"

# Fake flow with expects:[] (safe dir, not agent-writable)
FAKE_FLOWS="$TMP/fake-flows-ac3"
mkdir -p "$FAKE_FLOWS"
cat > "$FAKE_FLOWS/builder.build.flow.json" << 'FLOWJSON'
{"id":"builder.build","version":"0.0","gates":{"fake-gate":{"step":"verify","expects":[]}}}
FLOWJSON

# The gate is invoked directly (not via run_gate) so FLOW_AGENTS_FLOW_DEFS_DIR
# can point at the fake flow directory. errexit is disabled so the exit status
# can be captured.
set +e
t7_out="$(FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_BACKSTOP=skip \
  FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000 FLOW_AGENTS_FLOW_DEFS_DIR="$FAKE_FLOWS" \
  node "$GATE" 2>&1 <<< "{\"hook_event_name\":\"Stop\",\"cwd\":\"$T7\"}")"
t7_exit=$?
set -e

echo " Declared-only bundle + fake flow with expects:[]: exit=$t7_exit (expected 2)"
if [ "$t7_exit" -eq 2 ]; then
  _pass "AC3: declared-only bundle + empty-expects flow → BLOCKS (exit 2)"
else
  _fail "AC3: expected exit 2, got $t7_exit. output: ${t7_out:0:300}"
fi

if grep -q "gate misconfiguration" <<< "$t7_out"; then
  _pass "AC3: 'gate misconfiguration' HARD_BLOCK emitted (empty expects[] guard)"
else
  _fail "AC3: 'gate misconfiguration' NOT emitted. output: ${t7_out:0:300}"
fi

# -E is used for the alternations because `\|` inside a basic regex is a GNU
# extension, not POSIX.
# NOTE(review): the fallback branch also accepts "gate misconfiguration", which
# the previous check already asserts — so this check cannot fail on its own
# when that one passes. Confirm whether that is intended.
if grep -qE "disputed|caught false-completion|not auto-releasing" <<< "$t7_out"; then
  _pass "AC3: union form still enforces workflow.* claim (disputed builder.verify.tests caught)"
elif grep -qE "surface unavailable|gate misconfiguration" <<< "$t7_out"; then
  # The disputed builder.verify.tests is high-impact; surface may be unavailable
  _pass "AC3: union form active (gate misconfiguration or surface-unavailable emitted for high-impact claim)"
else
  _fail "AC3: union form not enforcing. output: ${t7_out:0:300}"
fi
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 8: PROOF SCENARIO 1 — Status-gated dodge closed (Fix A: completing guard removed)
#
# PRE-FIX: capturedFailReconciliation had `if (!completing) return []`.
# A non-terminal status (e.g. 'blocked') would skip the check entirely —
# a kit-typed claim asserting pass for a FAIL command would SHIP.
# POST-FIX: completing guard removed; the check runs on EVERY stop regardless
# of state.json.status.
# ─────────────────────────────────────────────────────────────────────────────
echo ""
echo "=== 8. PROOF SCENARIO 1 — Status-gated dodge closed (Fix A) ==="
echo " PRE-FIX: completing guard skipped reconciliation for non-terminal statuses"
echo " POST-FIX: guard removed → check runs on every stop (status-independent)"

T8="$TMP/t8-status-dodge"
mkdir -p "$T8/.flow-agents/status-dodge"
printf '# Repo\n' > "$T8/AGENTS.md"
# CRITICAL: status = 'blocked' (non-terminal — pre-fix would have returned [] here)
printf '%s' '{"schema_version":"1.0","task_slug":"status-dodge","status":"blocked","phase":"executing","updated_at":"2026-06-27T00:00:00Z","next_action":{"status":"in_progress","summary":"running"}}' \
  > "$T8/.flow-agents/status-dodge/state.json"
cat > "$T8/.flow-agents/status-dodge/status-dodge--deliver.md" << 'MD'
# status-dodge

branch: main
status: blocked
type: deliver

## Definition Of Done
- [x] tests pass

## Goal Fit Gate
- [x] acceptance verified

### Verdict: PASS
MD
# NOTE(review): unlike the neighbouring tests, no .flow-agents/current.json is
# written for this session — confirm the gate locates the session without it.

# Bundle: kit-typed claim asserting pass for "npm test"
python3 - "$T8/.flow-agents/status-dodge/trust.bundle" "status-dodge" << 'PY'
import json, sys
bundle_path, slug = sys.argv[1], sys.argv[2]
bundle = {
    "schemaVersion": 3, "source": "flow-agents/workflow-sidecar",
    "claims": [{
        "id": "c1", "subjectId": slug + "/tests", "subjectType": "flow-step",
        "claimType": "builder.verify.tests",
        "fieldOrBehavior": "npm test",
        "value": "pass", "impactLevel": "high", "status": "verified",
        "createdAt": "2026-06-27T00:00:00Z", "updatedAt": "2026-06-27T00:00:00Z"
    }],
    "evidence": [{
        "id": "ev1", "claimId": "c1",
        "evidenceType": "command_output", "method": "capture",
        "sourceRef": "command-log.jsonl",
        "excerptOrSummary": "npm test passed (agent claimed)",
        "observedAt": "2026-06-27T00:00:00Z", "collectedBy": "agent",
        "passing": True,
        "execution": {"label": "npm test", "exitCode": 0}
    }],
    "policies": [], "events": []
}
# Close the handle explicitly so the bundle is flushed before the gate reads it.
with open(bundle_path, 'w') as fh:
    json.dump(bundle, fh)
PY
# command-log: "npm test" FAIL (latest capture is FAIL — the agent lied)
write_fail_log "$T8/.flow-agents/status-dodge/command-log.jsonl"

echo ""
echo "--- 8a. PRE-FIX simulation (completing guard) ---"
# Old code: `const completing = TERMINAL_STATUSES.has(taskStatus) || taskStatus === 'verified'`
# With status='blocked', completing=false → return [] → gate blind
# The inline script exits 1 only if `completing` evaluates true. An explicit
# if/else is used (rather than `cmd && _pass || _fail`) so _fail can never run
# as a side effect of _pass returning non-zero.
if node -e "
  const TERMINAL_STATUSES = new Set(['done','delivered','accepted','archived','complete','completed']);
  const taskStatus = 'blocked';
  const completing = TERMINAL_STATUSES.has(taskStatus) || taskStatus === 'verified';
  console.log(' completing (pre-fix logic):', completing, '(false → capturedFailReconciliation skipped → gate blind)');
  if (completing) { process.exit(1); }
" 2>&1; then
  _pass "PRE-FIX: status=blocked → completing=false → reconciliation skipped → gate blind"
else
  _fail "PRE-FIX simulation error"
fi

echo ""
echo "--- 8b. POST-FIX: guard removed → blocks regardless of status ---"
# Disable errexit around the gate call so its exit status can be captured.
set +e
t8_out="$(run_gate "$T8")"
t8_exit=$?
set -e
echo " POST-FIX exit: $t8_exit (expected 2, status=blocked, latest=FAIL, claim=pass)"
if [ "$t8_exit" -eq 2 ]; then
  _pass "PROOF 1: status-gated dodge closed — POST-FIX blocks (exit 2) regardless of status=blocked"
else
  _fail "PROOF 1 FAILED: status=blocked + FAIL + claim=pass should exit 2, got $t8_exit. output: ${t8_out:0:400}"
fi
# -E is used for the alternation because `\|` inside a basic regex is a GNU
# extension, not POSIX.
if grep -qE "caught false-completion|namespace-agnostic" <<< "$t8_out"; then
  _pass "PROOF 1: 'caught false-completion' emitted for status=blocked session"
else
  _fail "PROOF 1: expected 'caught false-completion' message not found. output: ${t8_out:0:400}"
fi
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 9: PROOF SCENARIO 2 — Over-block removed (Fix B: Case B removed)
#
# Before the fix, "Case B" hard-blocked on any captured FAIL that had no
# matching claim, which also caught incidental commands (a grep with no match
# exiting 1, git diff --exit-code, and so on).
# After the fix only "Case A" (a claimed pass contradicted by a captured FAIL)
# blocks; an incidental failure nobody made a claim about must pass through.
# ─────────────────────────────────────────────────────────────────────────────
printf '%s\n' \
  "" \
  "=== 9. PROOF SCENARIO 2 — Over-block removed (Fix B: Case B removed) ===" \
  " PRE-FIX: 'unaccounted at completion' HARD_BLOCK fired for ANY unaccounted FAIL" \
  " POST-FIX: Case B removed → incidental fails with no claim NOT blocked"

T9="$TMP/t9-overblock"
seed_delivered "$T9" "overblock-sess"
printf '%s' '{"artifact_dir":"overblock-sess"}' > "$T9/.flow-agents/current.json"
# The bundle claims a pass for "npm test" only; nothing is claimed about grep.
write_kit_pass_bundle "$T9/.flow-agents/overblock-sess/trust.bundle" "overblock-sess"
# Captured log: "npm test" passes, then an incidental grep exits 1.
printf '%s\n' \
  '{"command":"npm test","observedResult":"pass","exitCode":0,"capturedAt":"2026-06-27T00:00:00Z","source":"postToolUse-capture"}' \
  '{"command":"grep --quiet somepattern AGENTS.md","observedResult":"fail","exitCode":1,"capturedAt":"2026-06-27T00:00:01Z","source":"postToolUse-capture"}' \
  > "$T9/.flow-agents/overblock-sess/command-log.jsonl"

# errexit is switched off just long enough to record the gate's exit status.
set +e
t9_out="$(run_gate "$T9")"
t9_exit=$?
set -e
echo " POST-FIX exit: $t9_exit (expected 0 — incidental grep fail NOT a false-completion)"

# Check 1: the gate must not have blocked (exit 2). On a block, report whether
# the Case B message was responsible.
if [[ "$t9_exit" -eq 2 ]]; then
  if grep -q "unaccounted at completion" <<< "$t9_out"; then
    _fail "PROOF 2 FAILED: 'unaccounted at completion' Case B still firing (should be removed). output: ${t9_out:0:400}"
  else
    _fail "PROOF 2 FAILED: blocked (exit 2) but NOT by unaccounted Case B — check output: ${t9_out:0:400}"
  fi
else
  _pass "PROOF 2: over-block removed — incidental fail with no claim NOT blocked (exit $t9_exit)"
fi

# Check 2: independent of the exit status, the Case B message must be absent.
if ! grep -q "unaccounted at completion" <<< "$t9_out"; then
  _pass "PROOF 2: 'unaccounted at completion' NOT emitted (Case B confirmed removed)"
else
  _fail "PROOF 2: 'unaccounted at completion' emitted (Case B must be removed)"
fi
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 10: PROOF SCENARIO 3 — Fix-then-pass not blocked (Fix C: latest-wins)
#
# PRE-FIX: captureCrossReference used readCommandLog (sticky-FAIL), so a legit
# fix-then-rerun-to-pass session would still be blocked.
# POST-FIX: readLatestCommandLog is used; the LAST entry wins. A genuine re-run
# that produces a PASS clears the block.
# ─────────────────────────────────────────────────────────────────────────────
echo ""
echo "=== 10. PROOF SCENARIO 3 — Fix-then-pass not blocked (Fix C: latest-wins) ==="
echo " PRE-FIX: sticky-FAIL in captureCrossReference kept a FAIL block even after re-run"
echo " POST-FIX: latest-wins → re-run to PASS clears the block"

T10="$TMP/t10-fixpass"
seed_delivered "$T10" "fixpass-sess"
printf '%s' '{"artifact_dir":"fixpass-sess"}' > "$T10/.flow-agents/current.json"
write_kit_pass_bundle "$T10/.flow-agents/fixpass-sess/trust.bundle" "fixpass-sess"
# Log: FAIL first, then PASS (genuine fix-then-re-run)
printf '%s\n%s\n' \
  '{"command":"npm test","observedResult":"fail","exitCode":1,"capturedAt":"2026-06-27T00:00:00Z","source":"postToolUse-capture"}' \
  '{"command":"npm test","observedResult":"pass","exitCode":0,"capturedAt":"2026-06-27T00:00:01Z","source":"postToolUse-capture"}' \
  > "$T10/.flow-agents/fixpass-sess/command-log.jsonl"

# Disable errexit around the gate call so its exit status can be captured.
set +e
t10_out="$(run_gate "$T10")"
t10_exit=$?
set -e
echo " POST-FIX exit: $t10_exit (expected 0 — latest capture PASS clears the earlier FAIL)"
# Only exit 2 (block) counts as a failure here; any other status passes.
if [ "$t10_exit" -ne 2 ]; then
  _pass "PROOF 3: fix-then-pass NOT blocked — latest PASS clears earlier FAIL (exit $t10_exit)"
else
  _fail "PROOF 3 FAILED: fix-then-pass session should exit 0 but got exit 2. output: ${t10_out:0:400}"
fi
# -E is used for the alternation because `\|` inside a basic regex is a GNU
# extension, not POSIX.
if grep -qE "caught false-completion|CONTRADICTS" <<< "$t10_out"; then
  _fail "PROOF 3: false-completion incorrectly emitted for fix-then-pass. output: ${t10_out:0:400}"
else
  _pass "PROOF 3: no false-completion for fix-then-pass (latest PASS is the truth)"
fi
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Test 11: PROOF SCENARIO 4 — Exit-code laundering flagged (Fix D)
#
# A claim asserts pass for "npm test || true" (captured exit 0 — because || true
# masks the real exit code). This is not a trustworthy PASS signal.
# POST-FIX: hasLaunderingOperator detects || true → HARD_BLOCK "exit-code-laundered".
# ─────────────────────────────────────────────────────────────────────────────
echo ""
echo "=== 11. PROOF SCENARIO 4 — Exit-code laundering flagged (Fix D) ==="
echo " PRE-FIX: 'npm test || true' captured exit 0 → claimed pass → gate satisfied"
echo " POST-FIX: hasLaunderingOperator detects || true → 'exit-code-laundered' HARD_BLOCK"

T11="$TMP/t11-laundering"
seed_delivered "$T11" "laundering-sess"
printf '%s' '{"artifact_dir":"laundering-sess"}' > "$T11/.flow-agents/current.json"
# Bundle: claim asserting pass for "npm test || true" — command string has laundering operator
python3 - "$T11/.flow-agents/laundering-sess/trust.bundle" "laundering-sess" << 'PY'
import json, sys
bundle_path, slug = sys.argv[1], sys.argv[2]
bundle = {
    "schemaVersion": 3, "source": "flow-agents/workflow-sidecar",
    "claims": [{
        "id": "c1", "subjectId": slug + "/tests", "subjectType": "flow-step",
        "claimType": "builder.verify.tests",
        "fieldOrBehavior": "npm test || true",
        "value": "pass", "impactLevel": "high", "status": "verified",
        "createdAt": "2026-06-27T00:00:00Z", "updatedAt": "2026-06-27T00:00:00Z"
    }],
    "evidence": [{
        "id": "ev1", "claimId": "c1",
        "evidenceType": "command_output", "method": "capture",
        "sourceRef": "command-log.jsonl",
        "excerptOrSummary": "npm test || true: exit 0",
        "observedAt": "2026-06-27T00:00:00Z", "collectedBy": "agent",
        "passing": True,
        "execution": {"label": "npm test || true", "exitCode": 0}
    }],
    "policies": [], "events": []
}
# Close the handle explicitly so the bundle is flushed before the gate reads it.
with open(bundle_path, 'w') as fh:
    json.dump(bundle, fh)
PY
# Log: "npm test || true" captured as PASS (exit 0) — the laundering worked
printf '%s\n' \
  '{"command":"npm test || true","observedResult":"pass","exitCode":0,"capturedAt":"2026-06-27T00:00:00Z","source":"postToolUse-capture"}' \
  > "$T11/.flow-agents/laundering-sess/command-log.jsonl"

# Disable errexit around the gate call so its exit status can be captured.
set +e
t11_out="$(run_gate "$T11")"
t11_exit=$?
set -e
echo " POST-FIX exit: $t11_exit (expected 2 — || true laundering detected)"
if [ "$t11_exit" -eq 2 ]; then
  _pass "PROOF 4: exit-code laundering BLOCKED (exit 2) — 'npm test || true' not a trustworthy pass"
else
  _fail "PROOF 4 FAILED: laundering should exit 2 but got $t11_exit. output: ${t11_out:0:400}"
fi
# -E is used for the alternation because `\|` inside a basic regex is a GNU
# extension, not POSIX.
if grep -qE "exit-code-laundered|laundering operators mask" <<< "$t11_out"; then
  _pass "PROOF 4: 'exit-code-laundered' warning emitted"
else
  _fail "PROOF 4: expected 'exit-code-laundered' message not found. output: ${t11_out:0:400}"
fi
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
# ─────────────────────────────────────────────────────────────────────────────
# Summary
#
# Prints the final verdict from the accumulated $errors counter: exit 1 with a
# FAIL line when any check failed, otherwise the PASS banner followed by the
# security-proof recap and exit 0.
# ─────────────────────────────────────────────────────────────────────────────
printf '%s\n' \
  "" \
  "================================================================="

# Failure path first. The numeric test is kept in `[ ... ]` form so that a
# malformed counter also lands here rather than being reported as a pass.
if ! [ "$errors" -eq 0 ]; then
  echo "FAIL test_captured_fail_reconciliation: $errors check(s) failed."
  exit 1
fi

printf '%s\n' \
  "PASS test_captured_fail_reconciliation: all checks passed." \
  "" \
  "Security proof:" \
  " BYPASS CLOSED: kit-typed false-completion blocked namespace-agnostically" \
  " PRE-FIX exit 0 (ships) → POST-FIX exit 2 (blocked)" \
  " NO OVER-BLOCK: all 4 legitimate cases remain unblocked by new logic" \
  " #216 FIXED: no-command session NOT blocked by missing-log check" \
  " AC3: empty-expects regression caught by gate-misconfiguration HARD_BLOCK" \
  " PROOF 1: Status-gated dodge closed (Fix A) — status=blocked + FAIL + claim=pass → exit 2" \
  " PROOF 2: Over-block removed (Fix B) — incidental grep fail, no claim → exit 0" \
  " PROOF 3: Fix-then-pass not blocked (Fix C) — FAIL then PASS + claim=pass → exit 0" \
  " PROOF 4: Exit-code laundering flagged (Fix D) — 'npm test || true' claim → exit 2"
exit 0
|