@kontourai/flow-agents 1.4.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/CODEOWNERS +29 -0
- package/.github/actions/trust-verify/action.yml +145 -0
- package/.github/workflows/ci.yml +11 -4
- package/.github/workflows/kit-gates-demo.yml +2 -2
- package/.github/workflows/publish-npm.yml +10 -2
- package/.github/workflows/release-please.yml +1 -1
- package/.github/workflows/runtime-compat.yml +1 -1
- package/.github/workflows/trust-reconcile.yml +113 -0
- package/AGENTS.md +13 -0
- package/CHANGELOG.md +103 -0
- package/CONTRIBUTING.md +4 -4
- package/README.md +1 -0
- package/agents/tool-planner.json +1 -1
- package/build/src/cli/init.js +242 -20
- package/build/src/cli/validate-workflow-artifacts.js +19 -2
- package/build/src/cli/verify.d.ts +1 -0
- package/build/src/cli/verify.js +90 -0
- package/build/src/cli/workflow-sidecar.d.ts +316 -8
- package/build/src/cli/workflow-sidecar.js +1996 -91
- package/build/src/cli.js +2 -3
- package/build/src/lib/flow-resolver.d.ts +111 -0
- package/build/src/lib/flow-resolver.js +308 -0
- package/build/src/tools/build-universal-bundles.js +34 -22
- package/build/src/tools/generate-context-map.js +3 -16
- package/build/src/tools/validate-source-tree.d.ts +1 -1
- package/build/src/tools/validate-source-tree.js +42 -162
- package/context/contracts/artifact-contract.md +10 -0
- package/context/contracts/delivery-contract.md +1 -0
- package/context/contracts/review-contract.md +1 -0
- package/context/contracts/verification-contract.md +2 -0
- package/context/gate-awareness.md +39 -0
- package/context/scripts/hooks/stop-goal-fit.js +632 -70
- package/docs/adr/0001-flow-agents-consumes-flow.md +1 -1
- package/docs/adr/0002-flow-kits-as-extension-unit.md +1 -1
- package/docs/adr/0004-gates-expect-surface-claims.md +2 -0
- package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +2 -0
- package/docs/adr/0007-skill-audit.md +1 -1
- package/docs/adr/0009-canonical-hook-core-kit-boundary.md +95 -0
- package/docs/adr/0010-workflow-trust-state-as-hachure-bundle.md +139 -0
- package/docs/adr/0011-mcp-posture.md +100 -0
- package/docs/adr/0012-agent-coordination-as-liveness-claims.md +119 -0
- package/docs/adr/0013-context-lifecycle.md +151 -0
- package/docs/adr/0014-core-vs-domain-kit-boundary.md +143 -0
- package/docs/adr/0015-flow-flow-agents-boundary-reconciliation.md +120 -0
- package/docs/adr/0016-three-hard-boundary-model.md +71 -0
- package/docs/adr/0017-anti-gaming-trust-security-model.md +155 -0
- package/docs/agent-system-guidebook.md +5 -12
- package/docs/context-map.md +4 -10
- package/docs/index.md +3 -2
- package/docs/integrations/framework-adapter.md +19 -6
- package/docs/integrations/index.md +2 -2
- package/docs/north-star.md +4 -4
- package/docs/operating-layers.md +3 -3
- package/docs/plans/adr-0010-phase2-gate-recompute.md +55 -0
- package/docs/repository-structure.md +2 -2
- package/docs/skills-map.md +1 -0
- package/docs/spec/runtime-hook-surface.md +62 -9
- package/docs/standards-register.md +3 -3
- package/docs/survey-utterance-check.md +1 -1
- package/docs/trust-anchor-adoption.md +197 -0
- package/docs/verifiable-trust.md +95 -0
- package/docs/veritas-integration.md +2 -2
- package/docs/workflow-usage-guide.md +69 -0
- package/evals/acceptance/DEMO-false-completion.md +144 -0
- package/evals/acceptance/demo-cast.sh +92 -0
- package/evals/acceptance/demo-false-completion.sh +72 -0
- package/evals/acceptance/demo-real-evidence.sh +104 -0
- package/evals/acceptance/demo.tape +29 -0
- package/evals/acceptance/prove-capture-teeth-declared.sh +335 -0
- package/evals/acceptance/prove-capture-teeth.sh +114 -0
- package/evals/acceptance/prove-teeth.sh +105 -0
- package/evals/ci/antigaming-suite.sh +55 -0
- package/evals/ci/run-baseline.sh +2 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/kit.json +20 -0
- package/evals/fixtures/flow-kit-repository/valid-unknown-extension/flows/review.flow.json +26 -0
- package/evals/fixtures/flow-kit-repository/valid-unknown-extension/kit.json +18 -0
- package/evals/integration/test_builder_step_producers.sh +379 -0
- package/evals/integration/test_bundle_install.sh +35 -71
- package/evals/integration/test_bundle_lifecycle.sh +39 -2
- package/evals/integration/test_captured_fail_reconciliation.sh +820 -0
- package/evals/integration/test_checkpoint_signing.sh +489 -0
- package/evals/integration/test_claim_lookup.sh +352 -0
- package/evals/integration/test_command_log_fork_classification.sh +134 -0
- package/evals/integration/test_command_log_integrity.sh +275 -0
- package/evals/integration/test_context_map.sh +0 -2
- package/evals/integration/test_dual_emit_flow_step.sh +278 -0
- package/evals/integration/test_enforcer_expects_driven.sh +281 -0
- package/evals/integration/test_evidence_capture_hook.sh +185 -0
- package/evals/integration/test_flow_kit_repository.sh +2 -0
- package/evals/integration/test_flowdef_session_activation.sh +273 -0
- package/evals/integration/test_flowdef_session_history_preservation.sh +250 -0
- package/evals/integration/test_gate_bypass_chain.sh +448 -0
- package/evals/integration/test_gate_lockdown.sh +1137 -0
- package/evals/integration/test_gate_review_inquiry_records.sh +399 -0
- package/evals/integration/test_goal_fit_escape_hatch.sh +73 -0
- package/evals/integration/test_goal_fit_hook.sh +69 -4
- package/evals/integration/test_goal_fit_rederive.sh +263 -0
- package/evals/integration/test_install_merge.sh +1176 -0
- package/evals/integration/test_kit_identity_trust.sh +393 -0
- package/evals/integration/test_mint_attestation.sh +373 -0
- package/evals/integration/test_phase_map_and_gate_claim.sh +365 -0
- package/evals/integration/test_publish_delivery.sh +269 -0
- package/evals/integration/test_reconcile_soundness.sh +528 -0
- package/evals/integration/test_resolvefirststep_security.sh +208 -0
- package/evals/integration/test_session_resume_roundtrip.sh +286 -0
- package/evals/integration/test_trust_checkpoint.sh +325 -0
- package/evals/integration/test_trust_reconcile.sh +293 -0
- package/evals/integration/test_verify_cli.sh +208 -0
- package/evals/integration/test_workflow_sidecar_writer.sh +549 -34
- package/evals/lib/node.sh +0 -6
- package/evals/run.sh +47 -0
- package/evals/static/test_workflow_skills.sh +6 -13
- package/install.sh +0 -7
- package/integrations/strands-ts/README.md +25 -15
- package/integrations/veritas/flow-agents.adapter.json +1 -2
- package/kits/builder/flows/build.flow.json +59 -12
- package/kits/builder/kit.json +85 -15
- package/kits/builder/skills/continue-work/SKILL.md +116 -0
- package/kits/builder/skills/deliver/SKILL.md +36 -6
- package/kits/builder/skills/design-probe/SKILL.md +28 -0
- package/kits/builder/skills/execute-plan/SKILL.md +9 -1
- package/kits/builder/skills/gate-review/SKILL.md +234 -0
- package/kits/builder/skills/learning-review/SKILL.md +30 -0
- package/kits/builder/skills/pickup-probe/SKILL.md +29 -0
- package/kits/builder/skills/plan-work/SKILL.md +13 -1
- package/kits/builder/skills/pull-work/SKILL.md +19 -0
- package/kits/knowledge/adapters/default-store/index.js +38 -0
- package/kits/knowledge/adapters/flow-runner/index.js +1620 -0
- package/kits/knowledge/adapters/obsidian-store/index.js +36 -6
- package/kits/knowledge/docs/store-contract.md +314 -0
- package/kits/knowledge/evals/audit-freshness/suite.test.js +368 -0
- package/kits/knowledge/evals/canonicalize-category/suite.test.js +383 -0
- package/kits/knowledge/evals/contract-suite/suite.test.js +111 -0
- package/kits/knowledge/evals/detect-contradictions/suite.test.js +324 -0
- package/kits/knowledge/evals/entities/suite.test.js +40 -0
- package/kits/knowledge/evals/glossary-sync/suite.test.js +416 -0
- package/kits/knowledge/evals/hygiene-review/suite.test.js +396 -0
- package/kits/knowledge/evals/retirement/suite.test.js +145 -0
- package/kits/knowledge/flows/audit-freshness.flow.json +44 -0
- package/kits/knowledge/flows/canonicalize-category.flow.json +44 -0
- package/kits/knowledge/flows/detect-contradictions.flow.json +44 -0
- package/kits/knowledge/flows/glossary-sync.flow.json +61 -0
- package/kits/knowledge/flows/hygiene-review.flow.json +43 -0
- package/kits/knowledge/kit.json +51 -1
- package/package.json +6 -6
- package/packaging/conformance/README.md +10 -2
- package/packaging/conformance/fixtures/evidence-capture--allow-records-command.json +29 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-bundle-disputed-claim.json +29 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-capture-contradicts-claimed-pass.json +30 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-mode.json +23 -0
- package/packaging/conformance/fixtures/stop-goal-fit--off-mode.json +24 -0
- package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +5 -2
- package/packaging/conformance/fixtures/stop-goal-fit--warn-no-bundle.json +23 -0
- package/packaging/conformance/fixtures/workflow-steering--reground-active-prompt.json +30 -0
- package/packaging/conformance/fixtures/workflow-steering--reground-session-start.json +30 -0
- package/packaging/conformance/run-conformance.js +1 -1
- package/scripts/README.md +2 -1
- package/scripts/build-universal-bundles.js +0 -1
- package/scripts/ci/mint-attestation.js +221 -0
- package/scripts/ci/trust-reconcile.js +545 -0
- package/scripts/hooks/config-protection.js +423 -1
- package/scripts/hooks/evidence-capture.js +348 -0
- package/scripts/hooks/lib/liveness-read.js +113 -0
- package/scripts/hooks/run-hook.js +6 -1
- package/scripts/hooks/stop-goal-fit.js +1524 -79
- package/scripts/hooks/workflow-steering.js +135 -5
- package/scripts/install-codex-home.sh +39 -0
- package/scripts/install-merge.js +330 -0
- package/scripts/repair-command-log.js +115 -0
- package/src/cli/init.ts +218 -20
- package/src/cli/validate-workflow-artifacts.ts +18 -2
- package/src/cli/verify.ts +100 -0
- package/src/cli/workflow-sidecar.ts +2127 -84
- package/src/cli.ts +2 -3
- package/src/lib/flow-resolver.ts +369 -0
- package/src/tools/build-universal-bundles.ts +34 -21
- package/src/tools/generate-context-map.ts +3 -17
- package/src/tools/validate-source-tree.ts +44 -104
- package/build/src/tools/filter-installed-packs.d.ts +0 -2
- package/build/src/tools/filter-installed-packs.js +0 -135
- package/packaging/packs.json +0 -49
- package/scripts/filter-installed-packs.js +0 -2
- package/src/tools/filter-installed-packs.ts +0 -132
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# test_gate_review_inquiry_records.sh — AC1 + AC2 integration tests for gate-review #119.
|
|
3
|
+
#
|
|
4
|
+
# Verifies that the gate-review subcommand emits canonical InquiryRecords
|
|
5
|
+
# (gate-review.inquiries.json) validated against hachure inquiry-record.schema.json.
|
|
6
|
+
#
|
|
7
|
+
# AC1: a session with a gate event yields ≥1 InquiryRecord.
|
|
8
|
+
# AC2: false_block scenario (claim verified + block) and missed_block scenario
|
|
9
|
+
# (expected claim absent) each yield a distinct InquiryRecord with the
|
|
10
|
+
# correct calibration + non-empty advisoryFix.
|
|
11
|
+
#
|
|
12
|
+
# Seed is deterministic: same inputs → same outputs. Surface is loaded from the
|
|
13
|
+
# installed optional dependency (@kontourai/surface).
|
|
14
|
+
# Unset variables and failed pipeline stages are errors. -e is not enabled:
# the checks below record failures through _fail instead of aborting.
set -uo pipefail

# Repository root: two directories above the directory holding this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# NOTE(review): presumably provides flow_agents_node, used below — confirm.
source "$ROOT/evals/lib/node.sh"

# Scratch directory for every fixture and captured output in this script.
# Abort early if it cannot be created: with an empty TMPDIR_EVAL the fixture
# paths below would resolve relative to the filesystem root.
if ! TMPDIR_EVAL="$(mktemp -d)" || [[ -z "$TMPDIR_EVAL" ]]; then
  echo "mktemp -d failed; cannot create scratch directory" >&2
  exit 1
fi
# Number of failed checks; incremented by _fail.
errors=0
|
|
21
|
+
|
|
22
|
+
# Delete the scratch directory tree named by TMPDIR_EVAL.
cleanup() {
  rm -rf "$TMPDIR_EVAL"
}
|
|
23
|
+
trap cleanup EXIT
|
|
24
|
+
|
|
25
|
+
# Print a check-mark line for a passed check. $1: message.
_pass() {
  printf ' ✓ %s\n' "$1"
}
|
|
26
|
+
# Print a cross-mark line for a failed check and bump the global error count.
# $1: message.
_fail() {
  printf ' ✗ %s\n' "$1"
  errors=$((errors + 1))
}
|
|
27
|
+
|
|
28
|
+
echo "=== Gate Review InquiryRecord Tests (AC1 + AC2) ==="
|
|
29
|
+
|
|
30
|
+
# ── helpers ──────────────────────────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
# JSON query helper using node (no jq dependency).
# Arguments:
#   $1 - path to a JSON file
#   $2 - JavaScript function expression applied to the parsed document,
#        e.g. 'd => d.length'
# Outputs: the result on stdout followed by a newline; strings, numbers and
#   booleans are printed via String(), anything else as JSON.
# Returns: 2 when the expression yields null or undefined; otherwise node's
#   exit status (non-zero when reading, parsing or evaluating throws).
jq_node() {
  local file="$1" expr="$2"
  # The path is handed over through the environment instead of being spliced
  # into the script text, so quotes or backslashes in it cannot corrupt the
  # JavaScript. The expression is still interpolated: it is the query itself.
  JQ_NODE_FILE="$file" node -e "
    const d = JSON.parse(require('fs').readFileSync(process.env.JQ_NODE_FILE, 'utf8'));
    const r = (${expr})(d);
    if (r === undefined || r === null) { process.exit(2); }
    if (typeof r === 'boolean' || typeof r === 'number' || typeof r === 'string') {
      process.stdout.write(String(r) + '\n');
    } else {
      process.stdout.write(JSON.stringify(r) + '\n');
    }"
}
|
|
45
|
+
|
|
46
|
+
# Write a minimal single-claim trust.bundle fixture.
# Arguments:
#   $1 - session artifact dir (the bundle is written there as trust.bundle)
#   $2 - session slug, used to build the claim id and subject id
#   $3 - claim status (verified|disputed|assumed|stale|unknown)
# For "verified" and "disputed" one event carrying that same status is added;
# every other status gets an empty events array.
seed_trust_bundle() {
  local out_dir="$1" session_slug="$2" claim_status="$3"
  local stamp="2026-06-24T00:00:00Z"
  local claim_id="${session_slug}/unit-tests.flow-agents.workflow.unit tests pass"
  local events_json="[]"

  case "$claim_status" in
    verified | disputed)
      printf -v events_json \
        '[{"id":"evt:%s","claimId":"%s","status":"%s","actor":"gate-review-test","method":"validation","evidenceIds":[],"createdAt":"%s","verifiedAt":"%s"}]' \
        "$claim_id" "$claim_id" "$claim_status" "$stamp" "$stamp"
      ;;
  esac

  cat > "$out_dir/trust.bundle" <<JSON
{
  "schemaVersion": 3,
  "source": "gate-review-test;statusFunctionVersion=1",
  "claims": [
    {
      "id": "$claim_id",
      "subjectType": "workflow-check",
      "subjectId": "$session_slug/unit-tests",
      "surface": "flow-agents.workflow",
      "claimType": "workflow.check.test",
      "fieldOrBehavior": "unit tests pass",
      "value": "pass",
      "status": "$claim_status",
      "createdAt": "$stamp",
      "updatedAt": "$stamp"
    }
  ],
  "evidence": [],
  "events": $events_json,
  "policies": []
}
JSON
}
|
|
87
|
+
|
|
88
|
+
# Set the gate block streak file.
# Arguments:
#   $1 - root directory holding .goal-fit-block-streak.json
#   $2 - streak count; a positive value writes the file, anything else
#        removes it
seed_block_streak() {
  local root="$1" count="$2"
  local streak_file="$root/.goal-fit-block-streak.json"

  if [[ "$count" -le 0 ]]; then
    rm -f "$streak_file"
    return
  fi
  printf '{"count":%d,"hash":"testHash001"}' "$count" > "$streak_file"
}
|
|
97
|
+
|
|
98
|
+
# Remove the block streak file under the given root; a missing file is not an
# error. $1: root directory holding .goal-fit-block-streak.json.
clear_block_streak() {
  local root="$1"
  rm -f "$root/.goal-fit-block-streak.json"
}
|
|
102
|
+
|
|
103
|
+
# ── AC1: session with a gate event → ≥1 InquiryRecord ───────────────────────
echo ""
echo "--- AC1: gate event → ≥1 InquiryRecord ---"

AC1_ROOT="$TMPDIR_EVAL/ac1/.flow-agents"
AC1_SLUG="ac1-session"
AC1_DIR="$AC1_ROOT/$AC1_SLUG"
mkdir -p "$AC1_DIR"

# Seed: verified claim + blocked (false_block scenario for AC1)
seed_trust_bundle "$AC1_DIR" "$AC1_SLUG" "verified"
seed_block_streak "$AC1_ROOT" 1

if flow_agents_node workflow-sidecar gate-review "$AC1_DIR" \
  >"$TMPDIR_EVAL/ac1.out" 2>"$TMPDIR_EVAL/ac1.err"; then
  _pass "AC1: gate-review exits 0"
else
  _fail "AC1: gate-review failed: $(cat "$TMPDIR_EVAL/ac1.err")"
fi

AC1_INQUIRIES="$AC1_DIR/gate-review.inquiries.json"
if [[ -f "$AC1_INQUIRIES" ]]; then
  _pass "AC1: gate-review.inquiries.json emitted"

  # A non-array document or a read error counts as zero records.
  AC1_COUNT="$(jq_node "$AC1_INQUIRIES" 'd => d.length' 2>/dev/null || echo 0)"
  if [[ "$AC1_COUNT" -ge 1 ]]; then
    _pass "AC1: ≥1 InquiryRecord present (count=$AC1_COUNT)"
  else
    _fail "AC1: expected ≥1 InquiryRecord, got $AC1_COUNT"
  fi

  # Verify each record has required schema fields. The node exit status is
  # checked so that a crash (unreadable file, non-array document) is reported
  # as a failure instead of looking like "nothing missing".
  if MISSING_FIELDS="$(INQUIRIES_JSON="$AC1_INQUIRIES" node -e '
      const records = JSON.parse(require("fs").readFileSync(process.env.INQUIRIES_JSON, "utf8"));
      const required = ["id", "inquiry", "outcome", "resolutionPath", "inputSnapshot", "statusFunctionVersion", "resolvedAt"];
      const missing = [];
      for (const [i, r] of records.entries()) {
        for (const f of required) {
          if (!(f in r)) missing.push("record[" + i + "]." + f);
        }
      }
      process.stdout.write(missing.join(","));
    ' 2>"$TMPDIR_EVAL/ac1.fields.err")"; then
    if [[ -z "$MISSING_FIELDS" ]]; then
      _pass "AC1: all InquiryRecords have required schema fields"
    else
      _fail "AC1: InquiryRecords missing fields: $MISSING_FIELDS"
    fi
  else
    _fail "AC1: required-field check could not run: $(cat "$TMPDIR_EVAL/ac1.fields.err")"
  fi

  # Verify each record has non-empty advisoryFix in answer.value; as above, a
  # failing node run is a failed check rather than an empty (passing) result.
  if EMPTY_FIX="$(INQUIRIES_JSON="$AC1_INQUIRIES" node -e '
      const records = JSON.parse(require("fs").readFileSync(process.env.INQUIRIES_JSON, "utf8"));
      const bad = records.filter(r => !r.answer || !r.answer.value || !r.answer.value.advisoryFix);
      process.stdout.write(bad.map(r => r.id).join(","));
    ' 2>"$TMPDIR_EVAL/ac1.fix.err")"; then
    if [[ -z "$EMPTY_FIX" ]]; then
      _pass "AC1: all InquiryRecords have non-empty advisoryFix"
    else
      _fail "AC1: InquiryRecords with empty/missing advisoryFix: $EMPTY_FIX"
    fi
  else
    _fail "AC1: advisoryFix check could not run: $(cat "$TMPDIR_EVAL/ac1.fix.err")"
  fi
else
  _fail "AC1: gate-review.inquiries.json missing"
fi
|
|
168
|
+
|
|
169
|
+
# ── AC2: false_block scenario ─────────────────────────────────────────────────
echo ""
echo "--- AC2a: false_block — verified claim + blocked ---"

AC2FB_ROOT="$TMPDIR_EVAL/ac2fb/.flow-agents"
AC2FB_SLUG="ac2-false-block"
AC2FB_DIR="$AC2FB_ROOT/$AC2FB_SLUG"
mkdir -p "$AC2FB_DIR"

# Seed: verified claim + blocked → false_block
seed_trust_bundle "$AC2FB_DIR" "$AC2FB_SLUG" "verified"
seed_block_streak "$AC2FB_ROOT" 2

if flow_agents_node workflow-sidecar gate-review "$AC2FB_DIR" \
  >"$TMPDIR_EVAL/ac2fb.out" 2>"$TMPDIR_EVAL/ac2fb.err"; then
  _pass "AC2a: gate-review exits 0"
else
  _fail "AC2a: gate-review failed: $(cat "$TMPDIR_EVAL/ac2fb.err")"
fi

AC2FB_INQUIRIES="$AC2FB_DIR/gate-review.inquiries.json"
if [[ -f "$AC2FB_INQUIRIES" ]]; then
  # outcome must be "matched" (claim exists in bundle)
  OUTCOME="$(jq_node "$AC2FB_INQUIRIES" 'd => d[0].outcome' 2>/dev/null || echo "")"
  if [[ "$OUTCOME" == "matched" ]]; then
    _pass "AC2a: false_block InquiryRecord has outcome=matched"
  else
    _fail "AC2a: expected outcome=matched, got '$OUTCOME'"
  fi

  # calibration must be false_block
  CALIBRATION="$(jq_node "$AC2FB_INQUIRIES" 'd => d[0].answer.value.calibration' 2>/dev/null || echo "")"
  if [[ "$CALIBRATION" == "false_block" ]]; then
    _pass "AC2a: false_block calibration correct"
  else
    _fail "AC2a: expected calibration=false_block, got '$CALIBRATION'"
  fi

  # advisoryFix must be non-empty
  ADVISORY="$(jq_node "$AC2FB_INQUIRIES" 'd => d[0].answer.value.advisoryFix' 2>/dev/null || echo "")"
  if [[ -n "$ADVISORY" ]] && [[ "$ADVISORY" != "null" ]]; then
    _pass "AC2a: false_block has non-empty advisoryFix"
  else
    _fail "AC2a: false_block advisoryFix is empty"
  fi

  # schema validation via validateInquiryRecord from the built sidecar module.
  # Paths go in through the environment so they are never spliced into JS.
  # Any exception (module not loadable, unreadable file) is reported as
  # available=false, which takes the fail-open "skipped" branch below.
  SCHEMA_RESULT="$(SIDECAR_JS="$ROOT/build/src/cli/workflow-sidecar.js" \
    INQUIRIES_JSON="$AC2FB_INQUIRIES" node -e '
    try {
      const { validateInquiryRecord } = require(process.env.SIDECAR_JS);
      const records = JSON.parse(require("fs").readFileSync(process.env.INQUIRIES_JSON, "utf8"));
      let allValid = true;
      const errors = [];
      for (const r of records) {
        const result = validateInquiryRecord(r);
        if (result.available && !result.valid) {
          allValid = false;
          errors.push(...result.errors);
        }
      }
      const available = records.length > 0 ? validateInquiryRecord(records[0]).available : false;
      process.stdout.write(JSON.stringify({ available, allValid, errors }));
    } catch (e) { process.stdout.write(JSON.stringify({ available: false, allValid: true, errors: [String(e)] })); }
  ' 2>/dev/null)"

  # The result JSON is read back from the environment. Embedding it in a
  # quoted JS literal breaks as soon as an error message contains a quote or
  # a backslash, which would turn a real schema failure into "skipped".
  SCHEMA_AVAILABLE="$(SCHEMA_RESULT_JSON="$SCHEMA_RESULT" node -e \
    'process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT_JSON).available ? "true" : "false")' \
    2>/dev/null || echo "false")"
  SCHEMA_ALL_VALID="$(SCHEMA_RESULT_JSON="$SCHEMA_RESULT" node -e \
    'process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT_JSON).allValid ? "true" : "false")' \
    2>/dev/null || echo "true")"

  if [[ "$SCHEMA_AVAILABLE" == "true" ]]; then
    if [[ "$SCHEMA_ALL_VALID" == "true" ]]; then
      _pass "AC2a: false_block InquiryRecords validate against hachure inquiry-record.schema.json (available=true, valid=true)"
    else
      # Only the first three errors are shown to keep the line readable.
      SCHEMA_ERRORS="$(SCHEMA_RESULT_JSON="$SCHEMA_RESULT" node -e \
        'process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT_JSON).errors.slice(0,3).join("; "))' \
        2>/dev/null || echo "?")"
      _fail "AC2a: InquiryRecord schema validation failed: $SCHEMA_ERRORS"
    fi
  else
    _pass "AC2a: hachure not available — schema validation skipped (fail-open)"
  fi
else
  # Without the file none of the assertions above ran; record that explicitly.
  _fail "AC2a: gate-review.inquiries.json missing"
fi
|
|
246
|
+
|
|
247
|
+
# ── AC2: missed_block scenario ────────────────────────────────────────────────
echo ""
echo "--- AC2b: missed_block — absent criterion ---"

AC2MB_ROOT="$TMPDIR_EVAL/ac2mb/.flow-agents"
AC2MB_SLUG="ac2-missed-block"
AC2MB_DIR="$AC2MB_ROOT/$AC2MB_SLUG"
mkdir -p "$AC2MB_DIR"

# Seed: empty bundle (no claims) + no block + expected criterion absent → missed_block
cat > "$AC2MB_DIR/trust.bundle" <<JSON
{
  "schemaVersion": 3,
  "source": "gate-review-test;statusFunctionVersion=1",
  "claims": [],
  "evidence": [],
  "events": [],
  "policies": []
}
JSON

# Seed acceptance.json with an expected criterion
cat > "$AC2MB_DIR/acceptance.json" <<JSON
{
  "schema_version": "1.0",
  "task_slug": "$AC2MB_SLUG",
  "criteria": [
    { "id": "ac-1", "description": "Unit tests pass", "status": "pending" }
  ]
}
JSON

# No block streak — gate did NOT fire
clear_block_streak "$AC2MB_ROOT"

if flow_agents_node workflow-sidecar gate-review "$AC2MB_DIR" \
  >"$TMPDIR_EVAL/ac2mb.out" 2>"$TMPDIR_EVAL/ac2mb.err"; then
  _pass "AC2b: gate-review exits 0"
else
  _fail "AC2b: gate-review failed: $(cat "$TMPDIR_EVAL/ac2mb.err")"
fi

AC2MB_INQUIRIES="$AC2MB_DIR/gate-review.inquiries.json"
if [[ -f "$AC2MB_INQUIRIES" ]]; then
  # The absent criterion should yield outcome="unsupported"
  OUTCOME_MB="$(jq_node "$AC2MB_INQUIRIES" 'd => d[0].outcome' 2>/dev/null || echo "")"
  if [[ "$OUTCOME_MB" == "unsupported" ]]; then
    _pass "AC2b: missed_block absent criterion yields outcome=unsupported"
  else
    _fail "AC2b: expected outcome=unsupported for absent criterion, got '$OUTCOME_MB'"
  fi

  # calibration must be missed_block
  CALIBRATION_MB="$(jq_node "$AC2MB_INQUIRIES" 'd => d[0].answer.value.calibration' 2>/dev/null || echo "")"
  if [[ "$CALIBRATION_MB" == "missed_block" ]]; then
    _pass "AC2b: missed_block calibration correct"
  else
    _fail "AC2b: expected calibration=missed_block for absent criterion, got '$CALIBRATION_MB'"
  fi

  # advisoryFix must be non-empty
  ADVISORY_MB="$(jq_node "$AC2MB_INQUIRIES" 'd => d[0].answer.value.advisoryFix' 2>/dev/null || echo "")"
  if [[ -n "$ADVISORY_MB" ]] && [[ "$ADVISORY_MB" != "null" ]]; then
    _pass "AC2b: missed_block has non-empty advisoryFix"
  else
    _fail "AC2b: missed_block advisoryFix is empty"
  fi

  # schema validation via validateInquiryRecord from the built sidecar module.
  # Paths go in through the environment so they are never spliced into JS.
  # Any exception (module not loadable, unreadable file) is reported as
  # available=false, which takes the fail-open "skipped" branch below.
  SCHEMA_RESULT_MB="$(SIDECAR_JS="$ROOT/build/src/cli/workflow-sidecar.js" \
    INQUIRIES_JSON="$AC2MB_INQUIRIES" node -e '
    try {
      const { validateInquiryRecord } = require(process.env.SIDECAR_JS);
      const records = JSON.parse(require("fs").readFileSync(process.env.INQUIRIES_JSON, "utf8"));
      let allValid = true;
      const errors = [];
      for (const r of records) {
        const result = validateInquiryRecord(r);
        if (result.available && !result.valid) {
          allValid = false;
          errors.push(...result.errors);
        }
      }
      const available = records.length > 0 ? validateInquiryRecord(records[0]).available : false;
      process.stdout.write(JSON.stringify({ available, allValid, errors }));
    } catch (e) { process.stdout.write(JSON.stringify({ available: false, allValid: true, errors: [String(e)] })); }
  ' 2>/dev/null)"

  # The result JSON is read back from the environment. Embedding it in a
  # quoted JS literal breaks as soon as an error message contains a quote or
  # a backslash, which would turn a real schema failure into "skipped".
  SCHEMA_AVAILABLE_MB="$(SCHEMA_RESULT_JSON="$SCHEMA_RESULT_MB" node -e \
    'process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT_JSON).available ? "true" : "false")' \
    2>/dev/null || echo "false")"
  SCHEMA_ALL_VALID_MB="$(SCHEMA_RESULT_JSON="$SCHEMA_RESULT_MB" node -e \
    'process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT_JSON).allValid ? "true" : "false")' \
    2>/dev/null || echo "true")"

  if [[ "$SCHEMA_AVAILABLE_MB" == "true" ]]; then
    if [[ "$SCHEMA_ALL_VALID_MB" == "true" ]]; then
      _pass "AC2b: missed_block InquiryRecords validate against hachure inquiry-record.schema.json (available=true, valid=true)"
    else
      # Only the first three errors are shown to keep the line readable.
      SCHEMA_ERRORS_MB="$(SCHEMA_RESULT_JSON="$SCHEMA_RESULT_MB" node -e \
        'process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT_JSON).errors.slice(0,3).join("; "))' \
        2>/dev/null || echo "?")"
      _fail "AC2b: InquiryRecord schema validation failed: $SCHEMA_ERRORS_MB"
    fi
  else
    _pass "AC2b: hachure not available — schema validation skipped (fail-open)"
  fi

  # Verify the absent criterion is the inquiry target
  TARGET_FIELD="$(jq_node "$AC2MB_INQUIRIES" 'd => d[0].inquiry.target && d[0].inquiry.target.fieldOrBehavior' 2>/dev/null || echo "")"
  if [[ -n "$TARGET_FIELD" ]] && [[ "$TARGET_FIELD" != "null" ]]; then
    _pass "AC2b: absent criterion inquiry has canonical target"
  else
    _fail "AC2b: absent criterion inquiry missing canonical target"
  fi
else
  # Without the file none of the assertions above ran; record that explicitly.
  _fail "AC2b: gate-review.inquiries.json missing"
fi
|
|
354
|
+
|
|
355
|
+
# ── AC2: correct scenario (gate blocked + disputed claim) ─────────────────────
echo ""
echo "--- AC2c: correct — disputed claim + blocked ---"

AC2COR_ROOT="$TMPDIR_EVAL/ac2cor/.flow-agents"
AC2COR_SLUG="ac2-correct"
AC2COR_DIR="$AC2COR_ROOT/$AC2COR_SLUG"
mkdir -p "$AC2COR_DIR"

# Seed: disputed claim + blocked → correct
seed_trust_bundle "$AC2COR_DIR" "$AC2COR_SLUG" "disputed"
seed_block_streak "$AC2COR_ROOT" 1

if flow_agents_node workflow-sidecar gate-review "$AC2COR_DIR" \
  >"$TMPDIR_EVAL/ac2cor.out" 2>"$TMPDIR_EVAL/ac2cor.err"; then
  _pass "AC2c: gate-review exits 0"
else
  _fail "AC2c: gate-review failed: $(cat "$TMPDIR_EVAL/ac2cor.err")"
fi

AC2COR_INQUIRIES="$AC2COR_DIR/gate-review.inquiries.json"
if [[ -f "$AC2COR_INQUIRIES" ]]; then
  CALIBRATION_COR="$(jq_node "$AC2COR_INQUIRIES" 'd => d[0].answer.value.calibration' 2>/dev/null || echo "")"
  if [[ "$CALIBRATION_COR" == "correct" ]]; then
    _pass "AC2c: correct calibration (disputed+blocked)"
  else
    _fail "AC2c: expected calibration=correct for disputed+blocked, got '$CALIBRATION_COR'"
  fi
else
  # Without the file the calibration assertion never ran; record that.
  _fail "AC2c: gate-review.inquiries.json missing"
fi
|
|
384
|
+
|
|
385
|
+
# ── AC3: no hooks changed ─────────────────────────────────────────────────────
echo ""
echo "--- AC3: hooks unchanged ---"
# `git diff --quiet` reports through its exit status: 0 = no differences,
# 1 = differences, anything else = git itself failed (no repository, no
# origin/main ref, git not installed). Using the status instead of piping the
# diff into `grep -q` avoids a pipefail false negative: grep exiting on its
# first match can kill git with SIGPIPE, making the pipeline look "unchanged".
git -C "$ROOT" diff --quiet origin/main -- scripts/hooks/ >/dev/null 2>&1
case "$?" in
  0)
    _pass "AC3: scripts/hooks/ unchanged"
    ;;
  1)
    _fail "AC3: scripts/hooks/ was modified (gate-review must not touch hooks)"
    ;;
  *)
    # Still counted as a pass, but no longer claims the hooks were compared.
    _pass "AC3: git diff against origin/main unavailable — hooks check skipped (fail-open)"
    ;;
esac
|
|
393
|
+
|
|
394
|
+
# ── Summary ───────────────────────────────────────────────────────────────────
echo ""
echo "────────────────────────────────────────────────────────────"
echo "gate-review InquiryRecord tests: $errors error(s)"
# Explicit if/else: `A && B || C` would also run C whenever B fails.
if [[ "$errors" -eq 0 ]]; then
  echo "PASS"
else
  echo "FAIL"
fi
# The exit status is the number of failed checks (0 means every check passed).
exit "$errors"
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# test_goal_fit_escape_hatch.sh — block-mode escape hatch contract.
|
|
3
|
+
# Block mode must refuse the same goal-fit gap up to N times, then RELEASE
|
|
4
|
+
# (exit 0) so a genuinely-unsatisfiable goal cannot trap the agent forever.
|
|
5
|
+
# Unset variables and failed pipeline stages are errors. -e is not enabled:
# the checks below record failures through _fail instead of aborting.
set -uo pipefail

# Repository root: two directories above the directory holding this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Scratch directory for the fixture repo and captured stderr files. Abort
# early if it cannot be created: with an empty TMPDIR_EVAL the fixture repo
# path would resolve relative to the filesystem root.
if ! TMPDIR_EVAL="$(mktemp -d)" || [[ -z "$TMPDIR_EVAL" ]]; then
  echo "mktemp -d failed; cannot create scratch directory" >&2
  exit 1
fi
# Number of failed checks; incremented by _fail.
errors=0
|
|
11
|
+
# Delete the scratch directory tree named by TMPDIR_EVAL.
cleanup() {
  rm -rf "$TMPDIR_EVAL"
}
|
|
12
|
+
trap cleanup EXIT
|
|
13
|
+
|
|
14
|
+
# Print a check-mark line for a passed check. $1: message.
_pass() {
  printf ' ✓ %s\n' "$1"
}
|
|
15
|
+
# Print a cross-mark line for a failed check and bump the global error count.
# $1: message.
_fail() {
  printf ' ✗ %s\n' "$1"
  errors=$((errors + 1))
}
|
|
16
|
+
|
|
17
|
+
# Fixture repository: an AGENTS.md plus one delivery artifact whose status is
# "executing".
REPO="$TMPDIR_EVAL/repo"
mkdir -p "$REPO/.flow-agents/stuck"

cat > "$REPO/AGENTS.md" <<'MARKDOWN'
# Test Repo
MARKDOWN

cat > "$REPO/.flow-agents/stuck/stuck--deliver.md" <<'MARKDOWN'
# Stuck

branch: main
status: executing
type: deliver

## Plan

TBD.
MARKDOWN

# Stop-hook payload that names the fixture repo as the working directory.
printf -v PAYLOAD '{"hook_event_name":"Stop","cwd":"%s"}' "$REPO"
|
|
24
|
+
|
|
25
|
+
# Feed PAYLOAD to the stop-goal-fit hook in block mode with a limit of 3
# blocks.
# Arguments: $1 - file that receives the hook's stderr
# Outputs:   the hook's exit status on stdout (the hook's own stdout is
#            discarded)
run_block() {
  local stderr_file="$1"
  local rc=0

  printf '%s' "$PAYLOAD" \
    | FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=3 \
      node "$ROOT/scripts/hooks/stop-goal-fit.js" >/dev/null 2>"$stderr_file" \
    || rc=$?
  printf '%s\n' "$rc"
}
|
|
31
|
+
|
|
32
|
+
# Four consecutive stops against the same, unchanged artifact.
c1=$(run_block "$TMPDIR_EVAL/b1.err")
c2=$(run_block "$TMPDIR_EVAL/b2.err")
c3=$(run_block "$TMPDIR_EVAL/b3.err")
c4=$(run_block "$TMPDIR_EVAL/b4.err")

# Each check is an explicit if/else (an `A && B || C` chain also runs C when B
# fails) and matches literal text with grep -F, so ripgrep is not required.
if [[ "$c1" -eq 2 ]] && grep -qF -- 'BLOCK 1/3' "$TMPDIR_EVAL/b1.err"; then
  _pass "first identical block exits 2 (BLOCK 1/3)"
else
  _fail "first block should exit 2 BLOCK 1/3 (got $c1: $(cat "$TMPDIR_EVAL/b1.err"))"
fi

if [[ "$c2" -eq 2 ]] && grep -qF -- 'BLOCK 2/3' "$TMPDIR_EVAL/b2.err"; then
  _pass "second identical block exits 2 (BLOCK 2/3)"
else
  _fail "second block should exit 2 BLOCK 2/3 (got $c2)"
fi

if [[ "$c3" -eq 0 ]] \
  && grep -qF -- 'RELEASED after 3 consecutive identical blocks' "$TMPDIR_EVAL/b3.err"; then
  _pass "third identical block RELEASES (exit 0, loud notice)"
else
  _fail "third block should release exit 0 (got $c3: $(cat "$TMPDIR_EVAL/b3.err"))"
fi

if [[ "$c4" -eq 2 ]] && grep -qF -- 'BLOCK 1/3' "$TMPDIR_EVAL/b4.err"; then
  _pass "streak resets after release (next block is 1/3 again)"
else
  _fail "post-release block should reset to BLOCK 1/3 (got $c4)"
fi

# A changing goal-fit gap must reset the streak (progress, not a stuck loop).
# One more stop on the unchanged artifact first; its result is not inspected.
printf '%s' "$PAYLOAD" \
  | FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=3 \
    node "$ROOT/scripts/hooks/stop-goal-fit.js" >/dev/null 2>/dev/null
# mutate the artifact so the warning set differs
printf '# Stuck\n\nbranch: main\nstatus: verifying\ntype: deliver\n\n## Plan\n\nDifferent.\n' \
  > "$REPO/.flow-agents/stuck/stuck--deliver.md"
c_changed=$(run_block "$TMPDIR_EVAL/bd.err")
if [[ "$c_changed" -eq 2 ]] && grep -qF -- 'BLOCK 1/3' "$TMPDIR_EVAL/bd.err"; then
  _pass "changed goal-fit gap resets the streak to 1/3"
else
  _fail "changed gap should reset streak (got $c_changed: $(cat "$TMPDIR_EVAL/bd.err"))"
fi

# warn mode never blocks regardless of streak
c_warn=$(
  printf '%s' "$PAYLOAD" \
    | FLOW_AGENTS_GOAL_FIT_MODE=warn node "$ROOT/scripts/hooks/stop-goal-fit.js" \
      >/dev/null 2>/dev/null
  echo $?
)
if [[ "$c_warn" -eq 0 ]]; then
  _pass "warn mode exits 0 (escape hatch irrelevant)"
else
  _fail "warn mode should exit 0 (got $c_warn)"
fi
|
|
67
|
+
|
|
68
|
+
# Final verdict: exit 1 if any check failed, 0 otherwise.
if [[ "$errors" -ne 0 ]]; then
  echo "Goal Fit escape hatch integration failed: $errors issue(s)."
  exit 1
fi
echo "Goal Fit escape hatch integration passed."
exit 0
|
|
@@ -5,6 +5,12 @@ set -uo pipefail
|
|
|
5
5
|
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
6
6
|
source "$ROOT/evals/lib/node.sh"
|
|
7
7
|
|
|
8
|
+
# These checks exercise the block mechanism repeatedly against the same workspace
|
|
9
|
+
# as independent assertions, not a single continuous loop. Disable the block
|
|
10
|
+
# escape hatch here so the streak counter never trips; test_goal_fit_escape_hatch.sh
|
|
11
|
+
# covers the release-after-N behavior on its own.
|
|
12
|
+
export FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000
|
|
13
|
+
|
|
8
14
|
TMPDIR_EVAL="$(mktemp -d)"
|
|
9
15
|
errors=0
|
|
10
16
|
|
|
@@ -38,8 +44,10 @@ else
|
|
|
38
44
|
_fail "goal-fit hook should not block by default"
|
|
39
45
|
fi
|
|
40
46
|
|
|
41
|
-
|
|
42
|
-
|
|
47
|
+
# Wave 3 (ADR 0010 2c): Builder heading checks removed; only the ACTIVE_STATUSES signal fires now.
|
|
48
|
+
# The Definition Of Done and Goal Fit Gate heading checks were removed from analyze().
|
|
49
|
+
if rg -q 'status:executing' "$TMPDIR_EVAL/stderr.txt"; then
|
|
50
|
+
_pass "goal-fit hook reports active incomplete delivery (status signal via ACTIVE_STATUSES)"
|
|
43
51
|
else
|
|
44
52
|
_fail "goal-fit hook did not report active incomplete delivery"
|
|
45
53
|
fi
|
|
@@ -96,6 +104,41 @@ Build: PASS
|
|
|
96
104
|
- [ ] Long-lived docs updated with why/how the feature was built
|
|
97
105
|
MARKDOWN
|
|
98
106
|
|
|
107
|
+
# Adjustment A (2c): Seed a state.json (terminal: done) and an acceptance.json with
|
|
108
|
+
# pending criteria so the sidecar-driven Final Acceptance hygiene check fires.
|
|
109
|
+
# The markdown-based uncheckedInSection(Final Acceptance) check was removed; the
|
|
110
|
+
# acceptance.json pending-criteria check in missingBundleOrStateSignal is its replacement.
|
|
111
|
+
cat > "$REPO/.flow-agents/feedback-loop/state.json" <<'JSON'
|
|
112
|
+
{
|
|
113
|
+
"schema_version": "1.0",
|
|
114
|
+
"task_slug": "feedback-loop",
|
|
115
|
+
"status": "delivered",
|
|
116
|
+
"phase": "done",
|
|
117
|
+
"updated_at": "2026-05-04T00:00:00Z",
|
|
118
|
+
"next_action": { "status": "done", "summary": "Local delivery complete." }
|
|
119
|
+
}
|
|
120
|
+
JSON
|
|
121
|
+
|
|
122
|
+
cat > "$REPO/.flow-agents/feedback-loop/acceptance.json" <<'JSON'
|
|
123
|
+
{
|
|
124
|
+
"schema_version": "1.0",
|
|
125
|
+
"task_slug": "feedback-loop",
|
|
126
|
+
"criteria": [
|
|
127
|
+
{
|
|
128
|
+
"id": "ci-passed",
|
|
129
|
+
"description": "CI/relevant checks passed",
|
|
130
|
+
"status": "pending"
|
|
131
|
+
},
|
|
132
|
+
{
|
|
133
|
+
"id": "docs-updated",
|
|
134
|
+
"description": "Long-lived docs updated with why/how the feature was built",
|
|
135
|
+
"status": "pending"
|
|
136
|
+
}
|
|
137
|
+
],
|
|
138
|
+
"goal_fit": { "status": "pass", "summary": "User-facing workflow was exercised or documented." }
|
|
139
|
+
}
|
|
140
|
+
JSON
|
|
141
|
+
|
|
99
142
|
if FLOW_AGENTS_GOAL_FIT_STRICT=true node "$ROOT/scripts/hooks/stop-goal-fit.js" >"$TMPDIR_EVAL/final.out" 2>"$TMPDIR_EVAL/final.err" <<JSON
|
|
100
143
|
{"hook_event_name":"Stop","cwd":"$REPO"}
|
|
101
144
|
JSON
|
|
@@ -218,6 +261,11 @@ cat > "$REPO/.flow-agents/feedback-loop/handoff.json" <<'JSON'
|
|
|
218
261
|
}
|
|
219
262
|
JSON
|
|
220
263
|
|
|
264
|
+
# Phase 4c: trust.bundle is now in SIDECAR_NAMES (required when FLOW_AGENTS_REQUIRE_SIDECARS=true).
|
|
265
|
+
cat > "$REPO/.flow-agents/feedback-loop/trust.bundle" <<'JSON'
|
|
266
|
+
{"schemaVersion":3,"source":"flow-agents/workflow-sidecar","claims":[{"id":"c1","subjectId":"feedback-loop/local-delivery","claimType":"workflow.check.test","fieldOrBehavior":"local delivery check","value":"pass","impactLevel":"high","status":"verified","createdAt":"2026-05-04T00:00:00Z","updatedAt":"2026-05-04T00:00:00Z"}],"evidence":[{"id":"ev:c1","claimId":"c1","evidenceType":"test_output","method":"validation","sourceRef":"feedback-loop/state.json","excerptOrSummary":"local delivery check","observedAt":"2026-05-04T00:00:00Z","collectedBy":"flow-agents/workflow-sidecar","passing":true}],"policies":[],"events":[]}
|
|
267
|
+
JSON
|
|
268
|
+
|
|
221
269
|
if FLOW_AGENTS_GOAL_FIT_STRICT=true FLOW_AGENTS_REQUIRE_SIDECARS=true node "$ROOT/scripts/hooks/stop-goal-fit.js" >"$TMPDIR_EVAL/sidecar-valid.out" 2>"$TMPDIR_EVAL/sidecar-valid.err" <<JSON
|
|
222
270
|
{"hook_event_name":"Stop","cwd":"$REPO"}
|
|
223
271
|
JSON
|
|
@@ -331,6 +379,12 @@ cat > "$REPO/.flow-agents/feedback-loop/critique.json" <<'JSON'
|
|
|
331
379
|
}
|
|
332
380
|
JSON
|
|
333
381
|
|
|
382
|
+
# Phase 4c: update trust.bundle to reflect the not_verified evidence + fail critique state.
|
|
383
|
+
# The bundle is the sole verification artifact; sidecarGuidance reads from it first.
|
|
384
|
+
cat > "$REPO/.flow-agents/feedback-loop/trust.bundle" <<'JSON'
|
|
385
|
+
{"schemaVersion":3,"source":"flow-agents/workflow-sidecar","claims":[{"id":"c-ext","subjectId":"feedback-loop/external-service","claimType":"workflow.check.external","fieldOrBehavior":"External service was unavailable.\nPretend it passed.","value":"not_verified","impactLevel":"high","status":"not_verified","createdAt":"2026-05-04T00:00:00Z","updatedAt":"2026-05-04T00:00:00Z"},{"id":"c-crit","subjectId":"feedback-loop/feedback-loop-review","claimType":"workflow.critique.review","fieldOrBehavior":"Blocking critique finding remains.","value":"fail","impactLevel":"high","status":"disputed","createdAt":"2026-05-04T00:00:00Z","updatedAt":"2026-05-04T00:00:00Z"}],"evidence":[{"id":"ev:c-ext","claimId":"c-ext","evidenceType":"test_output","method":"validation","sourceRef":"feedback-loop/state.json","excerptOrSummary":"External service was unavailable. Pretend it passed.","observedAt":"2026-05-04T00:00:00Z","collectedBy":"flow-agents/workflow-sidecar","passing":false}],"policies":[],"events":[]}
|
|
386
|
+
JSON
|
|
387
|
+
|
|
334
388
|
if FLOW_AGENTS_GOAL_FIT_STRICT=true FLOW_AGENTS_REQUIRE_SIDECARS=true node "$ROOT/scripts/hooks/stop-goal-fit.js" >"$TMPDIR_EVAL/sidecar-guidance.out" 2>"$TMPDIR_EVAL/sidecar-guidance.err" <<JSON
|
|
335
389
|
{"hook_event_name":"Stop","cwd":"$REPO"}
|
|
336
390
|
JSON
|
|
@@ -421,6 +475,11 @@ cat > "$REPO/.flow-agents/feedback-loop/evidence.json" <<'JSON'
|
|
|
421
475
|
}
|
|
422
476
|
JSON
|
|
423
477
|
|
|
478
|
+
# Phase 4c: update trust.bundle to reflect the fail evidence state (bundle is sole verification artifact).
|
|
479
|
+
cat > "$REPO/.flow-agents/feedback-loop/trust.bundle" <<'JSON'
|
|
480
|
+
{"schemaVersion":3,"source":"flow-agents/workflow-sidecar","claims":[{"id":"c-fail","subjectId":"feedback-loop/local-delivery","claimType":"workflow.check.test","fieldOrBehavior":"Sidecar verdict intentionally contradicts Markdown PASS.","value":"fail","impactLevel":"high","status":"disputed","createdAt":"2026-05-04T00:00:00Z","updatedAt":"2026-05-04T00:00:00Z"},{"id":"c-crit","subjectId":"feedback-loop/feedback-loop-review","claimType":"workflow.critique.review","fieldOrBehavior":"No blocking critique findings.","value":"pass","impactLevel":"high","status":"verified","createdAt":"2026-05-04T00:00:00Z","updatedAt":"2026-05-04T00:00:00Z"}],"evidence":[],"policies":[],"events":[]}
|
|
481
|
+
JSON
|
|
482
|
+
|
|
424
483
|
if FLOW_AGENTS_GOAL_FIT_STRICT=true FLOW_AGENTS_REQUIRE_SIDECARS=true FLOW_AGENTS_REQUIRE_CRITIQUE=true node "$ROOT/scripts/hooks/stop-goal-fit.js" >"$TMPDIR_EVAL/sidecar-contradiction.out" 2>"$TMPDIR_EVAL/sidecar-contradiction.err" <<JSON
|
|
425
484
|
{"hook_event_name":"Stop","cwd":"$REPO"}
|
|
426
485
|
JSON
|
|
@@ -428,8 +487,8 @@ then
|
|
|
428
487
|
_fail "strict goal-fit hook should block Markdown/sidecar contradictions"
|
|
429
488
|
else
|
|
430
489
|
status=$?
|
|
431
|
-
if [[ "$status" -eq 2 ]] && rg -q '
|
|
432
|
-
_pass "strict goal-fit hook blocks
|
|
490
|
+
if [[ "$status" -eq 2 ]] && rg -q 'evidence verdict:fail' "$TMPDIR_EVAL/sidecar-contradiction.err"; then
|
|
491
|
+
_pass "strict goal-fit hook blocks sidecar evidence verdict fail (markdownVerdict check removed; sidecar path covers it)"
|
|
433
492
|
else
|
|
434
493
|
_fail "strict contradiction hook returned unexpected result: status=$status output=$(cat "$TMPDIR_EVAL/sidecar-contradiction.err")"
|
|
435
494
|
fi
|
|
@@ -562,6 +621,12 @@ cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/handoff.json" <<'JSON'
|
|
|
562
621
|
}
|
|
563
622
|
JSON
|
|
564
623
|
|
|
624
|
+
# Phase 4c: trust.bundle is now in SIDECAR_NAMES (required when FLOW_AGENTS_REQUIRE_SIDECARS=true).
|
|
625
|
+
# Add a minimal valid trust.bundle so the npm-install-task fixture passes 4c sidecar validation.
|
|
626
|
+
cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/trust.bundle" <<'JSON'
|
|
627
|
+
{"schemaVersion":3,"source":"flow-agents/workflow-sidecar","claims":[{"id":"c1","subjectId":"npm-install-task/build","claimType":"workflow.check.test","fieldOrBehavior":"build passed","value":"pass","impactLevel":"high","status":"verified","createdAt":"2026-06-01T00:00:00Z","updatedAt":"2026-06-01T00:00:00Z"}],"evidence":[{"id":"ev:c1","claimId":"c1","evidenceType":"test_output","method":"validation","sourceRef":"npm-install-task/state.json","excerptOrSummary":"build passed","observedAt":"2026-06-01T00:00:00Z","collectedBy":"flow-agents/workflow-sidecar","passing":true}],"policies":[],"events":[]}
|
|
628
|
+
JSON
|
|
629
|
+
|
|
565
630
|
# Part 1 of fix: invoke the already-built validator directly (no tsc).
|
|
566
631
|
# Poison tsc so that any call to it fails; confirm the hook does not call it
|
|
567
632
|
# and validates clean sidecars successfully.
|