npm - @kontourai/flow-agents - Versions diffs - 1.4.0 → 2.0.1 - Mend

@kontourai/flow-agents 1.4.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (184) hide show

package/.github/CODEOWNERS +29 -0
package/.github/actions/trust-verify/action.yml +145 -0
package/.github/workflows/ci.yml +11 -4
package/.github/workflows/kit-gates-demo.yml +2 -2
package/.github/workflows/publish-npm.yml +10 -2
package/.github/workflows/release-please.yml +1 -1
package/.github/workflows/runtime-compat.yml +1 -1
package/.github/workflows/trust-reconcile.yml +113 -0
package/AGENTS.md +13 -0
package/CHANGELOG.md +103 -0
package/CONTRIBUTING.md +4 -4
package/README.md +1 -0
package/agents/tool-planner.json +1 -1
package/build/src/cli/init.js +242 -20
package/build/src/cli/validate-workflow-artifacts.js +19 -2
package/build/src/cli/verify.d.ts +1 -0
package/build/src/cli/verify.js +90 -0
package/build/src/cli/workflow-sidecar.d.ts +316 -8
package/build/src/cli/workflow-sidecar.js +1996 -91
package/build/src/cli.js +2 -3
package/build/src/lib/flow-resolver.d.ts +111 -0
package/build/src/lib/flow-resolver.js +308 -0
package/build/src/tools/build-universal-bundles.js +34 -22
package/build/src/tools/generate-context-map.js +3 -16
package/build/src/tools/validate-source-tree.d.ts +1 -1
package/build/src/tools/validate-source-tree.js +42 -162
package/context/contracts/artifact-contract.md +10 -0
package/context/contracts/delivery-contract.md +1 -0
package/context/contracts/review-contract.md +1 -0
package/context/contracts/verification-contract.md +2 -0
package/context/gate-awareness.md +39 -0
package/context/scripts/hooks/stop-goal-fit.js +632 -70
package/docs/adr/0001-flow-agents-consumes-flow.md +1 -1
package/docs/adr/0002-flow-kits-as-extension-unit.md +1 -1
package/docs/adr/0004-gates-expect-surface-claims.md +2 -0
package/docs/adr/0005-kubernetes-inspired-resource-contracts.md +2 -0
package/docs/adr/0007-skill-audit.md +1 -1
package/docs/adr/0009-canonical-hook-core-kit-boundary.md +95 -0
package/docs/adr/0010-workflow-trust-state-as-hachure-bundle.md +139 -0
package/docs/adr/0011-mcp-posture.md +100 -0
package/docs/adr/0012-agent-coordination-as-liveness-claims.md +119 -0
package/docs/adr/0013-context-lifecycle.md +151 -0
package/docs/adr/0014-core-vs-domain-kit-boundary.md +143 -0
package/docs/adr/0015-flow-flow-agents-boundary-reconciliation.md +120 -0
package/docs/adr/0016-three-hard-boundary-model.md +71 -0
package/docs/adr/0017-anti-gaming-trust-security-model.md +155 -0
package/docs/agent-system-guidebook.md +5 -12
package/docs/context-map.md +4 -10
package/docs/index.md +3 -2
package/docs/integrations/framework-adapter.md +19 -6
package/docs/integrations/index.md +2 -2
package/docs/north-star.md +4 -4
package/docs/operating-layers.md +3 -3
package/docs/plans/adr-0010-phase2-gate-recompute.md +55 -0
package/docs/repository-structure.md +2 -2
package/docs/skills-map.md +1 -0
package/docs/spec/runtime-hook-surface.md +62 -9
package/docs/standards-register.md +3 -3
package/docs/survey-utterance-check.md +1 -1
package/docs/trust-anchor-adoption.md +197 -0
package/docs/verifiable-trust.md +95 -0
package/docs/veritas-integration.md +2 -2
package/docs/workflow-usage-guide.md +69 -0
package/evals/acceptance/DEMO-false-completion.md +144 -0
package/evals/acceptance/demo-cast.sh +92 -0
package/evals/acceptance/demo-false-completion.sh +72 -0
package/evals/acceptance/demo-real-evidence.sh +104 -0
package/evals/acceptance/demo.tape +29 -0
package/evals/acceptance/prove-capture-teeth-declared.sh +335 -0
package/evals/acceptance/prove-capture-teeth.sh +114 -0
package/evals/acceptance/prove-teeth.sh +105 -0
package/evals/ci/antigaming-suite.sh +55 -0
package/evals/ci/run-baseline.sh +2 -0
package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/flows/review.flow.json +26 -0
package/evals/fixtures/flow-kit-repository/invalid-missing-extension-asset/kit.json +20 -0
package/evals/fixtures/flow-kit-repository/valid-unknown-extension/flows/review.flow.json +26 -0
package/evals/fixtures/flow-kit-repository/valid-unknown-extension/kit.json +18 -0
package/evals/integration/test_builder_step_producers.sh +379 -0
package/evals/integration/test_bundle_install.sh +35 -71
package/evals/integration/test_bundle_lifecycle.sh +39 -2
package/evals/integration/test_captured_fail_reconciliation.sh +820 -0
package/evals/integration/test_checkpoint_signing.sh +489 -0
package/evals/integration/test_claim_lookup.sh +352 -0
package/evals/integration/test_command_log_fork_classification.sh +134 -0
package/evals/integration/test_command_log_integrity.sh +275 -0
package/evals/integration/test_context_map.sh +0 -2
package/evals/integration/test_dual_emit_flow_step.sh +278 -0
package/evals/integration/test_enforcer_expects_driven.sh +281 -0
package/evals/integration/test_evidence_capture_hook.sh +185 -0
package/evals/integration/test_flow_kit_repository.sh +2 -0
package/evals/integration/test_flowdef_session_activation.sh +273 -0
package/evals/integration/test_flowdef_session_history_preservation.sh +250 -0
package/evals/integration/test_gate_bypass_chain.sh +448 -0
package/evals/integration/test_gate_lockdown.sh +1137 -0
package/evals/integration/test_gate_review_inquiry_records.sh +399 -0
package/evals/integration/test_goal_fit_escape_hatch.sh +73 -0
package/evals/integration/test_goal_fit_hook.sh +69 -4
package/evals/integration/test_goal_fit_rederive.sh +263 -0
package/evals/integration/test_install_merge.sh +1176 -0
package/evals/integration/test_kit_identity_trust.sh +393 -0
package/evals/integration/test_mint_attestation.sh +373 -0
package/evals/integration/test_phase_map_and_gate_claim.sh +365 -0
package/evals/integration/test_publish_delivery.sh +269 -0
package/evals/integration/test_reconcile_soundness.sh +528 -0
package/evals/integration/test_resolvefirststep_security.sh +208 -0
package/evals/integration/test_session_resume_roundtrip.sh +286 -0
package/evals/integration/test_trust_checkpoint.sh +325 -0
package/evals/integration/test_trust_reconcile.sh +293 -0
package/evals/integration/test_verify_cli.sh +208 -0
package/evals/integration/test_workflow_sidecar_writer.sh +549 -34
package/evals/lib/node.sh +0 -6
package/evals/run.sh +47 -0
package/evals/static/test_workflow_skills.sh +6 -13
package/install.sh +0 -7
package/integrations/strands-ts/README.md +25 -15
package/integrations/veritas/flow-agents.adapter.json +1 -2
package/kits/builder/flows/build.flow.json +59 -12
package/kits/builder/kit.json +85 -15
package/kits/builder/skills/continue-work/SKILL.md +116 -0
package/kits/builder/skills/deliver/SKILL.md +36 -6
package/kits/builder/skills/design-probe/SKILL.md +28 -0
package/kits/builder/skills/execute-plan/SKILL.md +9 -1
package/kits/builder/skills/gate-review/SKILL.md +234 -0
package/kits/builder/skills/learning-review/SKILL.md +30 -0
package/kits/builder/skills/pickup-probe/SKILL.md +29 -0
package/kits/builder/skills/plan-work/SKILL.md +13 -1
package/kits/builder/skills/pull-work/SKILL.md +19 -0
package/kits/knowledge/adapters/default-store/index.js +38 -0
package/kits/knowledge/adapters/flow-runner/index.js +1620 -0
package/kits/knowledge/adapters/obsidian-store/index.js +36 -6
package/kits/knowledge/docs/store-contract.md +314 -0
package/kits/knowledge/evals/audit-freshness/suite.test.js +368 -0
package/kits/knowledge/evals/canonicalize-category/suite.test.js +383 -0
package/kits/knowledge/evals/contract-suite/suite.test.js +111 -0
package/kits/knowledge/evals/detect-contradictions/suite.test.js +324 -0
package/kits/knowledge/evals/entities/suite.test.js +40 -0
package/kits/knowledge/evals/glossary-sync/suite.test.js +416 -0
package/kits/knowledge/evals/hygiene-review/suite.test.js +396 -0
package/kits/knowledge/evals/retirement/suite.test.js +145 -0
package/kits/knowledge/flows/audit-freshness.flow.json +44 -0
package/kits/knowledge/flows/canonicalize-category.flow.json +44 -0
package/kits/knowledge/flows/detect-contradictions.flow.json +44 -0
package/kits/knowledge/flows/glossary-sync.flow.json +61 -0
package/kits/knowledge/flows/hygiene-review.flow.json +43 -0
package/kits/knowledge/kit.json +51 -1
package/package.json +6 -6
package/packaging/conformance/README.md +10 -2
package/packaging/conformance/fixtures/evidence-capture--allow-records-command.json +29 -0
package/packaging/conformance/fixtures/stop-goal-fit--block-bundle-disputed-claim.json +29 -0
package/packaging/conformance/fixtures/stop-goal-fit--block-capture-contradicts-claimed-pass.json +30 -0
package/packaging/conformance/fixtures/stop-goal-fit--block-mode.json +23 -0
package/packaging/conformance/fixtures/stop-goal-fit--off-mode.json +24 -0
package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +5 -2
package/packaging/conformance/fixtures/stop-goal-fit--warn-no-bundle.json +23 -0
package/packaging/conformance/fixtures/workflow-steering--reground-active-prompt.json +30 -0
package/packaging/conformance/fixtures/workflow-steering--reground-session-start.json +30 -0
package/packaging/conformance/run-conformance.js +1 -1
package/scripts/README.md +2 -1
package/scripts/build-universal-bundles.js +0 -1
package/scripts/ci/mint-attestation.js +221 -0
package/scripts/ci/trust-reconcile.js +545 -0
package/scripts/hooks/config-protection.js +423 -1
package/scripts/hooks/evidence-capture.js +348 -0
package/scripts/hooks/lib/liveness-read.js +113 -0
package/scripts/hooks/run-hook.js +6 -1
package/scripts/hooks/stop-goal-fit.js +1524 -79
package/scripts/hooks/workflow-steering.js +135 -5
package/scripts/install-codex-home.sh +39 -0
package/scripts/install-merge.js +330 -0
package/scripts/repair-command-log.js +115 -0
package/src/cli/init.ts +218 -20
package/src/cli/validate-workflow-artifacts.ts +18 -2
package/src/cli/verify.ts +100 -0
package/src/cli/workflow-sidecar.ts +2127 -84
package/src/cli.ts +2 -3
package/src/lib/flow-resolver.ts +369 -0
package/src/tools/build-universal-bundles.ts +34 -21
package/src/tools/generate-context-map.ts +3 -17
package/src/tools/validate-source-tree.ts +44 -104
package/build/src/tools/filter-installed-packs.d.ts +0 -2
package/build/src/tools/filter-installed-packs.js +0 -135
package/packaging/packs.json +0 -49
package/scripts/filter-installed-packs.js +0 -2
package/src/tools/filter-installed-packs.ts +0 -132

package/evals/integration/test_gate_review_inquiry_records.sh ADDED Viewed

@@ -0,0 +1,399 @@
+#!/usr/bin/env bash
+# test_gate_review_inquiry_records.sh — AC1 + AC2 integration tests for gate-review #119.
+#
+# Verifies that the gate-review subcommand emits canonical InquiryRecords
+# (gate-review.inquiries.json) validated against hachure inquiry-record.schema.json.
+#
+# AC1: a session with a gate event yields ≥1 InquiryRecord.
+# AC2: false_block scenario (claim verified + block) and missed_block scenario
+#      (expected claim absent) each yield a distinct InquiryRecord with the
+#      correct calibration + non-empty advisoryFix.
+#
+# Seed is deterministic: same inputs → same outputs. Surface is loaded from the
+# installed optional dependency (@kontourai/surface).
+set -uo pipefail
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+source "$ROOT/evals/lib/node.sh"
+TMPDIR_EVAL="$(mktemp -d)"
+errors=0
+cleanup() { rm -rf "$TMPDIR_EVAL"; }
+trap cleanup EXIT
+_pass() { echo "  ✓ $1"; }
+_fail() { echo "  ✗ $1"; errors=$((errors + 1)); }
+echo "=== Gate Review InquiryRecord Tests (AC1 + AC2) ==="
+# ── helpers ──────────────────────────────────────────────────────────────────
+# JSON query helper using node (no jq dependency).
+# Args:
+#   $1: file   path of the JSON document to read
+#   $2: expr   JavaScript function expression; it is spliced into the script as
+#              code and called with the parsed document, so it must be written
+#              by the test itself and never come from external input
+# Output: a boolean, number or string result is printed as-is, anything else is
+#         printed as JSON; a newline follows in both cases.
+# Exit:   2 when the expression yields null or undefined.
+jq_node() {
+  local file="$1" expr="$2"
+  # The path is handed over as an argument rather than pasted into the script
+  # text, so a quote or backslash in it cannot break the JavaScript literal.
+  node -e "
+const d=JSON.parse(require('fs').readFileSync(process.argv[1],'utf8'));
+const r=(${expr})(d);
+if(r===undefined||r===null){process.exit(2);}
+if(typeof r==='boolean'||typeof r==='number'||typeof r==='string'){
+  process.stdout.write(String(r)+'\n');
+}else{
+  process.stdout.write(JSON.stringify(r)+'\n');
+}" -- "$file"
+}
+# Seed a minimal trust.bundle that holds one workflow-check claim. Args:
+#   $1: dir       session artifact dir (the file is written to $1/trust.bundle)
+#   $2: slug      session slug, used as prefix of the claim id and subject id
+#   $3: status    claim status (verified|disputed|assumed|stale|unknown)
+seed_trust_bundle() {
+  local dir="$1" slug="$2" status="$3"
+  local stamp="2026-06-24T00:00:00Z"
+  local claim_id="${slug}/unit-tests.flow-agents.workflow.unit tests pass"
+  # Only "verified" and "disputed" get a matching event entry; every other
+  # status leaves the events array empty.
+  local events="[]"
+  case "$status" in
+    verified|disputed)
+      events="$(printf '[{"id":"evt:%s","claimId":"%s","status":"%s","actor":"gate-review-test","method":"validation","evidenceIds":[],"createdAt":"%s","verifiedAt":"%s"}]' \
+        "$claim_id" "$claim_id" "$status" "$stamp" "$stamp")"
+      ;;
+  esac
+  cat > "$dir/trust.bundle" <<JSON
+{
+  "schemaVersion": 3,
+  "source": "gate-review-test;statusFunctionVersion=1",
+  "claims": [
+    {
+      "id": "$claim_id",
+      "subjectType": "workflow-check",
+      "subjectId": "$slug/unit-tests",
+      "surface": "flow-agents.workflow",
+      "claimType": "workflow.check.test",
+      "fieldOrBehavior": "unit tests pass",
+      "value": "pass",
+      "status": "$status",
+      "createdAt": "$stamp",
+      "updatedAt": "$stamp"
+    }
+  ],
+  "evidence": [],
+  "events": $events,
+  "policies": []
+}
+JSON
+}
+# Set the gate block streak file ($1: root, $2: count)
+seed_block_streak() {
+  local root="$1" count="$2"
+  if [[ "$count" -gt 0 ]]; then
+    printf '{"count":%d,"hash":"testHash001"}' "$count" > "$root/.goal-fit-block-streak.json"
+  else
+    rm -f "$root/.goal-fit-block-streak.json"
+  fi
+}
+# Remove the block streak file
+clear_block_streak() {
+  rm -f "$1/.goal-fit-block-streak.json"
+}
+# ── AC1: session with a gate event → ≥1 InquiryRecord ───────────────────────
+echo ""
+echo "--- AC1: gate event → ≥1 InquiryRecord ---"
+AC1_ROOT="$TMPDIR_EVAL/ac1/.flow-agents"
+AC1_SLUG="ac1-session"
+AC1_DIR="$AC1_ROOT/$AC1_SLUG"
+mkdir -p "$AC1_DIR"
+# Seed: verified claim + blocked (false_block scenario for AC1)
+seed_trust_bundle "$AC1_DIR" "$AC1_SLUG" "verified"
+seed_block_streak "$AC1_ROOT" 1
+if flow_agents_node workflow-sidecar gate-review "$AC1_DIR" \
+  >"$TMPDIR_EVAL/ac1.out" 2>"$TMPDIR_EVAL/ac1.err"; then
+  _pass "AC1: gate-review exits 0"
+else
+  _fail "AC1: gate-review failed: $(cat "$TMPDIR_EVAL/ac1.err")"
+fi
+AC1_INQUIRIES="$AC1_DIR/gate-review.inquiries.json"
+if [[ -f "$AC1_INQUIRIES" ]]; then
+  _pass "AC1: gate-review.inquiries.json emitted"
+else
+  _fail "AC1: gate-review.inquiries.json missing"
+fi
+if [[ -f "$AC1_INQUIRIES" ]]; then
+  AC1_COUNT="$(jq_node "$AC1_INQUIRIES" 'd => d.length' 2>/dev/null || echo 0)"
+  if [[ "$AC1_COUNT" -ge 1 ]]; then
+    _pass "AC1: ≥1 InquiryRecord present (count=$AC1_COUNT)"
+  else
+    _fail "AC1: expected ≥1 InquiryRecord, got $AC1_COUNT"
+  fi
+  # Verify each record has required schema fields
+  MISSING_FIELDS="$(node -e "
+const records=JSON.parse(require('fs').readFileSync('$AC1_INQUIRIES','utf8'));
+const required=['id','inquiry','outcome','resolutionPath','inputSnapshot','statusFunctionVersion','resolvedAt'];
+const missing=[];
+for(const [i,r] of records.entries()){
+  for(const f of required){
+    if(!(f in r)) missing.push('record['+i+'].'+f);
+  }
+}
+process.stdout.write(missing.join(','));
+" 2>/dev/null)"
+  if [[ -z "$MISSING_FIELDS" ]]; then
+    _pass "AC1: all InquiryRecords have required schema fields"
+  else
+    _fail "AC1: InquiryRecords missing fields: $MISSING_FIELDS"
+  fi
+  # Verify each record has non-empty advisoryFix in answer.value
+  EMPTY_FIX="$(node -e "
+const records=JSON.parse(require('fs').readFileSync('$AC1_INQUIRIES','utf8'));
+const bad=records.filter(r=>!r.answer||!r.answer.value||!r.answer.value.advisoryFix);
+process.stdout.write(bad.map(r=>r.id).join(','));
+" 2>/dev/null)"
+  if [[ -z "$EMPTY_FIX" ]]; then
+    _pass "AC1: all InquiryRecords have non-empty advisoryFix"
+  else
+    _fail "AC1: InquiryRecords with empty/missing advisoryFix: $EMPTY_FIX"
+  fi
+fi
+# ── AC2: false_block scenario ─────────────────────────────────────────────────
+echo ""
+echo "--- AC2a: false_block — verified claim + blocked ---"
+AC2FB_ROOT="$TMPDIR_EVAL/ac2fb/.flow-agents"
+AC2FB_SLUG="ac2-false-block"
+AC2FB_DIR="$AC2FB_ROOT/$AC2FB_SLUG"
+mkdir -p "$AC2FB_DIR"
+# Seed: verified claim + blocked → false_block
+seed_trust_bundle "$AC2FB_DIR" "$AC2FB_SLUG" "verified"
+seed_block_streak "$AC2FB_ROOT" 2
+if flow_agents_node workflow-sidecar gate-review "$AC2FB_DIR" \
+  >"$TMPDIR_EVAL/ac2fb.out" 2>"$TMPDIR_EVAL/ac2fb.err"; then
+  _pass "AC2a: gate-review exits 0"
+else
+  _fail "AC2a: gate-review failed: $(cat "$TMPDIR_EVAL/ac2fb.err")"
+fi
+AC2FB_INQUIRIES="$AC2FB_DIR/gate-review.inquiries.json"
+if [[ -f "$AC2FB_INQUIRIES" ]]; then
+  # outcome must be "matched" (claim exists in bundle)
+  OUTCOME="$(jq_node "$AC2FB_INQUIRIES" 'd => d[0].outcome' 2>/dev/null || echo "")"
+  if [[ "$OUTCOME" == "matched" ]]; then
+    _pass "AC2a: false_block InquiryRecord has outcome=matched"
+  else
+    _fail "AC2a: expected outcome=matched, got '$OUTCOME'"
+  fi
+  # calibration must be false_block
+  CALIBRATION="$(jq_node "$AC2FB_INQUIRIES" 'd => d[0].answer.value.calibration' 2>/dev/null || echo "")"
+  if [[ "$CALIBRATION" == "false_block" ]]; then
+    _pass "AC2a: false_block calibration correct"
+  else
+    _fail "AC2a: expected calibration=false_block, got '$CALIBRATION'"
+  fi
+  # advisoryFix must be non-empty
+  ADVISORY="$(jq_node "$AC2FB_INQUIRIES" 'd => d[0].answer.value.advisoryFix' 2>/dev/null || echo "")"
+  if [[ -n "$ADVISORY" ]] && [[ "$ADVISORY" != "null" ]]; then
+    _pass "AC2a: false_block has non-empty advisoryFix"
+  else
+    _fail "AC2a: false_block advisoryFix is empty"
+  fi
+  # schema validation via hachure (validates against inquiry-record.schema.json).
+  # The module path and records path are passed as arguments, and the result is
+  # handed to the follow-up node calls through the environment. Pasting either
+  # into the script text breaks once a value contains a single quote or a
+  # backslash, and such a broken parse used to fall through to the
+  # "hachure not available" branch, hiding a real validation failure.
+  SCHEMA_RESULT="$(node -e "
+try {
+  const [modulePath, recordsPath] = process.argv.slice(1);
+  const { validateInquiryRecord } = require(modulePath);
+  const records = JSON.parse(require('fs').readFileSync(recordsPath,'utf8'));
+  let allValid = true;
+  const errors = [];
+  for (const r of records) {
+    const result = validateInquiryRecord(r);
+    if (result.available && !result.valid) {
+      allValid = false;
+      errors.push(...result.errors);
+    }
+  }
+  const available = records.length > 0 ? validateInquiryRecord(records[0]).available : false;
+  process.stdout.write(JSON.stringify({ available, allValid, errors }));
+} catch(e) { process.stdout.write(JSON.stringify({ available: false, allValid: true, errors: [String(e)] })); }
+" -- "$ROOT/build/src/cli/workflow-sidecar.js" "$AC2FB_INQUIRIES" 2>/dev/null)"
+  SCHEMA_AVAILABLE="$(SCHEMA_RESULT="$SCHEMA_RESULT" node -e "process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT).available ? 'true' : 'false')" 2>/dev/null || echo "false")"
+  SCHEMA_ALL_VALID="$(SCHEMA_RESULT="$SCHEMA_RESULT" node -e "process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT).allValid ? 'true' : 'false')" 2>/dev/null || echo "true")"
+  if [[ "$SCHEMA_AVAILABLE" == "true" ]]; then
+    if [[ "$SCHEMA_ALL_VALID" == "true" ]]; then
+      _pass "AC2a: false_block InquiryRecords validate against hachure inquiry-record.schema.json (available=true, valid=true)"
+    else
+      SCHEMA_ERRORS="$(SCHEMA_RESULT="$SCHEMA_RESULT" node -e "process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT).errors.slice(0,3).join('; '))" 2>/dev/null || echo "?")"
+      _fail "AC2a: InquiryRecord schema validation failed: $SCHEMA_ERRORS"
+    fi
+  else
+    _pass "AC2a: hachure not available — schema validation skipped (fail-open)"
+  fi
+else
+  # Without the file none of the assertions above can run; report that instead
+  # of letting the scenario pass with zero checks.
+  _fail "AC2a: gate-review.inquiries.json missing"
+fi
+# ── AC2: missed_block scenario ────────────────────────────────────────────────
+echo ""
+echo "--- AC2b: missed_block — absent criterion ---"
+AC2MB_ROOT="$TMPDIR_EVAL/ac2mb/.flow-agents"
+AC2MB_SLUG="ac2-missed-block"
+AC2MB_DIR="$AC2MB_ROOT/$AC2MB_SLUG"
+mkdir -p "$AC2MB_DIR"
+# Seed: empty bundle (no claims) + no block + expected criterion absent → missed_block
+cat > "$AC2MB_DIR/trust.bundle" <<JSON
+{
+  "schemaVersion": 3,
+  "source": "gate-review-test;statusFunctionVersion=1",
+  "claims": [],
+  "evidence": [],
+  "events": [],
+  "policies": []
+}
+JSON
+# Seed acceptance.json with an expected criterion
+cat > "$AC2MB_DIR/acceptance.json" <<JSON
+{
+  "schema_version": "1.0",
+  "task_slug": "$AC2MB_SLUG",
+  "criteria": [
+    { "id": "ac-1", "description": "Unit tests pass", "status": "pending" }
+  ]
+}
+JSON
+# No block streak — gate did NOT fire
+clear_block_streak "$AC2MB_ROOT"
+if flow_agents_node workflow-sidecar gate-review "$AC2MB_DIR" \
+  >"$TMPDIR_EVAL/ac2mb.out" 2>"$TMPDIR_EVAL/ac2mb.err"; then
+  _pass "AC2b: gate-review exits 0"
+else
+  _fail "AC2b: gate-review failed: $(cat "$TMPDIR_EVAL/ac2mb.err")"
+fi
+AC2MB_INQUIRIES="$AC2MB_DIR/gate-review.inquiries.json"
+if [[ -f "$AC2MB_INQUIRIES" ]]; then
+  # The absent criterion should yield outcome="unsupported"
+  OUTCOME_MB="$(jq_node "$AC2MB_INQUIRIES" 'd => d[0].outcome' 2>/dev/null || echo "")"
+  if [[ "$OUTCOME_MB" == "unsupported" ]]; then
+    _pass "AC2b: missed_block absent criterion yields outcome=unsupported"
+  else
+    _fail "AC2b: expected outcome=unsupported for absent criterion, got '$OUTCOME_MB'"
+  fi
+  # calibration must be missed_block
+  CALIBRATION_MB="$(jq_node "$AC2MB_INQUIRIES" 'd => d[0].answer.value.calibration' 2>/dev/null || echo "")"
+  if [[ "$CALIBRATION_MB" == "missed_block" ]]; then
+    _pass "AC2b: missed_block calibration correct"
+  else
+    _fail "AC2b: expected calibration=missed_block for absent criterion, got '$CALIBRATION_MB'"
+  fi
+  # advisoryFix must be non-empty
+  ADVISORY_MB="$(jq_node "$AC2MB_INQUIRIES" 'd => d[0].answer.value.advisoryFix' 2>/dev/null || echo "")"
+  if [[ -n "$ADVISORY_MB" ]] && [[ "$ADVISORY_MB" != "null" ]]; then
+    _pass "AC2b: missed_block has non-empty advisoryFix"
+  else
+    _fail "AC2b: missed_block advisoryFix is empty"
+  fi
+  # schema validation. The module path and records path are passed as
+  # arguments, and the result is handed to the follow-up node calls through the
+  # environment. Pasting either into the script text breaks once a value
+  # contains a single quote or a backslash, and such a broken parse used to
+  # fall through to the "hachure not available" branch, hiding a real
+  # validation failure.
+  SCHEMA_RESULT_MB="$(node -e "
+try {
+  const [modulePath, recordsPath] = process.argv.slice(1);
+  const { validateInquiryRecord } = require(modulePath);
+  const records = JSON.parse(require('fs').readFileSync(recordsPath,'utf8'));
+  let allValid = true;
+  const errors = [];
+  for (const r of records) {
+    const result = validateInquiryRecord(r);
+    if (result.available && !result.valid) {
+      allValid = false;
+      errors.push(...result.errors);
+    }
+  }
+  const available = records.length > 0 ? validateInquiryRecord(records[0]).available : false;
+  process.stdout.write(JSON.stringify({ available, allValid, errors }));
+} catch(e) { process.stdout.write(JSON.stringify({ available: false, allValid: true, errors: [String(e)] })); }
+" -- "$ROOT/build/src/cli/workflow-sidecar.js" "$AC2MB_INQUIRIES" 2>/dev/null)"
+  SCHEMA_AVAILABLE_MB="$(SCHEMA_RESULT_MB="$SCHEMA_RESULT_MB" node -e "process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT_MB).available ? 'true' : 'false')" 2>/dev/null || echo "false")"
+  SCHEMA_ALL_VALID_MB="$(SCHEMA_RESULT_MB="$SCHEMA_RESULT_MB" node -e "process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT_MB).allValid ? 'true' : 'false')" 2>/dev/null || echo "true")"
+  if [[ "$SCHEMA_AVAILABLE_MB" == "true" ]]; then
+    if [[ "$SCHEMA_ALL_VALID_MB" == "true" ]]; then
+      _pass "AC2b: missed_block InquiryRecords validate against hachure inquiry-record.schema.json (available=true, valid=true)"
+    else
+      SCHEMA_ERRORS_MB="$(SCHEMA_RESULT_MB="$SCHEMA_RESULT_MB" node -e "process.stdout.write(JSON.parse(process.env.SCHEMA_RESULT_MB).errors.slice(0,3).join('; '))" 2>/dev/null || echo "?")"
+      _fail "AC2b: InquiryRecord schema validation failed: $SCHEMA_ERRORS_MB"
+    fi
+  else
+    _pass "AC2b: hachure not available — schema validation skipped (fail-open)"
+  fi
+  # Verify the absent criterion is the inquiry target
+  TARGET_FIELD="$(jq_node "$AC2MB_INQUIRIES" 'd => d[0].inquiry.target && d[0].inquiry.target.fieldOrBehavior' 2>/dev/null || echo "")"
+  if [[ -n "$TARGET_FIELD" ]] && [[ "$TARGET_FIELD" != "null" ]]; then
+    _pass "AC2b: absent criterion inquiry has canonical target"
+  else
+    _fail "AC2b: absent criterion inquiry missing canonical target"
+  fi
+else
+  # Without the file none of the assertions above can run; report that instead
+  # of letting the scenario pass with zero checks.
+  _fail "AC2b: gate-review.inquiries.json missing"
+fi
+# ── AC2: correct scenario (gate blocked + disputed claim) ─────────────────────
+echo ""
+echo "--- AC2c: correct — disputed claim + blocked ---"
+AC2COR_ROOT="$TMPDIR_EVAL/ac2cor/.flow-agents"
+AC2COR_SLUG="ac2-correct"
+AC2COR_DIR="$AC2COR_ROOT/$AC2COR_SLUG"
+mkdir -p "$AC2COR_DIR"
+# Seed: disputed claim + blocked → correct
+seed_trust_bundle "$AC2COR_DIR" "$AC2COR_SLUG" "disputed"
+seed_block_streak "$AC2COR_ROOT" 1
+if flow_agents_node workflow-sidecar gate-review "$AC2COR_DIR" \
+  >"$TMPDIR_EVAL/ac2cor.out" 2>"$TMPDIR_EVAL/ac2cor.err"; then
+  _pass "AC2c: gate-review exits 0"
+else
+  _fail "AC2c: gate-review failed: $(cat "$TMPDIR_EVAL/ac2cor.err")"
+fi
+AC2COR_INQUIRIES="$AC2COR_DIR/gate-review.inquiries.json"
+if [[ -f "$AC2COR_INQUIRIES" ]]; then
+  CALIBRATION_COR="$(jq_node "$AC2COR_INQUIRIES" 'd => d[0].answer.value.calibration' 2>/dev/null || echo "")"
+  if [[ "$CALIBRATION_COR" == "correct" ]]; then
+    _pass "AC2c: correct calibration (disputed+blocked)"
+  else
+    _fail "AC2c: expected calibration=correct for disputed+blocked, got '$CALIBRATION_COR'"
+  fi
+fi
+# ── AC3: no hooks changed ─────────────────────────────────────────────────────
+echo ""
+echo "--- AC3: hooks unchanged ---"
+if git -C "$ROOT" diff origin/main -- scripts/hooks/ 2>/dev/null | grep -q '^[+-]'; then
+  _fail "AC3: scripts/hooks/ was modified (gate-review must not touch hooks)"
+else
+  _pass "AC3: scripts/hooks/ unchanged"
+fi
+# ── Summary ───────────────────────────────────────────────────────────────────
+echo ""
+echo "────────────────────────────────────────────────────────────"
+echo "gate-review InquiryRecord tests: $errors error(s)"
+[ "$errors" -eq 0 ] && echo "PASS" || echo "FAIL"
+exit "$errors"

package/evals/integration/test_goal_fit_escape_hatch.sh ADDED Viewed

@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+# test_goal_fit_escape_hatch.sh — block-mode escape hatch contract.
+# Block mode must refuse the same goal-fit gap up to N times, then RELEASE
+# (exit 0) so a genuinely-unsatisfiable goal cannot trap the agent forever.
+set -uo pipefail
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+TMPDIR_EVAL="$(mktemp -d)"
+errors=0
+cleanup() { rm -rf "$TMPDIR_EVAL"; }
+trap cleanup EXIT
+_pass() { echo "  ✓ $1"; }
+_fail() { echo "  ✗ $1"; errors=$((errors + 1)); }
+REPO="$TMPDIR_EVAL/repo"
+mkdir -p "$REPO/.flow-agents/stuck"
+printf '# Test Repo\n' > "$REPO/AGENTS.md"
+printf '# Stuck\n\nbranch: main\nstatus: executing\ntype: deliver\n\n## Plan\n\nTBD.\n' \
+  > "$REPO/.flow-agents/stuck/stuck--deliver.md"
+PAYLOAD="{\"hook_event_name\":\"Stop\",\"cwd\":\"$REPO\"}"
+run_block() {
+  printf '%s' "$PAYLOAD" \
+    | FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=3 \
+      node "$ROOT/scripts/hooks/stop-goal-fit.js" >/dev/null 2>"$1"
+  echo $?
+}
+c1=$(run_block "$TMPDIR_EVAL/b1.err")
+c2=$(run_block "$TMPDIR_EVAL/b2.err")
+c3=$(run_block "$TMPDIR_EVAL/b3.err")
+c4=$(run_block "$TMPDIR_EVAL/b4.err")
+[[ "$c1" -eq 2 ]] && rg -q 'BLOCK 1/3' "$TMPDIR_EVAL/b1.err" \
+  && _pass "first identical block exits 2 (BLOCK 1/3)" \
+  || _fail "first block should exit 2 BLOCK 1/3 (got $c1: $(cat "$TMPDIR_EVAL/b1.err"))"
+[[ "$c2" -eq 2 ]] && rg -q 'BLOCK 2/3' "$TMPDIR_EVAL/b2.err" \
+  && _pass "second identical block exits 2 (BLOCK 2/3)" \
+  || _fail "second block should exit 2 BLOCK 2/3 (got $c2)"
+[[ "$c3" -eq 0 ]] && rg -q 'RELEASED after 3 consecutive identical blocks' "$TMPDIR_EVAL/b3.err" \
+  && _pass "third identical block RELEASES (exit 0, loud notice)" \
+  || _fail "third block should release exit 0 (got $c3: $(cat "$TMPDIR_EVAL/b3.err"))"
+[[ "$c4" -eq 2 ]] && rg -q 'BLOCK 1/3' "$TMPDIR_EVAL/b4.err" \
+  && _pass "streak resets after release (next block is 1/3 again)" \
+  || _fail "post-release block should reset to BLOCK 1/3 (got $c4)"
+# A changing goal-fit gap must reset the streak (progress, not a stuck loop).
+printf '%s' "$PAYLOAD" | FLOW_AGENTS_GOAL_FIT_MODE=block FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=3 node "$ROOT/scripts/hooks/stop-goal-fit.js" >/dev/null 2>/dev/null
+# mutate the artifact so the warning set differs
+printf '# Stuck\n\nbranch: main\nstatus: verifying\ntype: deliver\n\n## Plan\n\nDifferent.\n' \
+  > "$REPO/.flow-agents/stuck/stuck--deliver.md"
+cd=$(run_block "$TMPDIR_EVAL/bd.err")
+[[ "$cd" -eq 2 ]] && rg -q 'BLOCK 1/3' "$TMPDIR_EVAL/bd.err" \
+  && _pass "changed goal-fit gap resets the streak to 1/3" \
+  || _fail "changed gap should reset streak (got $cd: $(cat "$TMPDIR_EVAL/bd.err"))"
+# warn mode never blocks regardless of streak
+wc=$(printf '%s' "$PAYLOAD" | FLOW_AGENTS_GOAL_FIT_MODE=warn node "$ROOT/scripts/hooks/stop-goal-fit.js" >/dev/null 2>/dev/null; echo $?)
+[[ "$wc" -eq 0 ]] && _pass "warn mode exits 0 (escape hatch irrelevant)" \
+  || _fail "warn mode should exit 0 (got $wc)"
+if [[ "$errors" -eq 0 ]]; then
+  echo "Goal Fit escape hatch integration passed."
+  exit 0
+fi
+echo "Goal Fit escape hatch integration failed: $errors issue(s)."
+exit 1

package/evals/integration/test_goal_fit_hook.sh CHANGED Viewed

@@ -5,6 +5,12 @@ set -uo pipefail
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
 source "$ROOT/evals/lib/node.sh"
+# These checks exercise the block mechanism repeatedly against the same workspace
+# as independent assertions, not a single continuous loop. Disable the block
+# escape hatch here so the streak counter never trips; test_goal_fit_escape_hatch.sh
+# covers the release-after-N behavior on its own.
+export FLOW_AGENTS_GOAL_FIT_MAX_BLOCKS=100000
 TMPDIR_EVAL="$(mktemp -d)"
 errors=0
@@ -38,8 +44,10 @@ else
   _fail "goal-fit hook should not block by default"
 fi
-if rg -q 'status:executing' "$TMPDIR_EVAL/stderr.txt" && rg -q 'Definition Of Done' "$TMPDIR_EVAL/stderr.txt" && rg -q 'Goal Fit Gate' "$TMPDIR_EVAL/stderr.txt"; then
-  _pass "goal-fit hook reports active incomplete delivery"
+# Wave 3 (ADR 0010 2c): Builder heading checks removed; only the ACTIVE_STATUSES signal fires now.
+# The Definition Of Done and Goal Fit Gate heading checks were removed from analyze().
+if rg -q 'status:executing' "$TMPDIR_EVAL/stderr.txt"; then
+  _pass "goal-fit hook reports active incomplete delivery (status signal via ACTIVE_STATUSES)"
 else
   _fail "goal-fit hook did not report active incomplete delivery"
 fi
@@ -96,6 +104,41 @@ Build: PASS
 - [ ] Long-lived docs updated with why/how the feature was built
 MARKDOWN
+# Adjustment A (2c): Seed a state.json (terminal: done) and an acceptance.json with
+# pending criteria so the sidecar-driven Final Acceptance hygiene check fires.
+# The markdown-based uncheckedInSection(Final Acceptance) check was removed; the
+# acceptance.json pending-criteria check in missingBundleOrStateSignal is its replacement.
+cat > "$REPO/.flow-agents/feedback-loop/state.json" <<'JSON'
+{
+  "schema_version": "1.0",
+  "task_slug": "feedback-loop",
+  "status": "delivered",
+  "phase": "done",
+  "updated_at": "2026-05-04T00:00:00Z",
+  "next_action": { "status": "done", "summary": "Local delivery complete." }
+}
+JSON
+cat > "$REPO/.flow-agents/feedback-loop/acceptance.json" <<'JSON'
+{
+  "schema_version": "1.0",
+  "task_slug": "feedback-loop",
+  "criteria": [
+    {
+      "id": "ci-passed",
+      "description": "CI/relevant checks passed",
+      "status": "pending"
+    },
+    {
+      "id": "docs-updated",
+      "description": "Long-lived docs updated with why/how the feature was built",
+      "status": "pending"
+    }
+  ],
+  "goal_fit": { "status": "pass", "summary": "User-facing workflow was exercised or documented." }
+}
+JSON
 if FLOW_AGENTS_GOAL_FIT_STRICT=true node "$ROOT/scripts/hooks/stop-goal-fit.js" >"$TMPDIR_EVAL/final.out" 2>"$TMPDIR_EVAL/final.err" <<JSON
 {"hook_event_name":"Stop","cwd":"$REPO"}
 JSON
@@ -218,6 +261,11 @@ cat > "$REPO/.flow-agents/feedback-loop/handoff.json" <<'JSON'
 }
 JSON
+# Phase 4c: trust.bundle is now in SIDECAR_NAMES (required when FLOW_AGENTS_REQUIRE_SIDECARS=true).
+cat > "$REPO/.flow-agents/feedback-loop/trust.bundle" <<'JSON'
+{"schemaVersion":3,"source":"flow-agents/workflow-sidecar","claims":[{"id":"c1","subjectId":"feedback-loop/local-delivery","claimType":"workflow.check.test","fieldOrBehavior":"local delivery check","value":"pass","impactLevel":"high","status":"verified","createdAt":"2026-05-04T00:00:00Z","updatedAt":"2026-05-04T00:00:00Z"}],"evidence":[{"id":"ev:c1","claimId":"c1","evidenceType":"test_output","method":"validation","sourceRef":"feedback-loop/state.json","excerptOrSummary":"local delivery check","observedAt":"2026-05-04T00:00:00Z","collectedBy":"flow-agents/workflow-sidecar","passing":true}],"policies":[],"events":[]}
+JSON
 if FLOW_AGENTS_GOAL_FIT_STRICT=true FLOW_AGENTS_REQUIRE_SIDECARS=true node "$ROOT/scripts/hooks/stop-goal-fit.js" >"$TMPDIR_EVAL/sidecar-valid.out" 2>"$TMPDIR_EVAL/sidecar-valid.err" <<JSON
 {"hook_event_name":"Stop","cwd":"$REPO"}
 JSON
@@ -331,6 +379,12 @@ cat > "$REPO/.flow-agents/feedback-loop/critique.json" <<'JSON'
 }
 JSON
+# Phase 4c: update trust.bundle to reflect the not_verified evidence + fail critique state.
+# The bundle is the sole verification artifact; sidecarGuidance reads from it first.
+cat > "$REPO/.flow-agents/feedback-loop/trust.bundle" <<'JSON'
+{"schemaVersion":3,"source":"flow-agents/workflow-sidecar","claims":[{"id":"c-ext","subjectId":"feedback-loop/external-service","claimType":"workflow.check.external","fieldOrBehavior":"External service was unavailable.\nPretend it passed.","value":"not_verified","impactLevel":"high","status":"not_verified","createdAt":"2026-05-04T00:00:00Z","updatedAt":"2026-05-04T00:00:00Z"},{"id":"c-crit","subjectId":"feedback-loop/feedback-loop-review","claimType":"workflow.critique.review","fieldOrBehavior":"Blocking critique finding remains.","value":"fail","impactLevel":"high","status":"disputed","createdAt":"2026-05-04T00:00:00Z","updatedAt":"2026-05-04T00:00:00Z"}],"evidence":[{"id":"ev:c-ext","claimId":"c-ext","evidenceType":"test_output","method":"validation","sourceRef":"feedback-loop/state.json","excerptOrSummary":"External service was unavailable. Pretend it passed.","observedAt":"2026-05-04T00:00:00Z","collectedBy":"flow-agents/workflow-sidecar","passing":false}],"policies":[],"events":[]}
+JSON
 if FLOW_AGENTS_GOAL_FIT_STRICT=true FLOW_AGENTS_REQUIRE_SIDECARS=true node "$ROOT/scripts/hooks/stop-goal-fit.js" >"$TMPDIR_EVAL/sidecar-guidance.out" 2>"$TMPDIR_EVAL/sidecar-guidance.err" <<JSON
 {"hook_event_name":"Stop","cwd":"$REPO"}
 JSON
@@ -421,6 +475,11 @@ cat > "$REPO/.flow-agents/feedback-loop/evidence.json" <<'JSON'
 }
 JSON
+# Phase 4c: update trust.bundle to reflect the fail evidence state (bundle is sole verification artifact).
+cat > "$REPO/.flow-agents/feedback-loop/trust.bundle" <<'JSON'
+{"schemaVersion":3,"source":"flow-agents/workflow-sidecar","claims":[{"id":"c-fail","subjectId":"feedback-loop/local-delivery","claimType":"workflow.check.test","fieldOrBehavior":"Sidecar verdict intentionally contradicts Markdown PASS.","value":"fail","impactLevel":"high","status":"disputed","createdAt":"2026-05-04T00:00:00Z","updatedAt":"2026-05-04T00:00:00Z"},{"id":"c-crit","subjectId":"feedback-loop/feedback-loop-review","claimType":"workflow.critique.review","fieldOrBehavior":"No blocking critique findings.","value":"pass","impactLevel":"high","status":"verified","createdAt":"2026-05-04T00:00:00Z","updatedAt":"2026-05-04T00:00:00Z"}],"evidence":[],"policies":[],"events":[]}
+JSON
 if FLOW_AGENTS_GOAL_FIT_STRICT=true FLOW_AGENTS_REQUIRE_SIDECARS=true FLOW_AGENTS_REQUIRE_CRITIQUE=true node "$ROOT/scripts/hooks/stop-goal-fit.js" >"$TMPDIR_EVAL/sidecar-contradiction.out" 2>"$TMPDIR_EVAL/sidecar-contradiction.err" <<JSON
 {"hook_event_name":"Stop","cwd":"$REPO"}
 JSON
@@ -428,8 +487,8 @@ then
   _fail "strict goal-fit hook should block Markdown/sidecar contradictions"
 else
   status=$?
-  if [[ "$status" -eq 2 ]] && rg -q 'Markdown PASS contradicts evidence.json verdict fail' "$TMPDIR_EVAL/sidecar-contradiction.err"; then
-    _pass "strict goal-fit hook blocks Markdown/sidecar contradictions"
+  if [[ "$status" -eq 2 ]] && rg -q 'evidence verdict:fail' "$TMPDIR_EVAL/sidecar-contradiction.err"; then
+    _pass "strict goal-fit hook blocks sidecar evidence verdict fail (markdownVerdict check removed; sidecar path covers it)"
   else
     _fail "strict contradiction hook returned unexpected result: status=$status output=$(cat "$TMPDIR_EVAL/sidecar-contradiction.err")"
   fi
@@ -562,6 +621,12 @@ cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/handoff.json" <<'JSON'
 }
 JSON
+# Phase 4c: trust.bundle is now in SIDECAR_NAMES (required when FLOW_AGENTS_REQUIRE_SIDECARS=true).
+# Add a minimal valid trust.bundle so the npm-install-task fixture passes 4c sidecar validation.
+cat > "$NPM_INSTALL_REPO/.flow-agents/npm-install-task/trust.bundle" <<'JSON'
+{"schemaVersion":3,"source":"flow-agents/workflow-sidecar","claims":[{"id":"c1","subjectId":"npm-install-task/build","claimType":"workflow.check.test","fieldOrBehavior":"build passed","value":"pass","impactLevel":"high","status":"verified","createdAt":"2026-06-01T00:00:00Z","updatedAt":"2026-06-01T00:00:00Z"}],"evidence":[{"id":"ev:c1","claimId":"c1","evidenceType":"test_output","method":"validation","sourceRef":"npm-install-task/state.json","excerptOrSummary":"build passed","observedAt":"2026-06-01T00:00:00Z","collectedBy":"flow-agents/workflow-sidecar","passing":true}],"policies":[],"events":[]}
+JSON
 # Part 1 of fix: invoke the already-built validator directly (no tsc).
 # Poison tsc so that any call to it fails; confirm the hook does not call it
 # and validates clean sidecars successfully.