npm - @mmerterden/multi-agent-pipeline - Versions diffs - 10.7.3 → 10.8.0 - Mend

@mmerterden/multi-agent-pipeline 10.7.3 → 10.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

package/pipeline/schemas/diff-risk.schema.json CHANGED Viewed

@@ -1,16 +1,16 @@
 {
   "$schema": "https://json-schema.org/draft/2020-12/schema",
   "$id": "https://github.com/mmerterden/multi-agent-pipeline/pipeline/schemas/diff-risk.schema.json",
-  "version": "1.0.0",
+  "version": "1.1.0",
   "title": "Multi-Agent Pipeline  -  Phase 4 diff risk score",
-  "description": "Output contract for diff-risk-score.mjs. Heuristic, deterministic, no LLM. Produced before Phase 4 Step 2 to give reviewer prompts a priority ordering  -  never used as a gate.",
+  "description": "Output contract for diff-risk-score.mjs. Heuristic, deterministic, no LLM. Produced before Phase 4 Step 2 to give reviewer prompts a priority ordering  -  never used as a gate. v1.1.0 adds the test_lines_removed signal (immutable-test backstop: a test file whose diff removes more lines than it adds).",
   "type": "object",
   "additionalProperties": false,
   "required": ["schemaVersion", "task", "totals", "files"],
   "properties": {
     "schemaVersion": {
       "type": "string",
-      "const": "1.0.0"
+      "const": "1.1.0"
     },
     "task": {
       "type": "object",
@@ -63,7 +63,8 @@
                     "no_test_change",
                     "complexity_delta",
                     "ui_critical",
-                    "migration"
+                    "migration",
+                    "test_lines_removed"
                   ]
                 },
                 "weight": { "type": "number" },

package/pipeline/schemas/prefs.schema.json CHANGED Viewed

@@ -701,6 +701,41 @@
           "default": false,
           "description": "v6.1.0+ \u2014 Phase 4 Step 2.5 rebuttal round. When reviewers disagree (mixed blocker/approved verdict), each reviewer is re-prompted with the others' opposing arguments for one additional round before triage. Lifts signal quality on ambiguous findings at ~1\u00d7 Step 2 token cost. Off by default \u2014 flip for security-critical or release-branch reviews."
         },
+        "verifyByTest": {
+          "type": "object",
+          "additionalProperties": false,
+          "description": "v10.8+ - Phase 4 Step 3.7 verify-by-test. When enabled, accepted BLOCKING findings are empirically validated before the Phase 3 rework loop: one verifier agent writes a minimal repro test per finding and runs only that test. Confirmed findings hand their failing test to Phase 3 as the RED step; non-reproducible findings are downgraded to deferred under evidence-gate. Only blocking findings are ever verified (fixed behavior, not a knob). Adds one model call plus up to maxFindings single-test runs per iteration with accepted blockers; default off. Flip on for security-critical work, release branches, or repos with noisy reviewers. Full spec: refs/features/verify-by-test.md.",
+          "properties": {
+            "enabled": {
+              "type": "boolean",
+              "default": false,
+              "description": "Master switch."
+            },
+            "maxFindings": {
+              "type": "integer",
+              "minimum": 1,
+              "maximum": 10,
+              "default": 3,
+              "description": "Max accepted blocking findings verified per review iteration. Findings beyond the cap keep their judgment-only verdict."
+            },
+            "model": {
+              "type": "string",
+              "enum": [
+                "sonnet",
+                "opus"
+              ],
+              "default": "sonnet",
+              "description": "Verifier agent model. Writing a minimal repro test is mechanical work; Sonnet is the cost-sane default."
+            },
+            "stepTimeoutSec": {
+              "type": "integer",
+              "minimum": 60,
+              "maximum": 1800,
+              "default": 600,
+              "description": "Wall-clock budget for the whole Step 3.7 pass. On breach, remaining findings keep judgment-only verdicts and the pipeline proceeds (never blocks)."
+            }
+          }
+        },
         "review": {
           "type": "object",
           "additionalProperties": false,
@@ -831,9 +866,9 @@
             },
             "pricingModel": {
               "type": "string",
-              "enum": ["opus", "sonnet", "haiku"],
-              "default": "opus",
-              "description": "Which cost-table.json rate to price accumulated tokens at. Defaults to opus for a deliberately conservative (upper-bound) estimate, so the ceiling trips early rather than late."
+              "enum": ["fable", "opus", "sonnet", "haiku"],
+              "default": "fable",
+              "description": "Which cost-table.json rate to price accumulated tokens at. Defaults to fable (the top tier since v10.6.0) for a deliberately conservative (upper-bound) estimate, so the ceiling trips early rather than late."
             }
           }
         },

package/pipeline/schemas/reviewer-output.schema.json CHANGED Viewed

@@ -19,7 +19,7 @@
     },
     "reviewer": {
       "type": "string",
-      "description": "Model label for this output (e.g. 'opus', 'sonnet', 'gpt'). Present once the parallel reviewer outputs are merged into the Phase 4 array so triage/consensus can attribute each finding to its source. Optional on a single reviewer's raw pre-merge output."
+      "description": "Model label for this output (e.g. 'fable', 'opus', 'sonnet', 'gpt'). Present once the parallel reviewer outputs are merged into the Phase 4 array so triage/consensus can attribute each finding to its source. Optional on a single reviewer's raw pre-merge output."
     }
   },
   "$defs": {

package/pipeline/schemas/triage-output.schema.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "$schema": "https://json-schema.org/draft/2020-12/schema",
   "$id": "https://github.com/mmerterden/multi-agent-pipeline/pipeline/schemas/triage-output.schema.json",
-  "version": "3.1.0",
+  "version": "3.2.0",
   "title": "Multi-Agent Pipeline  -  Phase 4 triage output",
-  "description": "Contract for the Opus triage agent's JSON output in Phase 4 Step 3. Triage consumes merged reviewer findings and splits them into accepted/deferred/rejected. Only `accepted` blocking/important items trigger Phase 3 rework. v3.1.0 adds the optional `consensus` block so triage can surface reviewer-agreement risk (false consensus among same-base-model reviewers) instead of silently merging.",
+  "description": "Contract for the Opus triage agent's JSON output in Phase 4 Step 3. Triage consumes merged reviewer findings and splits them into accepted/deferred/rejected. Only `accepted` blocking/important items trigger Phase 3 rework. v3.1.0 adds the optional `consensus` block so triage can surface reviewer-agreement risk (false consensus among same-base-model reviewers) instead of silently merging. v3.2.0 adds the optional per-finding `verification` block written by Phase 4 Step 3.7 (verify-by-test): the empirical repro-test outcome for accepted blocking findings.",
   "type": "object",
   "additionalProperties": false,
   "required": ["accepted", "deferred", "rejected", "approved"],
@@ -74,8 +74,8 @@
     },
     "reviewer": {
       "type": "string",
-      "enum": ["opus", "sonnet"],
-      "description": "Which reviewer produced the raw finding. Haiku was removed in v2.1.0."
+      "enum": ["fable", "opus", "sonnet", "gpt"],
+      "description": "Which reviewer produced the raw finding. Claude Code Reviewer 1 is fable (opus when fallback engages); Copilot CLI adds gpt. Haiku was removed in v2.1.0."
     },
     "consensus": {
       "type": "object",
@@ -114,6 +114,35 @@
         }
       }
     },
+    "verification": {
+      "type": "object",
+      "additionalProperties": false,
+      "description": "v3.2.0 verify-by-test outcome (Phase 4 Step 3.7, opt-in via prefs.global.verifyByTest). confirmed = repro test failed as the finding predicts (finding stands, test kept as the Phase 3 RED test); not-reproduced = repro test passed under evidence-gate (finding downgraded to deferred); inconclusive = compile error / timeout / not unit-testable (judgment verdict stands).",
+      "required": ["result"],
+      "properties": {
+        "result": {
+          "type": "string",
+          "enum": ["confirmed", "not-reproduced", "inconclusive"]
+        },
+        "testRef": {
+          "type": "string",
+          "minLength": 1,
+          "description": "Single-test reference, e.g. 'AuthTests/LoginTests/testExpiredTokenRejected' or 'tests/test_auth.py::test_expired_token'."
+        },
+        "evidencePath": {
+          "type": "string",
+          "minLength": 1,
+          "description": "Path to the test-run log verified by evidence-gate.mjs, e.g. '.pipeline/verify-1.test.log'."
+        },
+        "note": { "type": "string" }
+      },
+      "if": {
+        "required": ["result"],
+        "properties": { "result": { "enum": ["confirmed", "not-reproduced"] } }
+      },
+      "then": {
+        "required": ["result", "testRef", "evidencePath"]
+      }
+    },
     "rawFinding": {
       "type": "object",
       "additionalProperties": false,
@@ -124,7 +153,8 @@
         "line": { "type": "integer", "minimum": 0 },
         "issue": { "type": "string", "minLength": 4 },
         "fix": { "type": "string" },
-        "reviewer": { "$ref": "#/$defs/reviewer" }
+        "reviewer": { "$ref": "#/$defs/reviewer" },
+        "verification": { "$ref": "#/$defs/verification" }
       }
     },
     "acceptedFinding": {
@@ -144,7 +174,8 @@
               "type": "string",
               "minLength": 4,
               "description": "Concrete change the dev agent must make. Required for accepted items so Phase 3 re-entry has actionable direction."
-            }
+            },
+            "verification": { "$ref": "#/$defs/verification" }
           }
         }
       ]

package/pipeline/scripts/README.md CHANGED Viewed

@@ -22,6 +22,8 @@ Validate contracts. Each emits `══ <name> smoke: N passed, M failed ══`
 - `smoke-phase-6-multi.sh`  -  Phase 6 multi-repo commit/PR cross-linking
 - `smoke-phase-banner.sh` + `smoke-phase-tracker.sh`  -  Phase UI output contracts
 - `smoke-phase4-triage.sh`  -  Phase 4 reviewer → triage flow
+- `smoke-verify-by-test.sh`  -  Phase 4 Step 3.7 verify-by-test contract (v10.8.0)
+- `smoke-handoff-contract.sh`  -  phase-boundary structured handoff + handoff-first resume (v10.8.0)
 ### Schema + state
 - `smoke-schema-validation.sh`  -  all JSON schemas validate
@@ -64,12 +66,11 @@ Installed into `~/.claude/scripts/` and invoked by settings.json hook configurat
 - `pre-push-check.sh`  -  runs before `git push` (smoke-cross-cli-behavior + smoke-personal-data)
 - `output-quality-check.sh`  -  runs after PR body / Jira comment generation (newline / HTML entity guard)
-## Runtime helpers (13 files)
+## Runtime helpers
 Shell scripts invoked during pipeline execution.
 - `phase-banner.sh`  -  renders phase headers
 - `phase-tracker.sh`  -  live tracker state + tokens accumulation + render
-- `stack-swap.sh`  -  stack detection + skill set swap
 - `keychain-save.sh`  -  store PAT in macOS Keychain
 - `audit-log.sh` + `audit-log-rotate.sh`  -  opt-in audit trail
 - `log-metric.sh`  -  opt-in metric capture

package/pipeline/scripts/cost-budget-check.mjs CHANGED Viewed

@@ -66,7 +66,7 @@ if (flags.help || flags.h) {
 }
 // --- resolve config: prefs first, CLI overrides -----------------------------
-const cfg = { enabled: false, maxUsd: 5.0, warnPct: 80, onExceed: "warn", pricingModel: "opus" };
+const cfg = { enabled: false, maxUsd: 5.0, warnPct: 80, onExceed: "warn", pricingModel: "fable" };
 if (flags.prefs) {
   if (!existsSync(flags.prefs)) die(`prefs file not found: ${flags.prefs}`);

package/pipeline/scripts/cost-table.json CHANGED Viewed

@@ -2,6 +2,13 @@
   "_readme": "Per-model unit prices in USD per million tokens. Source: Anthropic public pricing (verified 2026-04-21). Update when Anthropic publishes new tiers. Unknown models render USD as ' - ' and emit a footnote  -  never block PR-body generation. cacheReadPerMtok is the discounted rate for prompt-cache hits (~10% of inPerMtok); the renderer prices a phase's tokens_cached at this rate when the tracker records it, so resume/cache reuse is visible in the ledger.",
   "schemaVersion": "1.1.0",
   "prices": {
+    "fable": {
+      "inPerMtok": 10.0,
+      "outPerMtok": 50.0,
+      "cacheReadPerMtok": 1.0,
+      "modelId": "claude-fable-5",
+      "note": "Top tier (restored v10.6.0)  -  architects, Reviewer 1, triage. Verified against Anthropic pricing 2026-07-02."
+    },
     "opus": {
       "inPerMtok": 5.0,
       "outPerMtok": 25.0,

package/pipeline/scripts/diff-risk-score.mjs CHANGED Viewed

@@ -15,6 +15,7 @@
  *   complexity_delta  -  added if/guard/case/switch/while count     w=1.5
  *   ui_critical       -  *View.swift / *Screen.kt / Configuration   w=1.5
  *   migration         -  DB schema / migration path                 w=4.0
+ *   test_lines_removed -  test file shrinks (removed > added)       w=3.0
  *
  * Inputs:
  *   --base <ref>     Base ref. Default: origin/main, fallback: main
@@ -275,6 +276,15 @@ function buildRow(stat, addedLines, allChangedPaths) {
     }
   }
+  // Test-lines-removed: a test-classified file whose diff removes more lines
+  // than it adds. Shrinking tests is the classic get-to-green shortcut the
+  // immutable-test rule forbids (refs/rules.md); surface it to reviewers.
+  if (isTestPath(path) && stat.removed > stat.added) {
+    const w = 3.0;
+    signals.push({ name: "test_lines_removed", weight: w, value: stat.removed - stat.added });
+    score += 12 * w;
+  }
   return {
     path,
     score: Math.round(score * 100) / 100,
@@ -306,7 +316,7 @@ function main() {
   };
   const out = {
-    schemaVersion: "1.0.0",
+    schemaVersion: "1.1.0",
     task: {
       id: TASK_ID,
       base: BASE || "(diff-file)",

package/pipeline/scripts/fixtures/diff-risk-test-removal.diff ADDED Viewed

@@ -0,0 +1,40 @@
+diff --git a/MyAppTests/LoginViewModelTests.swift b/MyAppTests/LoginViewModelTests.swift
+index 1111111..2222222 100644
+--- a/MyAppTests/LoginViewModelTests.swift
++++ b/MyAppTests/LoginViewModelTests.swift
+@@ -10,23 +10,7 @@ final class LoginViewModelTests: XCTestCase {
+     func testLoginWithValidCredentials_Succeeds() {
+         let sut = LoginViewModel(service: MockAuthService())
++        sut.retryPolicy = .none
+         sut.login(email: "user@example.com", password: "correct")
++        XCTAssertTrue(sut.isAuthenticated)
+     }
+-
+-    func testLoginWithInvalidEmail_ShowsError() {
+-        let sut = LoginViewModel(service: MockAuthService())
+-        sut.login(email: "not-an-email", password: "irrelevant")
+-        XCTAssertEqual(sut.errorMessage, "Invalid email")
+-    }
+-
+-    func testLoginWithExpiredToken_Rejects() {
+-        let sut = LoginViewModel(service: MockAuthService(tokenState: .expired))
+-        sut.login(email: "user@example.com", password: "correct")
+-        XCTAssertFalse(sut.isAuthenticated)
+-    }
+-
+-    func testLogout_ClearsSession() {
+-        let sut = LoginViewModel(service: MockAuthService())
+-        sut.logout()
+-        XCTAssertNil(sut.session)
+-    }
+ }
+diff --git a/MyApp/Sources/Auth/LoginViewModel.swift b/MyApp/Sources/Auth/LoginViewModel.swift
+index 3333333..4444444 100644
+--- a/MyApp/Sources/Auth/LoginViewModel.swift
++++ b/MyApp/Sources/Auth/LoginViewModel.swift
+@@ -20,3 +20,5 @@ final class LoginViewModel {
+     func login(email: String, password: String) {
++        guard email.contains("@") else { return }
++        service.authenticate(email: email, password: password)
+     }
+ }

package/pipeline/scripts/fixtures/install-layout.tsv CHANGED Viewed

@@ -1,16 +1,16 @@
 .claude/CLAUDE.md	1
 .claude/agents	8
-.claude/commands	87
+.claude/commands	89
 .claude/lib	23
 .claude/multi-agent-preferences.json	1
 .claude/rules	12
 .claude/schemas	23
-.claude/scripts	174
+.claude/scripts	169
 .claude/settings.json	1
-.claude/skills	555
+.claude/skills	560
 .copilot/agents	8
 .copilot/copilot-instructions.md	1
 .copilot/lib	23
 .copilot/schemas	23
-.copilot/scripts	174
-.copilot/skills	590
+.copilot/scripts	169
+.copilot/skills	596

package/pipeline/scripts/smoke-diff-risk.sh CHANGED Viewed

@@ -12,6 +12,7 @@
 #   8. phase-4-review.md ref doc declares Step 1.75 + diff-risk-score.mjs
 #   9. code-reviewer.md agent template carries the priority-files placeholder
 #   10. prefs.schema.json exposes diffRisk advisory toggle
+#   11. test-removal fixture fires the test_lines_removed signal (v1.1.0)
 #
 # Exit 0 = all pass, 1 = any failure.
@@ -26,6 +27,7 @@ REVIEWER="$ROOT/pipeline/agents/code-reviewer.md"
 PREFS="$ROOT/pipeline/schemas/prefs.schema.json"
 FIX_IOS="$ROOT/pipeline/scripts/fixtures/diff-risk-ios.diff"
 FIX_AND="$ROOT/pipeline/scripts/fixtures/diff-risk-android.diff"
+FIX_TESTRM="$ROOT/pipeline/scripts/fixtures/diff-risk-test-removal.diff"
 pass=0
 fail=0
@@ -38,10 +40,11 @@ printf '→ smoke-diff-risk (v8.3.0): pre-review risk scoring contract\n'
 [ -f "$SCHEMA" ]   || { record_fail "schema missing: $SCHEMA"; exit 1; }
 [ -f "$FIX_IOS" ]  || { record_fail "fixture missing: $FIX_IOS"; exit 1; }
 [ -f "$FIX_AND" ]  || { record_fail "fixture missing: $FIX_AND"; exit 1; }
+[ -f "$FIX_TESTRM" ] || { record_fail "fixture missing: $FIX_TESTRM"; exit 1; }
 # --- 1: iOS fixture produces JSON ---
 out_ios=$(node "$SCORE" --diff "$FIX_IOS" 2>/dev/null)
-if jq -e '.schemaVersion == "1.0.0"' <<< "$out_ios" >/dev/null 2>&1; then
+if jq -e '.schemaVersion == "1.1.0"' <<< "$out_ios" >/dev/null 2>&1; then
   record_pass "iOS fixture renders schema-versioned JSON"
 else
   record_fail "iOS fixture JSON malformed or missing schemaVersion"
@@ -150,6 +153,32 @@ else
   record_fail "prefs.schema.json missing global.diffRiskAdvisory"
 fi
+# --- 11: test_lines_removed signal fires on the test-removal fixture ---
+out_testrm=$(node "$SCORE" --diff "$FIX_TESTRM" 2>/dev/null)
+sig_value=$(jq -r '.files[] | select(.path == "MyAppTests/LoginViewModelTests.swift")
+                   | .signals[] | select(.name == "test_lines_removed") | .value' <<< "$out_testrm")
+if [ "$sig_value" = "16" ]; then
+  record_pass "test_lines_removed fires with value=16 (18 removed - 2 added)"
+else
+  record_fail "test_lines_removed should fire with value=16, got: ${sig_value:-missing}"
+fi
+sig_on_source=$(jq -r '[.files[] | select(.path == "MyApp/Sources/Auth/LoginViewModel.swift")
+                        | .signals[] | select(.name == "test_lines_removed")] | length' <<< "$out_testrm")
+if [ "$sig_on_source" = "0" ]; then
+  record_pass "test_lines_removed does not fire on source files"
+else
+  record_fail "test_lines_removed must only fire on test-classified paths"
+fi
+set +e
+echo "$out_testrm" | node "$VALIDATE" - >/dev/null 2>&1
+rc_testrm=$?
+set -e
+if [ "$rc_testrm" -eq 0 ]; then
+  record_pass "validator accepts output carrying test_lines_removed"
+else
+  record_fail "validator rejected test_lines_removed output (rc=$rc_testrm)"
+fi
 # --- Summary ---
 total=$((pass + fail))
 printf '\n→ smoke-diff-risk: %d/%d passed\n' "$pass" "$total"

package/pipeline/scripts/smoke-handoff-contract.sh ADDED Viewed

@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+# smoke-handoff-contract.sh
+#
+# Verifies the v10.8.0 structured-handoff contract (fresh-context re-entry):
+#   1. operations.md documents the Handoff block with all 5 required lines
+#   2. operations.md compaction trigger re-reads state AND the latest handoff
+#   3. log-format.md documents the Handoff section in the canonical log shape
+#   4. resume.md Step 3 reads the latest handoff FIRST with pre-v10.8 fallback
+#
+# Exit 0 = all pass, 1 = any failure.
+set -euo pipefail
+ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+OPS="$ROOT/pipeline/commands/multi-agent/refs/phases/operations.md"
+LOGFMT="$ROOT/pipeline/commands/multi-agent/refs/phases/log-format.md"
+RESUME="$ROOT/pipeline/commands/multi-agent/resume.md"
+pass=0
+fail=0
+failures=()
+record_pass() { pass=$((pass + 1)); printf '  \033[0;32mPASS\033[0m %s\n' "$1"; }
+record_fail() { fail=$((fail + 1)); failures+=("$1"); printf '  \033[0;31mFAIL\033[0m %s\n' "$1"; }
+printf '→ smoke-handoff-contract: structured handoff (fresh-context re-entry)\n'
+# 1. operations.md documents the Handoff block with the 5 required lines
+if [ ! -f "$OPS" ]; then
+  record_fail "operations.md missing"
+else
+  if grep -qF "Handoff block (v10.8.0)" "$OPS"; then
+    record_pass "operations.md documents the Handoff block"
+  else
+    record_fail "operations.md missing 'Handoff block (v10.8.0)' spec"
+  fi
+  for line in "- Done:" "- Remaining:" "- Decisions:" "- Open findings:" "- Next:"; do
+    if grep -qF -- "$line" "$OPS"; then
+      record_pass "operations.md handoff spec has '$line'"
+    else
+      record_fail "operations.md handoff spec missing '$line'"
+    fi
+  done
+  if grep -qF "no agent dispatch, no extra LLM call" "$OPS"; then
+    record_pass "operations.md states handoff is orchestrator-written (no LLM call)"
+  else
+    record_fail "operations.md must state the handoff costs no LLM call"
+  fi
+fi
+# 2. Compaction trigger re-reads state AND latest handoff
+if grep -qE 'agent-state\.json.*AND the latest.*Handoff' "$OPS"; then
+  record_pass "compaction trigger re-reads state + latest handoff"
+else
+  record_fail "operations.md compaction trigger must re-read agent-state.json AND the latest Handoff block"
+fi
+# 3. log-format.md documents the Handoff section
+if grep -qF "## Handoff - end of Phase" "$LOGFMT"; then
+  record_pass "log-format.md documents the Handoff section"
+else
+  record_fail "log-format.md missing the Handoff section"
+fi
+if grep -qF "LATEST block is authoritative" "$LOGFMT"; then
+  record_pass "log-format.md states latest-block-wins semantics"
+else
+  record_fail "log-format.md must state the latest handoff block is authoritative"
+fi
+# 4. resume.md reads handoff first, with fallback for older logs
+if grep -qE 'LATEST .?## Handoff.? block' "$RESUME"; then
+  record_pass "resume.md Step 3 reads the latest Handoff block first"
+else
+  record_fail "resume.md Step 3 must read the latest Handoff block first"
+fi
+if grep -qiF "fall back to per-phase findings" "$RESUME"; then
+  record_pass "resume.md keeps the pre-v10.8 per-phase fallback"
+else
+  record_fail "resume.md must keep the pre-v10.8 per-phase findings fallback"
+fi
+if grep -qF "trust state on mismatch" "$RESUME"; then
+  record_pass "resume.md defines state-wins conflict rule"
+else
+  record_fail "resume.md must define the handoff-vs-state conflict rule (state wins)"
+fi
+printf '\n══ handoff-contract smoke: %d passed, %d failed ══\n' "$pass" "$fail"
+if [ "$fail" -gt 0 ]; then
+  printf '\nFailures:\n'
+  for msg in "${failures[@]}"; do printf '  - %s\n' "$msg"; done
+  exit 1
+fi
+exit 0

package/pipeline/scripts/smoke-verify-by-test.sh ADDED Viewed

@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+# smoke-verify-by-test.sh
+#
+# Verifies the Phase 4 Step 3.7 verify-by-test contract:
+#   1. phase-4-review.md documents Step 3.7 with evidence-gate invocation + feature-doc pointer
+#   2. refs/features/verify-by-test.md exists and covers the verdict table + red-test handoff
+#   3. prefs.schema.json exposes global.verifyByTest.{enabled,maxFindings,model,stepTimeoutSec}
+#   4. verifyByTest.enabled defaults to false (opt-in, no surprise cost)
+#   5. triage-output.schema.json is v3.2.0 with the $defs.verification result enum
+#   6. validate-triage.mjs accepts a valid `confirmed` verification and rejects bad ones
+#   7. phase-3-dev.md documents the redTests rework re-entry
+#
+# Exit 0 = all pass, 1 = any failure.
+set -euo pipefail
+ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+PHASE4_DOC="$ROOT/pipeline/commands/multi-agent/refs/phases/phase-4-review.md"
+PHASE3_DOC="$ROOT/pipeline/commands/multi-agent/refs/phases/phase-3-dev.md"
+FEATURE_DOC="$ROOT/pipeline/commands/multi-agent/refs/features/verify-by-test.md"
+PREFS_SCHEMA="$ROOT/pipeline/schemas/prefs.schema.json"
+TRIAGE_SCHEMA="$ROOT/pipeline/schemas/triage-output.schema.json"
+VALIDATOR="$ROOT/pipeline/scripts/validate-triage.mjs"
+pass=0
+fail=0
+failures=()
+record_pass() { pass=$((pass + 1)); printf '  \033[0;32mPASS\033[0m %s\n' "$1"; }
+record_fail() { fail=$((fail + 1)); failures+=("$1"); printf '  \033[0;31mFAIL\033[0m %s\n' "$1"; }
+printf '→ smoke-verify-by-test: Phase 4 Step 3.7 contract\n'
+# 1. Phase 4 doc documents Step 3.7
+if [ ! -f "$PHASE4_DOC" ]; then
+  record_fail "phase-4-review.md missing"
+else
+  if grep -qF "3.7 Verify-by-test" "$PHASE4_DOC"; then
+    record_pass "phase-4-review.md documents Step 3.7"
+  else
+    record_fail "phase-4-review.md missing Step 3.7 section"
+  fi
+  if grep -qF "evidence-gate.mjs --claim test --status passed" "$PHASE4_DOC"; then
+    record_pass "Step 3.7 downgrade is evidence-gated"
+  else
+    record_fail "Step 3.7 must gate downgrades via evidence-gate.mjs --claim test --status passed"
+  fi
+  if grep -qF "refs/features/verify-by-test.md" "$PHASE4_DOC"; then
+    record_pass "phase-4-review.md points to the feature doc"
+  else
+    record_fail "phase-4-review.md must reference refs/features/verify-by-test.md"
+  fi
+  if grep -qF "review.verify_by_test" "$PHASE4_DOC"; then
+    record_pass "Step 3.7 emits review.verify_by_test telemetry"
+  else
+    record_fail "Step 3.7 must document the review.verify_by_test metric"
+  fi
+fi
+# 2. Feature doc exists with verdict + handoff coverage
+if [ ! -f "$FEATURE_DOC" ]; then
+  record_fail "refs/features/verify-by-test.md missing"
+else
+  for token in "not-reproduced" "inconclusive" "redTests" "Off by default"; do
+    if grep -qF "$token" "$FEATURE_DOC"; then
+      record_pass "feature doc covers '$token'"
+    else
+      record_fail "feature doc missing '$token'"
+    fi
+  done
+fi
+# 3. Prefs schema exposes verifyByTest knobs
+for prop in enabled maxFindings model stepTimeoutSec; do
+  if jq -e ".properties.global.properties.verifyByTest.properties.${prop}" "$PREFS_SCHEMA" >/dev/null 2>&1; then
+    record_pass "prefs schema exposes verifyByTest.${prop}"
+  else
+    record_fail "prefs schema missing verifyByTest.${prop}"
+  fi
+done
+# 4. Off by default  -  preserves existing-user baseline
+if jq -e '.properties.global.properties.verifyByTest.properties.enabled
+          | has("default") and .default == false' "$PREFS_SCHEMA" >/dev/null 2>&1; then
+  record_pass "verifyByTest.enabled defaults to false (opt-in)"
+else
+  record_fail "verifyByTest.enabled must default to false"
+fi
+# 5. Triage schema version + verification enum
+# `|| true` keeps a missing or unparseable schema from aborting the whole run
+# under `set -e`; the empty value is then reported as a normal failure below,
+# so the summary and failure list are still printed.
+schema_version=$(jq -r '.version // empty' "$TRIAGE_SCHEMA" 2>/dev/null || true)
+if [ "$schema_version" = "3.2.0" ]; then
+  record_pass "triage-output schema version is 3.2.0"
+else
+  record_fail "triage-output schema version should be 3.2.0 (was: ${schema_version:-missing})"
+fi
+if jq -e '.["$defs"].verification.properties.result.enum
+          | (index("confirmed") != null
+             and index("not-reproduced") != null
+             and index("inconclusive") != null)' "$TRIAGE_SCHEMA" >/dev/null 2>&1; then
+  record_pass "schema verification.result enum complete"
+else
+  record_fail "schema \$defs.verification.result enum must be confirmed/not-reproduced/inconclusive"
+fi
+# 6. Behavioral validator round-trips
+valid_fixture='{"accepted":[{"severity":"blocking","file":"Sources/Auth/Login.swift","line":42,"issue":"expired token accepted as valid","fix":"reject tokens past expiry in validateToken()","reviewer":"fable","verification":{"result":"confirmed","testRef":"AuthTests/LoginTests/testExpiredTokenRejected","evidencePath":".pipeline/verify-1.test.log"}}],"deferred":[],"rejected":[],"approved":false}'
+if printf '%s' "$valid_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
+  record_pass "validator accepts confirmed verification with testRef+evidencePath"
+else
+  record_fail "validator rejected a valid confirmed verification"
+fi
+bad_result_fixture='{"accepted":[{"severity":"blocking","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"fable","verification":{"result":"maybe"}}],"deferred":[],"rejected":[],"approved":false}'
+if printf '%s' "$bad_result_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
+  record_fail "validator must reject verification.result 'maybe'"
+else
+  record_pass "validator rejects bad verification.result"
+fi
+missing_ref_fixture='{"accepted":[{"severity":"blocking","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"fable","verification":{"result":"confirmed"}}],"deferred":[],"rejected":[],"approved":false}'
+if printf '%s' "$missing_ref_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
+  record_fail "validator must reject confirmed verification without testRef/evidencePath"
+else
+  record_pass "validator rejects confirmed verification lacking testRef/evidencePath"
+fi
+# Reviewer enum parity: fable (Claude Code default) and gpt (Copilot CLI) accepted.
+# Each label gets its own fixture so a pass really covers both values named here.
+for reviewer in fable gpt; do
+  reviewer_fixture='{"accepted":[{"severity":"important","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"'"$reviewer"'"}],"deferred":[],"rejected":[],"approved":true}'
+  if printf '%s' "$reviewer_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
+    record_pass "validator accepts schema-allowed reviewer '$reviewer'"
+  else
+    record_fail "validator must accept reviewer '$reviewer' (triage schema v3.2.0 reviewer enum)"
+  fi
+done
+# 7. Phase 3 doc documents the red-test rework re-entry
+if grep -qF "verifyByTest.redTests" "$PHASE3_DOC"; then
+  record_pass "phase-3-dev.md documents redTests rework re-entry"
+else
+  record_fail "phase-3-dev.md must document verifyByTest.redTests re-entry"
+fi
+printf '\n══ verify-by-test smoke: %d passed, %d failed ══\n' "$pass" "$fail"
+if [ "$fail" -gt 0 ]; then
+  printf '\nFailures:\n'
+  for msg in "${failures[@]}"; do printf '  - %s\n' "$msg"; done
+  exit 1
+fi
+exit 0