@mmerterden/multi-agent-pipeline 10.7.4 → 10.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. package/CHANGELOG.md +93 -0
  2. package/README.md +2 -0
  3. package/docs/engineering.md +76 -0
  4. package/docs/features.md +49 -33
  5. package/package.json +1 -1
  6. package/pipeline/commands/multi-agent/refs/features/verify-by-test.md +41 -0
  7. package/pipeline/commands/multi-agent/refs/phases/log-format.md +10 -0
  8. package/pipeline/commands/multi-agent/refs/phases/operations.md +15 -2
  9. package/pipeline/commands/multi-agent/refs/phases/phase-0-init.md +9 -0
  10. package/pipeline/commands/multi-agent/refs/phases/phase-3-dev.md +12 -0
  11. package/pipeline/commands/multi-agent/refs/phases/phase-4-review.md +12 -1
  12. package/pipeline/commands/multi-agent/refs/rules.md +1 -0
  13. package/pipeline/commands/multi-agent/resume.md +7 -4
  14. package/pipeline/commands/multi-agent/review.md +33 -1
  15. package/pipeline/schemas/diff-risk.schema.json +5 -4
  16. package/pipeline/schemas/prefs.schema.json +59 -0
  17. package/pipeline/schemas/token-budget.json +7 -7
  18. package/pipeline/schemas/triage-output.schema.json +35 -4
  19. package/pipeline/scripts/README.md +3 -0
  20. package/pipeline/scripts/diff-risk-score.mjs +11 -1
  21. package/pipeline/scripts/fixtures/diff-risk-test-removal.diff +40 -0
  22. package/pipeline/scripts/fixtures/install-layout.tsv +3 -3
  23. package/pipeline/scripts/smoke-diff-risk.sh +30 -1
  24. package/pipeline/scripts/smoke-handoff-contract.sh +92 -0
  25. package/pipeline/scripts/smoke-update-check.sh +122 -0
  26. package/pipeline/scripts/smoke-verify-by-test.sh +148 -0
  27. package/pipeline/scripts/update-check.sh +82 -0
  28. package/pipeline/scripts/validate-diff-risk.mjs +2 -1
  29. package/pipeline/scripts/validate-triage.mjs +31 -2
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env bash
2
+ # smoke-verify-by-test.sh
3
+ #
4
+ # Verifies the Phase 4 Step 3.7 verify-by-test contract:
5
+ # 1. phase-4-review.md documents Step 3.7 with evidence-gate invocation + feature-doc pointer
6
+ # 2. refs/features/verify-by-test.md exists and covers the verdict table + red-test handoff
7
+ # 3. prefs.schema.json exposes global.verifyByTest.{enabled,maxFindings,model,stepTimeoutSec}
8
+ # 4. verifyByTest.enabled defaults to false (opt-in, no surprise cost)
9
+ # 5. triage-output.schema.json is v3.2.0 with the $defs.verification result enum
10
+ # 6. validate-triage.mjs accepts a valid `confirmed` verification and rejects bad ones
11
+ # 7. phase-3-dev.md documents the redTests rework re-entry
12
+ #
13
+ # Exit 0 = all pass, 1 = any failure.
14
+
15
+ set -euo pipefail
16
+
17
+ ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
18
+ PHASE4_DOC="$ROOT/pipeline/commands/multi-agent/refs/phases/phase-4-review.md"
19
+ PHASE3_DOC="$ROOT/pipeline/commands/multi-agent/refs/phases/phase-3-dev.md"
20
+ FEATURE_DOC="$ROOT/pipeline/commands/multi-agent/refs/features/verify-by-test.md"
21
+ PREFS_SCHEMA="$ROOT/pipeline/schemas/prefs.schema.json"
22
+ TRIAGE_SCHEMA="$ROOT/pipeline/schemas/triage-output.schema.json"
23
+ VALIDATOR="$ROOT/pipeline/scripts/validate-triage.mjs"
24
+
25
+ pass=0
26
+ fail=0
27
+ failures=()
28
+ record_pass() { pass=$((pass + 1)); printf ' \033[0;32mPASS\033[0m %s\n' "$1"; }
29
+ record_fail() { fail=$((fail + 1)); failures+=("$1"); printf ' \033[0;31mFAIL\033[0m %s\n' "$1"; }
30
+
31
+ printf '→ smoke-verify-by-test: Phase 4 Step 3.7 contract\n'
32
+
33
+ # 1. Phase 4 doc documents Step 3.7
34
+ if [ ! -f "$PHASE4_DOC" ]; then
35
+ record_fail "phase-4-review.md missing"
36
+ else
37
+ if grep -qF "3.7 Verify-by-test" "$PHASE4_DOC"; then
38
+ record_pass "phase-4-review.md documents Step 3.7"
39
+ else
40
+ record_fail "phase-4-review.md missing Step 3.7 section"
41
+ fi
42
+ if grep -qF "evidence-gate.mjs --claim test --status passed" "$PHASE4_DOC"; then
43
+ record_pass "Step 3.7 downgrade is evidence-gated"
44
+ else
45
+ record_fail "Step 3.7 must gate downgrades via evidence-gate.mjs --claim test --status passed"
46
+ fi
47
+ if grep -qF "refs/features/verify-by-test.md" "$PHASE4_DOC"; then
48
+ record_pass "phase-4-review.md points to the feature doc"
49
+ else
50
+ record_fail "phase-4-review.md must reference refs/features/verify-by-test.md"
51
+ fi
52
+ if grep -qF "review.verify_by_test" "$PHASE4_DOC"; then
53
+ record_pass "Step 3.7 emits review.verify_by_test telemetry"
54
+ else
55
+ record_fail "Step 3.7 must document the review.verify_by_test metric"
56
+ fi
57
+ fi
58
+
59
+ # 2. Feature doc exists with verdict + handoff coverage
60
+ if [ ! -f "$FEATURE_DOC" ]; then
61
+ record_fail "refs/features/verify-by-test.md missing"
62
+ else
63
+ for token in "not-reproduced" "inconclusive" "redTests" "Off by default"; do
64
+ if grep -qF "$token" "$FEATURE_DOC"; then
65
+ record_pass "feature doc covers '$token'"
66
+ else
67
+ record_fail "feature doc missing '$token'"
68
+ fi
69
+ done
70
+ fi
71
+
72
+ # 3. Prefs schema exposes verifyByTest knobs
73
+ for prop in enabled maxFindings model stepTimeoutSec; do
74
+ if jq -e ".properties.global.properties.verifyByTest.properties.${prop}" "$PREFS_SCHEMA" >/dev/null 2>&1; then
75
+ record_pass "prefs schema exposes verifyByTest.${prop}"
76
+ else
77
+ record_fail "prefs schema missing verifyByTest.${prop}"
78
+ fi
79
+ done
80
+
81
+ # 4. Off by default - preserves existing-user baseline
82
+ if jq -e '.properties.global.properties.verifyByTest.properties.enabled
83
+ | has("default") and .default == false' "$PREFS_SCHEMA" >/dev/null 2>&1; then
84
+ record_pass "verifyByTest.enabled defaults to false (opt-in)"
85
+ else
86
+ record_fail "verifyByTest.enabled must default to false"
87
+ fi
88
+
89
# 5. Triage schema version + verification enum
# The version read is guarded with `|| true`: under `set -euo pipefail` a
# missing or unparseable schema would otherwise abort the script right here,
# before a FAIL is recorded and before the summary is printed. An empty value
# falls through to the "was: missing" failure message below.
schema_version=$(jq -r '.version // empty' "$TRIAGE_SCHEMA" 2>/dev/null || true)
if [ "$schema_version" = "3.2.0" ]; then
  record_pass "triage-output schema version is 3.2.0"
else
  record_fail "triage-output schema version should be 3.2.0 (was: ${schema_version:-missing})"
fi
# All three verdict values must be present in $defs.verification.result.enum.
if jq -e '.["$defs"].verification.properties.result.enum
  | (index("confirmed") != null
  and index("not-reproduced") != null
  and index("inconclusive") != null)' "$TRIAGE_SCHEMA" >/dev/null 2>&1; then
  record_pass "schema verification.result enum complete"
else
  record_fail "schema \$defs.verification.result enum must be confirmed/not-reproduced/inconclusive"
fi
104
+
105
+ # 6. Behavioral validator round-trips
106
+ valid_fixture='{"accepted":[{"severity":"blocking","file":"Sources/Auth/Login.swift","line":42,"issue":"expired token accepted as valid","fix":"reject tokens past expiry in validateToken()","reviewer":"fable","verification":{"result":"confirmed","testRef":"AuthTests/LoginTests/testExpiredTokenRejected","evidencePath":".pipeline/verify-1.test.log"}}],"deferred":[],"rejected":[],"approved":false}'
107
+ if printf '%s' "$valid_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
108
+ record_pass "validator accepts confirmed verification with testRef+evidencePath"
109
+ else
110
+ record_fail "validator rejected a valid confirmed verification"
111
+ fi
112
+
113
+ bad_result_fixture='{"accepted":[{"severity":"blocking","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"fable","verification":{"result":"maybe"}}],"deferred":[],"rejected":[],"approved":false}'
114
+ if printf '%s' "$bad_result_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
115
+ record_fail "validator must reject verification.result 'maybe'"
116
+ else
117
+ record_pass "validator rejects bad verification.result"
118
+ fi
119
+
120
+ missing_ref_fixture='{"accepted":[{"severity":"blocking","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"fable","verification":{"result":"confirmed"}}],"deferred":[],"rejected":[],"approved":false}'
121
+ if printf '%s' "$missing_ref_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
122
+ record_fail "validator must reject confirmed verification without testRef/evidencePath"
123
+ else
124
+ record_pass "validator rejects confirmed verification lacking testRef/evidencePath"
125
+ fi
126
+
127
# Reviewer enum parity: fable (Claude Code default) and gpt (Copilot CLI) accepted.
# Each reviewer value gets its own otherwise-identical fixture, so the single
# PASS/FAIL recorded below really covers both names rather than only one.
reviewers_ok=1
for reviewer in fable gpt; do
  reviewer_fixture='{"accepted":[{"severity":"important","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"'"$reviewer"'"}],"deferred":[],"rejected":[],"approved":true}'
  if ! printf '%s' "$reviewer_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
    reviewers_ok=0
  fi
done
if [ "$reviewers_ok" -eq 1 ]; then
  record_pass "validator accepts schema-allowed reviewers (fable/gpt)"
else
  record_fail "validator must accept reviewer values fable and gpt (schema v3.1.0 parity)"
fi
134
+
135
+ # 7. Phase 3 doc documents the red-test rework re-entry
136
+ if grep -qF "verifyByTest.redTests" "$PHASE3_DOC"; then
137
+ record_pass "phase-3-dev.md documents redTests rework re-entry"
138
+ else
139
+ record_fail "phase-3-dev.md must document verifyByTest.redTests re-entry"
140
+ fi
141
+
142
+ printf '\n══ verify-by-test smoke: %d passed, %d failed ══\n' "$pass" "$fail"
143
+ if [ "$fail" -gt 0 ]; then
144
+ printf '\nFailures:\n'
145
+ for msg in "${failures[@]}"; do printf ' - %s\n' "$msg"; done
146
+ exit 1
147
+ fi
148
+ exit 0
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env bash
2
+ # update-check.sh - cached advisory version check (Phase 0 Step 0.6)
3
+ #
4
+ # Compares the locally installed pipeline version against the npm registry's
5
+ # dist-tags.latest. Cached with a TTL so at most one network call per TTL
6
+ # window; the call is bounded by a short timeout and every failure path is
7
+ # silent - this script NEVER blocks or fails the pipeline.
8
+ #
9
+ # stdout: "<local>|<latest>" when a newer version exists, nothing otherwise.
10
+ # Exit code: always 0 (advisory contract).
11
+ #
12
+ # Usage:
13
+ # bash pipeline/scripts/update-check.sh # auto-detect local version
14
+ # bash pipeline/scripts/update-check.sh --local 10.8.0 # explicit local version
15
+ # bash pipeline/scripts/update-check.sh --ttl-hours 24 # cache window (default 24)
16
+ # bash pipeline/scripts/update-check.sh --force # ignore cache
17
+ #
18
+ # Cache file: ~/.claude/logs/multi-agent/.update-check ("epoch|latest").
19
+ # Registry read is a plain curl - never `npm view` (a user-level .npmrc scope
20
+ # mapping can silently reroute npm to a different registry; curl cannot lie).
21
+
22
+ set -euo pipefail
23
+
24
+ PKG="@mmerterden/multi-agent-pipeline"
25
+ REGISTRY_URL="https://registry.npmjs.org/${PKG/\//%2F}"
26
+ CACHE_FILE="${UPDATE_CHECK_CACHE:-$HOME/.claude/logs/multi-agent/.update-check}"
27
+ TTL_HOURS=24
28
+ LOCAL_VERSION=""
29
+ FORCE=0
30
+
31
# Parse command-line flags; unrecognised arguments are skipped.
# A value-taking flag may be the last argument. `shift 2` would then fail and,
# under `set -e`, terminate the script with a non-zero status, breaking the
# "always exit 0" advisory contract. So the flag and its value are shifted
# separately, and the second shift only happens when a value is present.
while [ $# -gt 0 ]; do
  case "$1" in
    --local)
      LOCAL_VERSION="${2:-}"
      shift
      [ $# -eq 0 ] || shift
      ;;
    --ttl-hours)
      TTL_HOURS="${2:-24}"
      # TTL_HOURS is later used in arithmetic expansion; anything other than
      # plain digits falls back to the 24h default instead of erroring there.
      case "$TTL_HOURS" in (*[!0-9]*|"") TTL_HOURS=24 ;; esac
      # Force base 10 so values with a leading zero (e.g. 08) are not parsed as octal.
      TTL_HOURS=$((10#$TTL_HOURS))
      shift
      [ $# -eq 0 ] || shift
      ;;
    --force) FORCE=1; shift ;;
    *) shift ;;
  esac
done
39
+
40
+ # Local version: explicit arg, else read from the pipeline repo clone.
41
+ if [ -z "$LOCAL_VERSION" ]; then
42
+ for candidate in "$HOME/multi-agent-pipeline" "$HOME/dev/multi-agent-pipeline" "$HOME/projects/multi-agent-pipeline"; do
43
+ if [ -f "$candidate/package.json" ]; then
44
+ LOCAL_VERSION=$(node -p "require('$candidate/package.json').version" 2>/dev/null || true)
45
+ [ -n "$LOCAL_VERSION" ] && break
46
+ fi
47
+ done
48
+ fi
49
+ [ -z "$LOCAL_VERSION" ] && exit 0 # cannot determine local version -> silent no-op
50
+
51
+ now=$(date +%s)
52
+ latest=""
53
+
54
+ # Fresh cache?
55
+ if [ "$FORCE" -eq 0 ] && [ -f "$CACHE_FILE" ]; then
56
+ cached_epoch=$(cut -d'|' -f1 "$CACHE_FILE" 2>/dev/null || echo 0)
57
+ cached_latest=$(cut -d'|' -f2 "$CACHE_FILE" 2>/dev/null || echo "")
58
+ case "$cached_epoch" in (*[!0-9]*|"") cached_epoch=0 ;; esac
59
+ if [ $((now - cached_epoch)) -lt $((TTL_HOURS * 3600)) ] && [ -n "$cached_latest" ]; then
60
+ latest="$cached_latest"
61
+ fi
62
+ fi
63
+
64
+ # Stale or missing cache -> one bounded registry call (silent on any failure).
65
+ if [ -z "$latest" ]; then
66
+ latest=$(curl -sm 3 "$REGISTRY_URL" 2>/dev/null \
67
+ | { command -v jq >/dev/null 2>&1 && jq -r '."dist-tags".latest // empty' \
68
+ || sed -n 's/.*"latest":"\([^"]*\)".*/\1/p'; } | head -1) || true
69
+ [ -z "$latest" ] && exit 0
70
+ mkdir -p "$(dirname "$CACHE_FILE")" 2>/dev/null || exit 0
71
+ printf '%s|%s\n' "$now" "$latest" > "$CACHE_FILE" 2>/dev/null || true
72
+ fi
73
+
74
+ [ "$latest" = "$LOCAL_VERSION" ] && exit 0
75
+
76
+ # Update available only when latest sorts strictly ABOVE local (a dev machine
77
+ # running ahead of the registry must not see an "update" prompt).
78
+ highest=$(printf '%s\n%s\n' "$LOCAL_VERSION" "$latest" | sort -t. -k1,1n -k2,2n -k3,3n | tail -1)
79
+ if [ "$highest" = "$latest" ]; then
80
+ printf '%s|%s\n' "$LOCAL_VERSION" "$latest"
81
+ fi
82
+ exit 0
@@ -23,6 +23,7 @@ const ALLOWED_SIGNALS = new Set([
23
23
  "complexity_delta",
24
24
  "ui_critical",
25
25
  "migration",
26
+ "test_lines_removed",
26
27
  ]);
27
28
 
28
29
  function readInput() {
@@ -48,7 +49,7 @@ function validate(obj) {
48
49
  if (typeof obj !== "object" || obj === null || Array.isArray(obj)) {
49
50
  return ["root must be an object"];
50
51
  }
51
- if (obj.schemaVersion !== "1.0.0") errors.push(`schemaVersion must be "1.0.0", got ${JSON.stringify(obj.schemaVersion)}`);
52
+ if (obj.schemaVersion !== "1.1.0") errors.push(`schemaVersion must be "1.1.0", got ${JSON.stringify(obj.schemaVersion)}`);
52
53
 
53
54
  if (typeof obj.task !== "object" || obj.task === null) {
54
55
  errors.push("task must be an object");
@@ -23,9 +23,10 @@
23
23
 
24
24
  import { readFileSync } from "node:fs";
25
25
 
26
- const ALLOWED_REVIEWERS = new Set(["opus", "sonnet"]);
26
+ const ALLOWED_REVIEWERS = new Set(["fable", "opus", "sonnet", "gpt"]);
27
27
  const ALLOWED_SEVERITIES = new Set(["blocking", "important", "suggestion"]);
28
28
  const ALLOWED_CONSENSUS_VERDICTS = new Set(["unanimous-pass", "unanimous-block", "split", "unverified"]);
29
+ const ALLOWED_VERIFICATION_RESULTS = new Set(["confirmed", "not-reproduced", "inconclusive"]);
29
30
  const OVER_REJECT_THRESHOLD = 0.8;
30
31
  const OVER_REJECT_MIN_FINDINGS = 5;
31
32
 
@@ -64,13 +65,41 @@ function validateRawFinding(f, label, errors) {
64
65
  if (typeof f.issue !== "string" || f.issue.length < 4) {
65
66
  errors.push(`${label}: issue must be a string ≥4 chars`);
66
67
  }
68
+ if (f.verification !== undefined) {
69
+ validateVerification(f.verification, `${label}.verification`, errors);
70
+ }
71
+ }
72
+
73
/**
 * Validate the optional v3.2.0 verify-by-test outcome of a finding
 * (Phase 4 Step 3.7). Problems are appended to `errors`; nothing is thrown.
 *
 * Rules when the object is present:
 *  - `result` is required and must be one of ALLOWED_VERIFICATION_RESULTS;
 *  - `confirmed` / `not-reproduced` additionally require non-empty string
 *    `testRef` and `evidencePath` (the empirical claims must be traceable);
 *  - `note`, if present, must be a string.
 *
 * @param {unknown} v - The `verification` value taken from a finding.
 * @param {string} label - Prefix used in every error message.
 * @param {string[]} errors - Accumulator that receives error messages.
 * @returns {void}
 */
function validateVerification(v, label, errors) {
  if (typeof v !== "object" || v === null || Array.isArray(v)) {
    errors.push(`${label}: must be an object when present`);
    return;
  }
  if (!ALLOWED_VERIFICATION_RESULTS.has(v.result)) {
    // JSON.stringify keeps string values quoted exactly as before while making
    // missing or non-string values readable (undefined, 42, {"a":1}) instead
    // of "undefined" / "[object Object]".
    errors.push(
      `${label}: bad result ${JSON.stringify(v.result)} (allowed: confirmed|not-reproduced|inconclusive)`,
    );
    // Without a known result the per-result requirements below are meaningless.
    return;
  }
  if (v.result === "confirmed" || v.result === "not-reproduced") {
    if (typeof v.testRef !== "string" || v.testRef.length === 0) {
      errors.push(`${label}: testRef required and non-empty when result is "${v.result}"`);
    }
    if (typeof v.evidencePath !== "string" || v.evidencePath.length === 0) {
      errors.push(`${label}: evidencePath required and non-empty when result is "${v.result}"`);
    }
  }
  if (v.note !== undefined && typeof v.note !== "string") {
    errors.push(`${label}: note must be a string when present`);
  }
}
68
97
 
69
98
  function validateAccepted(f, i, errors) {
70
99
  validateRawFinding(f, `accepted[${i}]`, errors);
71
100
  if (!ALLOWED_REVIEWERS.has(f.reviewer)) {
72
101
  errors.push(
73
- `accepted[${i}]: reviewer must be "opus" or "sonnet" (got ${JSON.stringify(f.reviewer)}; haiku was removed in v2.1.0)`,
102
+ `accepted[${i}]: reviewer must be one of fable|opus|sonnet|gpt (got ${JSON.stringify(f.reviewer)}; haiku was removed in v2.1.0)`,
74
103
  );
75
104
  }
76
105
  if (typeof f.fix !== "string" || f.fix.length < 4) {