@mmerterden/multi-agent-pipeline 10.7.4 → 10.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +93 -0
- package/README.md +2 -0
- package/docs/engineering.md +76 -0
- package/docs/features.md +49 -33
- package/package.json +1 -1
- package/pipeline/commands/multi-agent/refs/features/verify-by-test.md +41 -0
- package/pipeline/commands/multi-agent/refs/phases/log-format.md +10 -0
- package/pipeline/commands/multi-agent/refs/phases/operations.md +15 -2
- package/pipeline/commands/multi-agent/refs/phases/phase-0-init.md +9 -0
- package/pipeline/commands/multi-agent/refs/phases/phase-3-dev.md +12 -0
- package/pipeline/commands/multi-agent/refs/phases/phase-4-review.md +12 -1
- package/pipeline/commands/multi-agent/refs/rules.md +1 -0
- package/pipeline/commands/multi-agent/resume.md +7 -4
- package/pipeline/commands/multi-agent/review.md +33 -1
- package/pipeline/schemas/diff-risk.schema.json +5 -4
- package/pipeline/schemas/prefs.schema.json +59 -0
- package/pipeline/schemas/token-budget.json +7 -7
- package/pipeline/schemas/triage-output.schema.json +35 -4
- package/pipeline/scripts/README.md +3 -0
- package/pipeline/scripts/diff-risk-score.mjs +11 -1
- package/pipeline/scripts/fixtures/diff-risk-test-removal.diff +40 -0
- package/pipeline/scripts/fixtures/install-layout.tsv +3 -3
- package/pipeline/scripts/smoke-diff-risk.sh +30 -1
- package/pipeline/scripts/smoke-handoff-contract.sh +92 -0
- package/pipeline/scripts/smoke-update-check.sh +122 -0
- package/pipeline/scripts/smoke-verify-by-test.sh +148 -0
- package/pipeline/scripts/update-check.sh +82 -0
- package/pipeline/scripts/validate-diff-risk.mjs +2 -1
- package/pipeline/scripts/validate-triage.mjs +31 -2
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# smoke-verify-by-test.sh
|
|
3
|
+
#
|
|
4
|
+
# Verifies the Phase 4 Step 3.7 verify-by-test contract:
|
|
5
|
+
# 1. phase-4-review.md documents Step 3.7 with evidence-gate invocation + feature-doc pointer
|
|
6
|
+
# 2. refs/features/verify-by-test.md exists and covers the verdict table + red-test handoff
|
|
7
|
+
# 3. prefs.schema.json exposes global.verifyByTest.{enabled,maxFindings,model,stepTimeoutSec}
|
|
8
|
+
# 4. verifyByTest.enabled defaults to false (opt-in, no surprise cost)
|
|
9
|
+
# 5. triage-output.schema.json is v3.2.0 with the $defs.verification result enum
|
|
10
|
+
# 6. validate-triage.mjs accepts a valid `confirmed` verification and rejects bad ones
|
|
11
|
+
# 7. phase-3-dev.md documents the redTests rework re-entry
|
|
12
|
+
#
|
|
13
|
+
# Exit 0 = all pass, 1 = any failure.
|
|
14
|
+
|
|
15
|
+
# Strict mode: stop on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Package root = two directory levels above the directory holding this script.
ROOT="$(cd "$(dirname "$0")/../.." && pwd)"

# Files whose contract is asserted by the checks below.
refs_dir="$ROOT/pipeline/commands/multi-agent/refs"
schemas_dir="$ROOT/pipeline/schemas"
PHASE4_DOC="$refs_dir/phases/phase-4-review.md"
PHASE3_DOC="$refs_dir/phases/phase-3-dev.md"
FEATURE_DOC="$refs_dir/features/verify-by-test.md"
PREFS_SCHEMA="$schemas_dir/prefs.schema.json"
TRIAGE_SCHEMA="$schemas_dir/triage-output.schema.json"
VALIDATOR="$ROOT/pipeline/scripts/validate-triage.mjs"

# Running tallies plus the list of failure messages replayed in the summary.
pass=0
fail=0
failures=()
|
|
28
|
+
# Tally one passing check and print it as a green "PASS <description>" line.
# Globals:   pass (incremented)
# Arguments: $1 - description of the check that passed
record_pass() {
  pass=$((pass + 1))
  local description=$1
  printf ' \033[0;32mPASS\033[0m %s\n' "$description"
}
|
|
29
|
+
# Tally one failing check, remember its message for the final summary, and
# print it as a red "FAIL <description>" line.
# Globals:   fail (incremented), failures (appended)
# Arguments: $1 - description of the check that failed
record_fail() {
  fail=$((fail + 1))
  failures+=("$1")
  local description=$1
  printf ' \033[0;31mFAIL\033[0m %s\n' "$description"
}
|
|
30
|
+
|
|
31
|
+
printf '→ smoke-verify-by-test: Phase 4 Step 3.7 contract\n'

# 1. Phase 4 doc documents Step 3.7
# Table-driven: entry i of each array is (literal to find, PASS text, FAIL text).
if [ -f "$PHASE4_DOC" ]; then
  phase4_needles=(
    "3.7 Verify-by-test"
    "evidence-gate.mjs --claim test --status passed"
    "refs/features/verify-by-test.md"
    "review.verify_by_test"
  )
  phase4_pass_msgs=(
    "phase-4-review.md documents Step 3.7"
    "Step 3.7 downgrade is evidence-gated"
    "phase-4-review.md points to the feature doc"
    "Step 3.7 emits review.verify_by_test telemetry"
  )
  phase4_fail_msgs=(
    "phase-4-review.md missing Step 3.7 section"
    "Step 3.7 must gate downgrades via evidence-gate.mjs --claim test --status passed"
    "phase-4-review.md must reference refs/features/verify-by-test.md"
    "Step 3.7 must document the review.verify_by_test metric"
  )
  for idx in "${!phase4_needles[@]}"; do
    # -F: the needles are literal strings, not regexes.
    if grep -qF "${phase4_needles[$idx]}" "$PHASE4_DOC"; then
      record_pass "${phase4_pass_msgs[$idx]}"
    else
      record_fail "${phase4_fail_msgs[$idx]}"
    fi
  done
else
  record_fail "phase-4-review.md missing"
fi
|
|
58
|
+
|
|
59
|
+
# 2. Feature doc exists with verdict + handoff coverage
# Each token must appear verbatim somewhere in the feature doc.
if [ -f "$FEATURE_DOC" ]; then
  feature_tokens=("not-reproduced" "inconclusive" "redTests" "Off by default")
  for token in "${feature_tokens[@]}"; do
    if ! grep -qF "$token" "$FEATURE_DOC"; then
      record_fail "feature doc missing '$token'"
      continue
    fi
    record_pass "feature doc covers '$token'"
  done
else
  record_fail "refs/features/verify-by-test.md missing"
fi
|
|
71
|
+
|
|
72
|
+
# 3. Prefs schema exposes verifyByTest knobs
# jq -e exits non-zero when the path resolves to null/false, i.e. the knob is absent.
verify_knobs=(enabled maxFindings model stepTimeoutSec)
for prop in "${verify_knobs[@]}"; do
  knob_filter=".properties.global.properties.verifyByTest.properties.${prop}"
  if ! jq -e "$knob_filter" "$PREFS_SCHEMA" >/dev/null 2>&1; then
    record_fail "prefs schema missing verifyByTest.${prop}"
    continue
  fi
  record_pass "prefs schema exposes verifyByTest.${prop}"
done
|
|
80
|
+
|
|
81
|
+
# 4. Off by default - preserves existing-user baseline
# The "enabled" property must carry an explicit `default` key whose value is false.
default_off_filter='.properties.global.properties.verifyByTest.properties.enabled
  | has("default") and .default == false'
if ! jq -e "$default_off_filter" "$PREFS_SCHEMA" >/dev/null 2>&1; then
  record_fail "verifyByTest.enabled must default to false"
else
  record_pass "verifyByTest.enabled defaults to false (opt-in)"
fi
|
|
88
|
+
|
|
89
|
+
# 5. Triage schema version + verification enum
# `|| true` matters: this assignment is a plain command, so under `set -e` a
# missing/unparseable schema (or a missing jq) would otherwise abort the whole
# script with jq's exit code before the summary prints. With it, the problem
# surfaces as an ordinary FAIL ("was: missing") like every other check.
schema_version=$(jq -r '.version // empty' "$TRIAGE_SCHEMA" 2>/dev/null || true)
if [ "$schema_version" = "3.2.0" ]; then
  record_pass "triage-output schema version is 3.2.0"
else
  record_fail "triage-output schema version should be 3.2.0 (was: ${schema_version:-missing})"
fi

# All three verdicts must be present in $defs.verification.properties.result.enum.
if jq -e '.["$defs"].verification.properties.result.enum
  | (index("confirmed") != null
    and index("not-reproduced") != null
    and index("inconclusive") != null)' "$TRIAGE_SCHEMA" >/dev/null 2>&1; then
  record_pass "schema verification.result enum complete"
else
  record_fail "schema \$defs.verification.result enum must be confirmed/not-reproduced/inconclusive"
fi
|
|
104
|
+
|
|
105
|
+
# 6. Behavioral validator round-trips
# Feed a triage JSON document to validate-triage.mjs on stdin ("-") and report
# only its verdict: status 0 = accepted, non-zero = rejected.
# Arguments: $1 - JSON document
validator_accepts() {
  printf '%s' "$1" | node "$VALIDATOR" - >/dev/null 2>&1
}

valid_fixture='{"accepted":[{"severity":"blocking","file":"Sources/Auth/Login.swift","line":42,"issue":"expired token accepted as valid","fix":"reject tokens past expiry in validateToken()","reviewer":"fable","verification":{"result":"confirmed","testRef":"AuthTests/LoginTests/testExpiredTokenRejected","evidencePath":".pipeline/verify-1.test.log"}}],"deferred":[],"rejected":[],"approved":false}'
if validator_accepts "$valid_fixture"; then
  record_pass "validator accepts confirmed verification with testRef+evidencePath"
else
  record_fail "validator rejected a valid confirmed verification"
fi

# Negative case: result value outside the allowed enum.
bad_result_fixture='{"accepted":[{"severity":"blocking","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"fable","verification":{"result":"maybe"}}],"deferred":[],"rejected":[],"approved":false}'
if validator_accepts "$bad_result_fixture"; then
  record_fail "validator must reject verification.result 'maybe'"
else
  record_pass "validator rejects bad verification.result"
fi

# Negative case: "confirmed" without the testRef/evidencePath it requires.
missing_ref_fixture='{"accepted":[{"severity":"blocking","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"fable","verification":{"result":"confirmed"}}],"deferred":[],"rejected":[],"approved":false}'
if validator_accepts "$missing_ref_fixture"; then
  record_fail "validator must reject confirmed verification without testRef/evidencePath"
else
  record_pass "validator rejects confirmed verification lacking testRef/evidencePath"
fi

# Reviewer enum parity: fable (Claude Code default) and gpt (Copilot CLI) accepted
# Both reviewers are exercised explicitly so the PASS message below is literally
# true; a single pass/fail is recorded to keep the check count stable.
reviewer_parity_ok=1
for reviewer in fable gpt; do
  printf -v reviewer_fixture '{"accepted":[{"severity":"important","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"%s"}],"deferred":[],"rejected":[],"approved":true}' "$reviewer"
  if ! validator_accepts "$reviewer_fixture"; then
    reviewer_parity_ok=0
  fi
done
if [ "$reviewer_parity_ok" -eq 1 ]; then
  record_pass "validator accepts schema-allowed reviewers (fable/gpt)"
else
  record_fail "validator must accept reviewer values fable and gpt (schema v3.1.0 parity)"
fi
|
|
134
|
+
|
|
135
|
+
# 7. Phase 3 doc documents the red-test rework re-entry
# Guard the missing-file case first (as checks 1 and 2 do) so grep's
# "No such file" error does not leak to stderr and the failure names the real cause.
if [ ! -f "$PHASE3_DOC" ]; then
  record_fail "phase-3-dev.md missing"
elif grep -qF "verifyByTest.redTests" "$PHASE3_DOC"; then
  record_pass "phase-3-dev.md documents redTests rework re-entry"
else
  record_fail "phase-3-dev.md must document verifyByTest.redTests re-entry"
fi
|
|
141
|
+
|
|
142
|
+
# Summary line, then exit 0 when nothing failed; otherwise replay every
# recorded failure message and exit 1.
printf '\n══ verify-by-test smoke: %d passed, %d failed ══\n' "$pass" "$fail"
if [ "$fail" -eq 0 ]; then
  exit 0
fi
printf '\nFailures:\n'
# printf reuses its format once per argument, so this prints one bullet per failure.
printf ' - %s\n' "${failures[@]}"
exit 1
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# update-check.sh - cached advisory version check (Phase 0 Step 0.6)
|
|
3
|
+
#
|
|
4
|
+
# Compares the locally installed pipeline version against the npm registry's
|
|
5
|
+
# dist-tags.latest. Cached with a TTL so at most one network call per TTL
|
|
6
|
+
# window; the call is bounded by a short timeout and every failure path is
|
|
7
|
+
# silent - this script NEVER blocks or fails the pipeline.
|
|
8
|
+
#
|
|
9
|
+
# stdout: "<local>|<latest>" when a newer version exists, nothing otherwise.
|
|
10
|
+
# Exit code: always 0 (advisory contract).
|
|
11
|
+
#
|
|
12
|
+
# Usage:
|
|
13
|
+
# bash pipeline/scripts/update-check.sh # auto-detect local version
|
|
14
|
+
# bash pipeline/scripts/update-check.sh --local 10.8.0 # explicit local version
|
|
15
|
+
# bash pipeline/scripts/update-check.sh --ttl-hours 24 # cache window (default 24)
|
|
16
|
+
# bash pipeline/scripts/update-check.sh --force # ignore cache
|
|
17
|
+
#
|
|
18
|
+
# Cache file: ~/.claude/logs/multi-agent/.update-check ("epoch|latest").
|
|
19
|
+
# Registry read is a plain curl - never `npm view` (a user-level .npmrc scope
|
|
20
|
+
# mapping can silently reroute npm to a different registry; curl cannot lie).
|
|
21
|
+
|
|
22
|
+
set -euo pipefail

PKG="@mmerterden/multi-agent-pipeline"
# The "/" of the scoped package name is percent-encoded for the registry URL.
REGISTRY_URL="https://registry.npmjs.org/${PKG/\//%2F}"
# UPDATE_CHECK_CACHE overrides the cache location. ${HOME:-} keeps an unset
# HOME from tripping `set -u` (which would break the always-exit-0 contract).
CACHE_FILE="${UPDATE_CHECK_CACHE:-${HOME:-}/.claude/logs/multi-agent/.update-check}"
TTL_HOURS=24
LOCAL_VERSION=""
FORCE=0

#######################################
# Parse command-line flags into the globals above. Unknown arguments are ignored.
# Globals:   LOCAL_VERSION, TTL_HOURS, FORCE (written)
# Arguments: the script's argv
# Returns:   0 always
#######################################
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --local)
        LOCAL_VERSION="${2:-}"
        # A flag given without its value must not attempt `shift 2`: that
        # fails when only one argument is left and, under `set -e`, would
        # terminate the script with a non-zero status.
        shift "$(( $# > 1 ? 2 : 1 ))"
        ;;
      --ttl-hours)
        TTL_HOURS="${2:-24}"
        shift "$(( $# > 1 ? 2 : 1 ))"
        ;;
      --force)
        FORCE=1
        shift
        ;;
      *)
        shift
        ;;
    esac
  done

  # TTL_HOURS is used in shell arithmetic later on; anything that is not a
  # plain non-negative integer falls back to the default, and 10# forces
  # base 10 so values such as "08" are not rejected as invalid octal.
  case "$TTL_HOURS" in
    '' | *[!0-9]*) TTL_HOURS=24 ;;
    *) TTL_HOURS=$((10#$TTL_HOURS)) ;;
  esac
}

parse_args "$@"
|
|
39
|
+
|
|
40
|
+
# Local version: explicit arg, else read from the pipeline repo clone.
if [ -z "$LOCAL_VERSION" ]; then
  for candidate in "$HOME/multi-agent-pipeline" "$HOME/dev/multi-agent-pipeline" "$HOME/projects/multi-agent-pipeline"; do
    manifest="$candidate/package.json"
    [ -f "$manifest" ] || continue
    # The path is handed to node as an argument rather than spliced into the
    # JS source, so quotes or backslashes in it cannot break the snippet.
    # `|| ""` turns a manifest without a version into an empty result instead
    # of the literal string "undefined".
    LOCAL_VERSION=$(node -p 'require(process.argv[1]).version || ""' "$manifest" 2>/dev/null || true)
    if [ -n "$LOCAL_VERSION" ]; then
      break
    fi
  done
fi
# cannot determine local version -> silent no-op
if [ -z "$LOCAL_VERSION" ]; then
  exit 0
fi
|
|
50
|
+
|
|
51
|
+
now=$(date +%s)
latest=""

# Fresh cache?
# The cache file holds "epoch|latest"; it is used only when --force is off,
# the stored version is non-empty and the entry is younger than the TTL.
if [ "$FORCE" -eq 0 ] && [ -f "$CACHE_FILE" ]; then
  cache_stamp=$(cut -d'|' -f1 "$CACHE_FILE" 2>/dev/null || echo 0)
  cache_version=$(cut -d'|' -f2 "$CACHE_FILE" 2>/dev/null || echo "")
  # Anything that is not purely digits counts as epoch 0.
  case "$cache_stamp" in
    "" | *[!0-9]*) cache_stamp=0 ;;
  esac
  cache_age=$((now - cache_stamp))
  ttl_seconds=$((TTL_HOURS * 3600))
  if [ "$cache_age" -lt "$ttl_seconds" ] && [ -n "$cache_version" ]; then
    latest="$cache_version"
  fi
fi
|
|
63
|
+
|
|
64
|
+
# Stale or missing cache -> one bounded registry call (silent on any failure).
if [ -z "$latest" ]; then
  # curl -m 3 caps the whole transfer at three seconds. The extractor is
  # chosen with a real if/else: with `jq-available && jq || sed`, a jq parse
  # failure would also run sed on whatever is left of the stream. jq's own
  # stderr is silenced so a non-JSON reply prints nothing.
  latest=$(
    curl -sm 3 "$REGISTRY_URL" 2>/dev/null | {
      if command -v jq >/dev/null 2>&1; then
        jq -r '."dist-tags".latest // empty' 2>/dev/null
      else
        sed -n 's/.*"latest":"\([^"]*\)".*/\1/p'
      fi
    } | head -1
  ) || true

  # The value comes from the network and ends up in the cache file and on
  # stdout: accept only something version-shaped (leading digit; digits,
  # letters, ".", "+", "-"); on anything else, or nothing, stay silent.
  case "$latest" in
    '' | [!0-9]* | *[!0-9A-Za-z.+-]*) exit 0 ;;
  esac

  mkdir -p "$(dirname "$CACHE_FILE")" 2>/dev/null || exit 0
  # Grouped so that a failure to open the cache file for writing is silenced too
  # (a trailing 2>/dev/null on the printf itself is applied after the failing
  # `>` redirection and would not hide the shell's error message).
  { printf '%s|%s\n' "$now" "$latest" > "$CACHE_FILE"; } 2>/dev/null || true
fi
|
|
73
|
+
|
|
74
|
+
# Identical versions -> nothing to report.
if [ "$latest" = "$LOCAL_VERSION" ]; then
  exit 0
fi

# Update available only when latest sorts strictly ABOVE local (a dev machine
# running ahead of the registry must not see an "update" prompt).
# The two versions are ordered numerically on their first three dot-separated
# fields; the last line of the sorted output is the higher one.
# NOTE(review): pre-release suffixes (e.g. "-beta.1") are not given semver
# precedence by this sort - confirm whether such versions can occur here.
newest=$(printf '%s\n%s\n' "$LOCAL_VERSION" "$latest" | sort -t. -k1,1n -k2,2n -k3,3n | tail -1)
[ "$newest" = "$latest" ] || exit 0
printf '%s|%s\n' "$LOCAL_VERSION" "$latest"
exit 0
|
|
@@ -23,6 +23,7 @@ const ALLOWED_SIGNALS = new Set([
|
|
|
23
23
|
"complexity_delta",
|
|
24
24
|
"ui_critical",
|
|
25
25
|
"migration",
|
|
26
|
+
"test_lines_removed",
|
|
26
27
|
]);
|
|
27
28
|
|
|
28
29
|
function readInput() {
|
|
@@ -48,7 +49,7 @@ function validate(obj) {
|
|
|
48
49
|
if (typeof obj !== "object" || obj === null || Array.isArray(obj)) {
|
|
49
50
|
return ["root must be an object"];
|
|
50
51
|
}
|
|
51
|
-
if (obj.schemaVersion !== "1.
|
|
52
|
+
if (obj.schemaVersion !== "1.1.0") errors.push(`schemaVersion must be "1.1.0", got ${JSON.stringify(obj.schemaVersion)}`);
|
|
52
53
|
|
|
53
54
|
if (typeof obj.task !== "object" || obj.task === null) {
|
|
54
55
|
errors.push("task must be an object");
|
|
@@ -23,9 +23,10 @@
|
|
|
23
23
|
|
|
24
24
|
import { readFileSync } from "node:fs";
|
|
25
25
|
|
|
26
|
-
const ALLOWED_REVIEWERS = new Set(["opus", "sonnet"]);
|
|
26
|
+
const ALLOWED_REVIEWERS = new Set(["fable", "opus", "sonnet", "gpt"]);
|
|
27
27
|
const ALLOWED_SEVERITIES = new Set(["blocking", "important", "suggestion"]);
|
|
28
28
|
const ALLOWED_CONSENSUS_VERDICTS = new Set(["unanimous-pass", "unanimous-block", "split", "unverified"]);
|
|
29
|
+
const ALLOWED_VERIFICATION_RESULTS = new Set(["confirmed", "not-reproduced", "inconclusive"]);
|
|
29
30
|
const OVER_REJECT_THRESHOLD = 0.8;
|
|
30
31
|
const OVER_REJECT_MIN_FINDINGS = 5;
|
|
31
32
|
|
|
@@ -64,13 +65,41 @@ function validateRawFinding(f, label, errors) {
|
|
|
64
65
|
if (typeof f.issue !== "string" || f.issue.length < 4) {
|
|
65
66
|
errors.push(`${label}: issue must be a string ≥4 chars`);
|
|
66
67
|
}
|
|
68
|
+
if (f.verification !== undefined) {
|
|
69
|
+
validateVerification(f.verification, `${label}.verification`, errors);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// v3.2.0 verify-by-test outcome (Phase 4 Step 3.7). Optional; when present:
// result is required, and confirmed/not-reproduced additionally require
// testRef + evidencePath (the empirical claims must be traceable).
/**
 * Validate one finding's `verification` object, appending a message to
 * `errors` for every problem found (nothing is thrown or returned).
 *
 * @param {unknown} v - the `verification` value taken from a finding
 * @param {string} label - prefix for error messages, e.g. "accepted[0].verification"
 * @param {string[]} errors - accumulator the messages are pushed onto
 */
function validateVerification(v, label, errors) {
  if (typeof v !== "object" || v === null || Array.isArray(v)) {
    errors.push(`${label}: must be an object when present`);
    return;
  }

  if (!ALLOWED_VERIFICATION_RESULTS.has(v.result)) {
    // JSON.stringify (as the reviewer message in validateAccepted does) keeps
    // non-string values readable instead of e.g. "[object Object]".
    errors.push(`${label}: bad result ${JSON.stringify(v.result)} (allowed: confirmed|not-reproduced|inconclusive)`);
  } else if (v.result === "confirmed" || v.result === "not-reproduced") {
    // Both empirical verdicts must point at the test and its evidence log.
    for (const field of ["testRef", "evidencePath"]) {
      if (typeof v[field] !== "string" || v[field].length === 0) {
        errors.push(`${label}: ${field} required and non-empty when result is "${v.result}"`);
      }
    }
  }

  // Checked even when `result` is invalid so one run reports every problem.
  if (v.note !== undefined && typeof v.note !== "string") {
    errors.push(`${label}: note must be a string when present`);
  }
}
|
|
68
97
|
|
|
69
98
|
function validateAccepted(f, i, errors) {
|
|
70
99
|
validateRawFinding(f, `accepted[${i}]`, errors);
|
|
71
100
|
if (!ALLOWED_REVIEWERS.has(f.reviewer)) {
|
|
72
101
|
errors.push(
|
|
73
|
-
`accepted[${i}]: reviewer must be
|
|
102
|
+
`accepted[${i}]: reviewer must be one of fable|opus|sonnet|gpt (got ${JSON.stringify(f.reviewer)}; haiku was removed in v2.1.0)`,
|
|
74
103
|
);
|
|
75
104
|
}
|
|
76
105
|
if (typeof f.fix !== "string" || f.fix.length < 4) {
|