@mmerterden/multi-agent-pipeline 10.7.3 → 10.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/CHANGELOG.md +73 -2
  2. package/docs/adr/0001-three-model-triage.md +2 -2
  3. package/docs/adr/0007-multi-tool-adapter-framework.md +1 -1
  4. package/docs/adr/README.md +2 -2
  5. package/docs/architecture.md +14 -14
  6. package/docs/features.md +35 -22
  7. package/docs/performance.md +3 -3
  8. package/index.js +3 -7
  9. package/install/templates/copilot-instructions.md +2 -2
  10. package/package.json +2 -5
  11. package/pipeline/agents/dev-critic.md +1 -1
  12. package/pipeline/claude-md-template.md +1 -1
  13. package/pipeline/commands/multi-agent/dev-autopilot.md +1 -1
  14. package/pipeline/commands/multi-agent/finish.md +2 -2
  15. package/pipeline/commands/multi-agent/help.md +12 -12
  16. package/pipeline/commands/multi-agent/local.md +1 -1
  17. package/pipeline/commands/multi-agent/refs/features/dev-critic.md +1 -1
  18. package/pipeline/commands/multi-agent/refs/features/model-fallback.md +7 -3
  19. package/pipeline/commands/multi-agent/refs/features/verify-by-test.md +41 -0
  20. package/pipeline/commands/multi-agent/refs/knowledge.md +1 -1
  21. package/pipeline/commands/multi-agent/refs/phases/log-format.md +11 -1
  22. package/pipeline/commands/multi-agent/refs/phases/modes.md +1 -1
  23. package/pipeline/commands/multi-agent/refs/phases/operations.md +15 -2
  24. package/pipeline/commands/multi-agent/refs/phases/phase-1-analysis.md +2 -2
  25. package/pipeline/commands/multi-agent/refs/phases/phase-2-planning.md +2 -2
  26. package/pipeline/commands/multi-agent/refs/phases/phase-3-dev.md +3 -1
  27. package/pipeline/commands/multi-agent/refs/phases/phase-4-review.md +51 -19
  28. package/pipeline/commands/multi-agent/refs/progress-contract.md +1 -1
  29. package/pipeline/commands/multi-agent/refs/rules.md +1 -0
  30. package/pipeline/commands/multi-agent/refs/tracker-contract.md +1 -2
  31. package/pipeline/commands/multi-agent/resume.md +7 -4
  32. package/pipeline/commands/multi-agent/review.md +41 -9
  33. package/pipeline/commands/multi-agent/sync.md +3 -3
  34. package/pipeline/commands/multi-agent.md +7 -7
  35. package/pipeline/schemas/agent-state.schema.json +1 -1
  36. package/pipeline/schemas/diff-risk.schema.json +5 -4
  37. package/pipeline/schemas/prefs.schema.json +38 -3
  38. package/pipeline/schemas/reviewer-output.schema.json +1 -1
  39. package/pipeline/schemas/triage-output.schema.json +37 -6
  40. package/pipeline/scripts/README.md +3 -2
  41. package/pipeline/scripts/cost-budget-check.mjs +1 -1
  42. package/pipeline/scripts/cost-table.json +7 -0
  43. package/pipeline/scripts/diff-risk-score.mjs +11 -1
  44. package/pipeline/scripts/fixtures/diff-risk-test-removal.diff +40 -0
  45. package/pipeline/scripts/fixtures/install-layout.tsv +5 -5
  46. package/pipeline/scripts/smoke-diff-risk.sh +30 -1
  47. package/pipeline/scripts/smoke-handoff-contract.sh +92 -0
  48. package/pipeline/scripts/smoke-verify-by-test.sh +148 -0
  49. package/pipeline/scripts/uninstall.mjs +53 -57
  50. package/pipeline/scripts/validate-diff-risk.mjs +2 -1
  51. package/pipeline/scripts/validate-triage.mjs +31 -2
  52. package/pipeline/skills/shared/core/multi-agent/SKILL.md +11 -11
  53. package/pipeline/skills/shared/core/multi-agent-dev-autopilot/SKILL.md +1 -1
  54. package/pipeline/skills/shared/core/multi-agent-finish/SKILL.md +1 -1
  55. package/pipeline/skills/shared/core/multi-agent-help/SKILL.md +8 -8
  56. package/pipeline/skills/shared/core/multi-agent-review/SKILL.md +5 -5
  57. package/pipeline/skills/shared/core/multi-agent-sync/SKILL.md +7 -5
  58. package/pipeline/scripts/smoke-readme-counts.sh +0 -120
@@ -1,16 +1,16 @@
1
1
  {
2
2
  "$schema": "https://json-schema.org/draft/2020-12/schema",
3
3
  "$id": "https://github.com/mmerterden/multi-agent-pipeline/pipeline/schemas/diff-risk.schema.json",
4
- "version": "1.0.0",
4
+ "version": "1.1.0",
5
5
  "title": "Multi-Agent Pipeline - Phase 4 diff risk score",
6
- "description": "Output contract for diff-risk-score.mjs. Heuristic, deterministic, no LLM. Produced before Phase 4 Step 2 to give reviewer prompts a priority ordering - never used as a gate.",
6
+ "description": "Output contract for diff-risk-score.mjs. Heuristic, deterministic, no LLM. Produced before Phase 4 Step 2 to give reviewer prompts a priority ordering - never used as a gate. v1.1.0 adds the test_lines_removed signal (immutable-test backstop: a test file whose diff removes more lines than it adds).",
7
7
  "type": "object",
8
8
  "additionalProperties": false,
9
9
  "required": ["schemaVersion", "task", "totals", "files"],
10
10
  "properties": {
11
11
  "schemaVersion": {
12
12
  "type": "string",
13
- "const": "1.0.0"
13
+ "const": "1.1.0"
14
14
  },
15
15
  "task": {
16
16
  "type": "object",
@@ -63,7 +63,8 @@
63
63
  "no_test_change",
64
64
  "complexity_delta",
65
65
  "ui_critical",
66
- "migration"
66
+ "migration",
67
+ "test_lines_removed"
67
68
  ]
68
69
  },
69
70
  "weight": { "type": "number" },
@@ -701,6 +701,41 @@
701
701
  "default": false,
702
702
  "description": "v6.1.0+ \u2014 Phase 4 Step 2.5 rebuttal round. When reviewers disagree (mixed blocker/approved verdict), each reviewer is re-prompted with the others' opposing arguments for one additional round before triage. Lifts signal quality on ambiguous findings at ~1\u00d7 Step 2 token cost. Off by default \u2014 flip for security-critical or release-branch reviews."
703
703
  },
704
+ "verifyByTest": {
705
+ "type": "object",
706
+ "additionalProperties": false,
707
+ "description": "v10.8+ - Phase 4 Step 3.7 verify-by-test. When enabled, accepted BLOCKING findings are empirically validated before the Phase 3 rework loop: one verifier agent writes a minimal repro test per finding and runs only that test. Confirmed findings hand their failing test to Phase 3 as the RED step; non-reproducible findings are downgraded to deferred under evidence-gate. Only blocking findings are ever verified (fixed behavior, not a knob). Adds one model call plus up to maxFindings single-test runs per iteration with accepted blockers; default off. Flip on for security-critical work, release branches, or repos with noisy reviewers. Full spec: refs/features/verify-by-test.md.",
708
+ "properties": {
709
+ "enabled": {
710
+ "type": "boolean",
711
+ "default": false,
712
+ "description": "Master switch."
713
+ },
714
+ "maxFindings": {
715
+ "type": "integer",
716
+ "minimum": 1,
717
+ "maximum": 10,
718
+ "default": 3,
719
+ "description": "Max accepted blocking findings verified per review iteration. Findings beyond the cap keep their judgment-only verdict."
720
+ },
721
+ "model": {
722
+ "type": "string",
723
+ "enum": [
724
+ "sonnet",
725
+ "opus"
726
+ ],
727
+ "default": "sonnet",
728
+ "description": "Verifier agent model. Writing a minimal repro test is mechanical work; Sonnet is the cost-sane default."
729
+ },
730
+ "stepTimeoutSec": {
731
+ "type": "integer",
732
+ "minimum": 60,
733
+ "maximum": 1800,
734
+ "default": 600,
735
+ "description": "Wall-clock budget for the whole Step 3.7 pass. On breach, remaining findings keep judgment-only verdicts and the pipeline proceeds (never blocks)."
736
+ }
737
+ }
738
+ },
704
739
  "review": {
705
740
  "type": "object",
706
741
  "additionalProperties": false,
@@ -831,9 +866,9 @@
831
866
  },
832
867
  "pricingModel": {
833
868
  "type": "string",
834
- "enum": ["opus", "sonnet", "haiku"],
835
- "default": "opus",
836
- "description": "Which cost-table.json rate to price accumulated tokens at. Defaults to opus for a deliberately conservative (upper-bound) estimate, so the ceiling trips early rather than late."
869
+ "enum": ["fable", "opus", "sonnet", "haiku"],
870
+ "default": "fable",
871
+ "description": "Which cost-table.json rate to price accumulated tokens at. Defaults to fable (the top tier since v10.6.0) for a deliberately conservative (upper-bound) estimate, so the ceiling trips early rather than late."
837
872
  }
838
873
  }
839
874
  },
@@ -19,7 +19,7 @@
19
19
  },
20
20
  "reviewer": {
21
21
  "type": "string",
22
- "description": "Model label for this output (e.g. 'opus', 'sonnet', 'gpt'). Present once the parallel reviewer outputs are merged into the Phase 4 array so triage/consensus can attribute each finding to its source. Optional on a single reviewer's raw pre-merge output."
22
+ "description": "Model label for this output (e.g. 'fable', 'opus', 'sonnet', 'gpt'). Present once the parallel reviewer outputs are merged into the Phase 4 array so triage/consensus can attribute each finding to its source. Optional on a single reviewer's raw pre-merge output."
23
23
  }
24
24
  },
25
25
  "$defs": {
@@ -1,9 +1,9 @@
1
1
  {
2
2
  "$schema": "https://json-schema.org/draft/2020-12/schema",
3
3
  "$id": "https://github.com/mmerterden/multi-agent-pipeline/pipeline/schemas/triage-output.schema.json",
4
- "version": "3.1.0",
4
+ "version": "3.2.0",
5
5
  "title": "Multi-Agent Pipeline - Phase 4 triage output",
6
- "description": "Contract for the Opus triage agent's JSON output in Phase 4 Step 3. Triage consumes merged reviewer findings and splits them into accepted/deferred/rejected. Only `accepted` blocking/important items trigger Phase 3 rework. v3.1.0 adds the optional `consensus` block so triage can surface reviewer-agreement risk (false consensus among same-base-model reviewers) instead of silently merging.",
6
+ "description": "Contract for the Opus triage agent's JSON output in Phase 4 Step 3. Triage consumes merged reviewer findings and splits them into accepted/deferred/rejected. Only `accepted` blocking/important items trigger Phase 3 rework. v3.1.0 adds the optional `consensus` block so triage can surface reviewer-agreement risk (false consensus among same-base-model reviewers) instead of silently merging. v3.2.0 adds the optional per-finding `verification` block written by Phase 4 Step 3.7 (verify-by-test): the empirical repro-test outcome for accepted blocking findings.",
7
7
  "type": "object",
8
8
  "additionalProperties": false,
9
9
  "required": ["accepted", "deferred", "rejected", "approved"],
@@ -74,8 +74,8 @@
74
74
  },
75
75
  "reviewer": {
76
76
  "type": "string",
77
- "enum": ["opus", "sonnet"],
78
- "description": "Which reviewer produced the raw finding. Haiku was removed in v2.1.0."
77
+ "enum": ["fable", "opus", "sonnet", "gpt"],
78
+ "description": "Which reviewer produced the raw finding. Claude Code Reviewer 1 is fable (opus when fallback engages); Copilot CLI adds gpt. Haiku was removed in v2.1.0."
79
79
  },
80
80
  "consensus": {
81
81
  "type": "object",
@@ -114,6 +114,35 @@
114
114
  }
115
115
  }
116
116
  },
117
+ "verification": {
118
+ "type": "object",
119
+ "additionalProperties": false,
120
+ "description": "v3.2.0 verify-by-test outcome (Phase 4 Step 3.7, opt-in via prefs.global.verifyByTest). confirmed = repro test failed as the finding predicts (finding stands, test kept as the Phase 3 RED test); not-reproduced = repro test passed under evidence-gate (finding downgraded to deferred); inconclusive = compile error / timeout / not unit-testable (judgment verdict stands).",
121
+ "required": ["result"],
122
+ "properties": {
123
+ "result": {
124
+ "type": "string",
125
+ "enum": ["confirmed", "not-reproduced", "inconclusive"]
126
+ },
127
+ "testRef": {
128
+ "type": "string",
129
+ "minLength": 1,
130
+ "description": "Single-test reference, e.g. 'AuthTests/LoginTests/testExpiredTokenRejected' or 'tests/test_auth.py::test_expired_token'."
131
+ },
132
+ "evidencePath": {
133
+ "type": "string",
134
+ "minLength": 1,
135
+ "description": "Path to the test-run log verified by evidence-gate.mjs, e.g. '.pipeline/verify-1.test.log'."
136
+ },
137
+ "note": { "type": "string" }
138
+ },
139
+ "if": {
140
+ "required": ["result"], "properties": { "result": { "enum": ["confirmed", "not-reproduced"] } }
141
+ },
142
+ "then": {
143
+ "required": ["result", "testRef", "evidencePath"]
144
+ }
145
+ },
117
146
  "rawFinding": {
118
147
  "type": "object",
119
148
  "additionalProperties": false,
@@ -124,7 +153,8 @@
124
153
  "line": { "type": "integer", "minimum": 0 },
125
154
  "issue": { "type": "string", "minLength": 4 },
126
155
  "fix": { "type": "string" },
127
- "reviewer": { "$ref": "#/$defs/reviewer" }
156
+ "reviewer": { "$ref": "#/$defs/reviewer" },
157
+ "verification": { "$ref": "#/$defs/verification" }
128
158
  }
129
159
  },
130
160
  "acceptedFinding": {
@@ -144,7 +174,8 @@
144
174
  "type": "string",
145
175
  "minLength": 4,
146
176
  "description": "Concrete change the dev agent must make. Required for accepted items so Phase 3 re-entry has actionable direction."
147
- }
177
+ },
178
+ "verification": { "$ref": "#/$defs/verification" }
148
179
  }
149
180
  }
150
181
  ]
@@ -22,6 +22,8 @@ Validate contracts. Each emits `══ <name> smoke: N passed, M failed ══`
22
22
  - `smoke-phase-6-multi.sh` - Phase 6 multi-repo commit/PR cross-linking
23
23
  - `smoke-phase-banner.sh` + `smoke-phase-tracker.sh` - Phase UI output contracts
24
24
  - `smoke-phase4-triage.sh` - Phase 4 reviewer → triage flow
25
+ - `smoke-verify-by-test.sh` - Phase 4 Step 3.7 verify-by-test contract (v10.8.0)
26
+ - `smoke-handoff-contract.sh` - phase-boundary structured handoff + handoff-first resume (v10.8.0)
25
27
 
26
28
  ### Schema + state
27
29
  - `smoke-schema-validation.sh` - all JSON schemas validate
@@ -64,12 +66,11 @@ Installed into `~/.claude/scripts/` and invoked by settings.json hook configurat
64
66
  - `pre-push-check.sh` - runs before `git push` (smoke-cross-cli-behavior + smoke-personal-data)
65
67
  - `output-quality-check.sh` - runs after PR body / Jira comment generation (newline / HTML entity guard)
66
68
 
67
- ## Runtime helpers (13 files)
69
+ ## Runtime helpers
68
70
  Shell scripts invoked during pipeline execution.
69
71
 
70
72
  - `phase-banner.sh` - renders phase headers
71
73
  - `phase-tracker.sh` - live tracker state + tokens accumulation + render
72
- - `stack-swap.sh` - stack detection + skill set swap
73
74
  - `keychain-save.sh` - store PAT in macOS Keychain
74
75
  - `audit-log.sh` + `audit-log-rotate.sh` - opt-in audit trail
75
76
  - `log-metric.sh` - opt-in metric capture
@@ -66,7 +66,7 @@ if (flags.help || flags.h) {
66
66
  }
67
67
 
68
68
  // --- resolve config: prefs first, CLI overrides -----------------------------
69
- const cfg = { enabled: false, maxUsd: 5.0, warnPct: 80, onExceed: "warn", pricingModel: "opus" };
69
+ const cfg = { enabled: false, maxUsd: 5.0, warnPct: 80, onExceed: "warn", pricingModel: "fable" };
70
70
 
71
71
  if (flags.prefs) {
72
72
  if (!existsSync(flags.prefs)) die(`prefs file not found: ${flags.prefs}`);
@@ -2,6 +2,13 @@
2
2
  "_readme": "Per-model unit prices in USD per million tokens. Source: Anthropic public pricing (verified 2026-04-21). Update when Anthropic publishes new tiers. Unknown models render USD as ' - ' and emit a footnote - never block PR-body generation. cacheReadPerMtok is the discounted rate for prompt-cache hits (~10% of inPerMtok); the renderer prices a phase's tokens_cached at this rate when the tracker records it, so resume/cache reuse is visible in the ledger.",
3
3
  "schemaVersion": "1.1.0",
4
4
  "prices": {
5
+ "fable": {
6
+ "inPerMtok": 10.0,
7
+ "outPerMtok": 50.0,
8
+ "cacheReadPerMtok": 1.0,
9
+ "modelId": "claude-fable-5",
10
+ "note": "Top tier (restored v10.6.0) - architects, Reviewer 1, triage. Verified against Anthropic pricing 2026-07-02."
11
+ },
5
12
  "opus": {
6
13
  "inPerMtok": 5.0,
7
14
  "outPerMtok": 25.0,
@@ -15,6 +15,7 @@
15
15
  * complexity_delta - added if/guard/case/switch/while count w=1.5
16
16
  * ui_critical - *View.swift / *Screen.kt / Configuration w=1.5
17
17
  * migration - DB schema / migration path w=4.0
18
+ * test_lines_removed - test file shrinks (removed > added) w=3.0
18
19
  *
19
20
  * Inputs:
20
21
  * --base <ref> Base ref. Default: origin/main, fallback: main
@@ -275,6 +276,15 @@ function buildRow(stat, addedLines, allChangedPaths) {
275
276
  }
276
277
  }
277
278
 
279
+ // Test-lines-removed: a test-classified file whose diff removes more lines
280
+ // than it adds. Shrinking tests is the classic get-to-green shortcut the
281
+ // immutable-test rule forbids (refs/rules.md); surface it to reviewers.
282
+ if (isTestPath(path) && stat.removed > stat.added) {
283
+ const w = 3.0;
284
+ signals.push({ name: "test_lines_removed", weight: w, value: stat.removed - stat.added });
285
+ score += 12 * w;
286
+ }
287
+
278
288
  return {
279
289
  path,
280
290
  score: Math.round(score * 100) / 100,
@@ -306,7 +316,7 @@ function main() {
306
316
  };
307
317
 
308
318
  const out = {
309
- schemaVersion: "1.0.0",
319
+ schemaVersion: "1.1.0",
310
320
  task: {
311
321
  id: TASK_ID,
312
322
  base: BASE || "(diff-file)",
@@ -0,0 +1,40 @@
1
+ diff --git a/MyAppTests/LoginViewModelTests.swift b/MyAppTests/LoginViewModelTests.swift
2
+ index 1111111..2222222 100644
3
+ --- a/MyAppTests/LoginViewModelTests.swift
4
+ +++ b/MyAppTests/LoginViewModelTests.swift
5
+ @@ -10,23 +10,7 @@ final class LoginViewModelTests: XCTestCase {
6
+ func testLoginWithValidCredentials_Succeeds() {
7
+ let sut = LoginViewModel(service: MockAuthService())
8
+ + sut.retryPolicy = .none
9
+ sut.login(email: "user@example.com", password: "correct")
10
+ + XCTAssertTrue(sut.isAuthenticated)
11
+ }
12
+ -
13
+ - func testLoginWithInvalidEmail_ShowsError() {
14
+ - let sut = LoginViewModel(service: MockAuthService())
15
+ - sut.login(email: "not-an-email", password: "irrelevant")
16
+ - XCTAssertEqual(sut.errorMessage, "Invalid email")
17
+ - }
18
+ -
19
+ - func testLoginWithExpiredToken_Rejects() {
20
+ - let sut = LoginViewModel(service: MockAuthService(tokenState: .expired))
21
+ - sut.login(email: "user@example.com", password: "correct")
22
+ - XCTAssertFalse(sut.isAuthenticated)
23
+ - }
24
+ -
25
+ - func testLogout_ClearsSession() {
26
+ - let sut = LoginViewModel(service: MockAuthService())
27
+ - sut.logout()
28
+ - XCTAssertNil(sut.session)
29
+ - }
30
+ }
31
+ diff --git a/MyApp/Sources/Auth/LoginViewModel.swift b/MyApp/Sources/Auth/LoginViewModel.swift
32
+ index 3333333..4444444 100644
33
+ --- a/MyApp/Sources/Auth/LoginViewModel.swift
34
+ +++ b/MyApp/Sources/Auth/LoginViewModel.swift
35
+ @@ -20,3 +20,5 @@ final class LoginViewModel {
36
+ func login(email: String, password: String) {
37
+ + guard email.contains("@") else { return }
38
+ + service.authenticate(email: email, password: password)
39
+ }
40
+ }
@@ -1,16 +1,16 @@
1
1
  .claude/CLAUDE.md 1
2
2
  .claude/agents 8
3
- .claude/commands 87
3
+ .claude/commands 89
4
4
  .claude/lib 23
5
5
  .claude/multi-agent-preferences.json 1
6
6
  .claude/rules 12
7
7
  .claude/schemas 23
8
- .claude/scripts 174
8
+ .claude/scripts 169
9
9
  .claude/settings.json 1
10
- .claude/skills 555
10
+ .claude/skills 560
11
11
  .copilot/agents 8
12
12
  .copilot/copilot-instructions.md 1
13
13
  .copilot/lib 23
14
14
  .copilot/schemas 23
15
- .copilot/scripts 174
16
- .copilot/skills 590
15
+ .copilot/scripts 169
16
+ .copilot/skills 596
@@ -12,6 +12,7 @@
12
12
  # 8. phase-4-review.md ref doc declares Step 1.75 + diff-risk-score.mjs
13
13
  # 9. code-reviewer.md agent template carries the priority-files placeholder
14
14
  # 10. prefs.schema.json exposes diffRisk advisory toggle
15
+ # 11. test-removal fixture fires the test_lines_removed signal (v1.1.0)
15
16
  #
16
17
  # Exit 0 = all pass, 1 = any failure.
17
18
 
@@ -26,6 +27,7 @@ REVIEWER="$ROOT/pipeline/agents/code-reviewer.md"
26
27
  PREFS="$ROOT/pipeline/schemas/prefs.schema.json"
27
28
  FIX_IOS="$ROOT/pipeline/scripts/fixtures/diff-risk-ios.diff"
28
29
  FIX_AND="$ROOT/pipeline/scripts/fixtures/diff-risk-android.diff"
30
+ FIX_TESTRM="$ROOT/pipeline/scripts/fixtures/diff-risk-test-removal.diff"
29
31
 
30
32
  pass=0
31
33
  fail=0
@@ -38,10 +40,11 @@ printf '→ smoke-diff-risk (v8.3.0): pre-review risk scoring contract\n'
38
40
  [ -f "$SCHEMA" ] || { record_fail "schema missing: $SCHEMA"; exit 1; }
39
41
  [ -f "$FIX_IOS" ] || { record_fail "fixture missing: $FIX_IOS"; exit 1; }
40
42
  [ -f "$FIX_AND" ] || { record_fail "fixture missing: $FIX_AND"; exit 1; }
43
+ [ -f "$FIX_TESTRM" ] || { record_fail "fixture missing: $FIX_TESTRM"; exit 1; }
41
44
 
42
45
  # --- 1: iOS fixture produces JSON ---
43
46
  out_ios=$(node "$SCORE" --diff "$FIX_IOS" 2>/dev/null)
44
- if jq -e '.schemaVersion == "1.0.0"' <<< "$out_ios" >/dev/null 2>&1; then
47
+ if jq -e '.schemaVersion == "1.1.0"' <<< "$out_ios" >/dev/null 2>&1; then
45
48
  record_pass "iOS fixture renders schema-versioned JSON"
46
49
  else
47
50
  record_fail "iOS fixture JSON malformed or missing schemaVersion"
@@ -150,6 +153,32 @@ else
150
153
  record_fail "prefs.schema.json missing global.diffRiskAdvisory"
151
154
  fi
152
155
 
156
+ # --- 11: test_lines_removed signal fires on the test-removal fixture ---
157
+ out_testrm=$(node "$SCORE" --diff "$FIX_TESTRM" 2>/dev/null || true)  # a scorer crash must surface as FAILs in the checks below, not abort the script under set -e
158
+ sig_value=$(jq -r '.files[] | select(.path == "MyAppTests/LoginViewModelTests.swift")
159
+ | .signals[] | select(.name == "test_lines_removed") | .value' <<< "$out_testrm")
160
+ if [ "$sig_value" = "16" ]; then
161
+ record_pass "test_lines_removed fires with value=16 (18 removed - 2 added)"
162
+ else
163
+ record_fail "test_lines_removed should fire with value=16, got: ${sig_value:-missing}"
164
+ fi
165
+ sig_on_source=$(jq -r '[.files[] | select(.path == "MyApp/Sources/Auth/LoginViewModel.swift")
166
+ | .signals[] | select(.name == "test_lines_removed")] | length' <<< "$out_testrm")
167
+ if [ "$sig_on_source" = "0" ]; then
168
+ record_pass "test_lines_removed does not fire on source files"
169
+ else
170
+ record_fail "test_lines_removed must only fire on test-classified paths"
171
+ fi
172
+ set +e
173
+ echo "$out_testrm" | node "$VALIDATE" - >/dev/null 2>&1
174
+ rc_testrm=$?
175
+ set -e
176
+ if [ "$rc_testrm" -eq 0 ]; then
177
+ record_pass "validator accepts output carrying test_lines_removed"
178
+ else
179
+ record_fail "validator rejected test_lines_removed output (rc=$rc_testrm)"
180
+ fi
181
+
153
182
  # --- Summary ---
154
183
  total=$((pass + fail))
155
184
  printf '\n→ smoke-diff-risk: %d/%d passed\n' "$pass" "$total"
@@ -0,0 +1,92 @@
1
+ #!/usr/bin/env bash
2
+ # smoke-handoff-contract.sh
3
+ #
4
+ # Verifies the v10.8.0 structured-handoff contract (fresh-context re-entry):
5
+ # 1. operations.md documents the Handoff block with all 5 required lines
6
+ # 2. operations.md compaction trigger re-reads state AND the latest handoff
7
+ # 3. log-format.md documents the Handoff section in the canonical log shape
8
+ # 4. resume.md Step 3 reads the latest handoff FIRST with pre-v10.8 fallback
9
+ #
10
+ # Exit 0 = all pass, 1 = any failure.
11
+
12
+ set -euo pipefail
13
+
14
+ ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
15
+ OPS="$ROOT/pipeline/commands/multi-agent/refs/phases/operations.md"
16
+ LOGFMT="$ROOT/pipeline/commands/multi-agent/refs/phases/log-format.md"
17
+ RESUME="$ROOT/pipeline/commands/multi-agent/resume.md"
18
+
19
+ pass=0
20
+ fail=0
21
+ failures=()
22
+ record_pass() { pass=$((pass + 1)); printf ' \033[0;32mPASS\033[0m %s\n' "$1"; }
23
+ record_fail() { fail=$((fail + 1)); failures+=("$1"); printf ' \033[0;31mFAIL\033[0m %s\n' "$1"; }
24
+
25
+ printf '→ smoke-handoff-contract: structured handoff (fresh-context re-entry)\n'
26
+
27
+ # 1. operations.md documents the Handoff block with the 5 required lines
28
+ if [ ! -f "$OPS" ]; then
29
+ record_fail "operations.md missing"
30
+ else
31
+ if grep -qF "Handoff block (v10.8.0)" "$OPS"; then
32
+ record_pass "operations.md documents the Handoff block"
33
+ else
34
+ record_fail "operations.md missing 'Handoff block (v10.8.0)' spec"
35
+ fi
36
+ for line in "- Done:" "- Remaining:" "- Decisions:" "- Open findings:" "- Next:"; do
37
+ if grep -qF -- "$line" "$OPS"; then
38
+ record_pass "operations.md handoff spec has '$line'"
39
+ else
40
+ record_fail "operations.md handoff spec missing '$line'"
41
+ fi
42
+ done
43
+ if grep -qF "no agent dispatch, no extra LLM call" "$OPS"; then
44
+ record_pass "operations.md states handoff is orchestrator-written (no LLM call)"
45
+ else
46
+ record_fail "operations.md must state the handoff costs no LLM call"
47
+ fi
48
+ fi
49
+
50
+ # 2. Compaction trigger re-reads state AND latest handoff
51
+ if grep -qE 'agent-state\.json.*AND the latest.*Handoff' "$OPS"; then
52
+ record_pass "compaction trigger re-reads state + latest handoff"
53
+ else
54
+ record_fail "operations.md compaction trigger must re-read agent-state.json AND the latest Handoff block"
55
+ fi
56
+
57
+ # 3. log-format.md documents the Handoff section
58
+ if grep -qF "## Handoff - end of Phase" "$LOGFMT"; then
59
+ record_pass "log-format.md documents the Handoff section"
60
+ else
61
+ record_fail "log-format.md missing the Handoff section"
62
+ fi
63
+ if grep -qF "LATEST block is authoritative" "$LOGFMT"; then
64
+ record_pass "log-format.md states latest-block-wins semantics"
65
+ else
66
+ record_fail "log-format.md must state the latest handoff block is authoritative"
67
+ fi
68
+
69
+ # 4. resume.md reads handoff first, with fallback for older logs
70
+ if grep -qE 'LATEST .?## Handoff.? block' "$RESUME"; then
71
+ record_pass "resume.md Step 3 reads the latest Handoff block first"
72
+ else
73
+ record_fail "resume.md Step 3 must read the latest Handoff block first"
74
+ fi
75
+ if grep -qiF "fall back to per-phase findings" "$RESUME"; then
76
+ record_pass "resume.md keeps the pre-v10.8 per-phase fallback"
77
+ else
78
+ record_fail "resume.md must keep the pre-v10.8 per-phase findings fallback"
79
+ fi
80
+ if grep -qF "trust state on mismatch" "$RESUME"; then
81
+ record_pass "resume.md defines state-wins conflict rule"
82
+ else
83
+ record_fail "resume.md must define the handoff-vs-state conflict rule (state wins)"
84
+ fi
85
+
86
+ printf '\n══ handoff-contract smoke: %d passed, %d failed ══\n' "$pass" "$fail"
87
+ if [ "$fail" -gt 0 ]; then
88
+ printf '\nFailures:\n'
89
+ for msg in "${failures[@]}"; do printf ' - %s\n' "$msg"; done
90
+ exit 1
91
+ fi
92
+ exit 0
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env bash
2
+ # smoke-verify-by-test.sh
3
+ #
4
+ # Verifies the Phase 4 Step 3.7 verify-by-test contract:
5
+ # 1. phase-4-review.md documents Step 3.7 with evidence-gate invocation + feature-doc pointer
6
+ # 2. refs/features/verify-by-test.md exists and covers the verdict table + red-test handoff
7
+ # 3. prefs.schema.json exposes global.verifyByTest.{enabled,maxFindings,model,stepTimeoutSec}
8
+ # 4. verifyByTest.enabled defaults to false (opt-in, no surprise cost)
9
+ # 5. triage-output.schema.json is v3.2.0 with the $defs.verification result enum
10
+ # 6. validate-triage.mjs accepts a valid `confirmed` verification, rejects bad ones, and accepts the fable/gpt reviewer labels
11
+ # 7. phase-3-dev.md documents the redTests rework re-entry
12
+ #
13
+ # Exit 0 = all pass, 1 = any failure.
14
+
15
+ set -euo pipefail
16
+
17
+ ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
18
+ PHASE4_DOC="$ROOT/pipeline/commands/multi-agent/refs/phases/phase-4-review.md"
19
+ PHASE3_DOC="$ROOT/pipeline/commands/multi-agent/refs/phases/phase-3-dev.md"
20
+ FEATURE_DOC="$ROOT/pipeline/commands/multi-agent/refs/features/verify-by-test.md"
21
+ PREFS_SCHEMA="$ROOT/pipeline/schemas/prefs.schema.json"
22
+ TRIAGE_SCHEMA="$ROOT/pipeline/schemas/triage-output.schema.json"
23
+ VALIDATOR="$ROOT/pipeline/scripts/validate-triage.mjs"
24
+
25
+ pass=0
26
+ fail=0
27
+ failures=()
28
+ record_pass() { pass=$((pass + 1)); printf ' \033[0;32mPASS\033[0m %s\n' "$1"; }
29
+ record_fail() { fail=$((fail + 1)); failures+=("$1"); printf ' \033[0;31mFAIL\033[0m %s\n' "$1"; }
30
+
31
+ printf '→ smoke-verify-by-test: Phase 4 Step 3.7 contract\n'
32
+
33
+ # 1. Phase 4 doc documents Step 3.7
34
+ if [ ! -f "$PHASE4_DOC" ]; then
35
+ record_fail "phase-4-review.md missing"
36
+ else
37
+ if grep -qF "3.7 Verify-by-test" "$PHASE4_DOC"; then
38
+ record_pass "phase-4-review.md documents Step 3.7"
39
+ else
40
+ record_fail "phase-4-review.md missing Step 3.7 section"
41
+ fi
42
+ if grep -qF "evidence-gate.mjs --claim test --status passed" "$PHASE4_DOC"; then
43
+ record_pass "Step 3.7 downgrade is evidence-gated"
44
+ else
45
+ record_fail "Step 3.7 must gate downgrades via evidence-gate.mjs --claim test --status passed"
46
+ fi
47
+ if grep -qF "refs/features/verify-by-test.md" "$PHASE4_DOC"; then
48
+ record_pass "phase-4-review.md points to the feature doc"
49
+ else
50
+ record_fail "phase-4-review.md must reference refs/features/verify-by-test.md"
51
+ fi
52
+ if grep -qF "review.verify_by_test" "$PHASE4_DOC"; then
53
+ record_pass "Step 3.7 emits review.verify_by_test telemetry"
54
+ else
55
+ record_fail "Step 3.7 must document the review.verify_by_test metric"
56
+ fi
57
+ fi
58
+
59
+ # 2. Feature doc exists with verdict + handoff coverage
60
+ if [ ! -f "$FEATURE_DOC" ]; then
61
+ record_fail "refs/features/verify-by-test.md missing"
62
+ else
63
+ for token in "not-reproduced" "inconclusive" "redTests" "Off by default"; do
64
+ if grep -qF "$token" "$FEATURE_DOC"; then
65
+ record_pass "feature doc covers '$token'"
66
+ else
67
+ record_fail "feature doc missing '$token'"
68
+ fi
69
+ done
70
+ fi
71
+
72
+ # 3. Prefs schema exposes verifyByTest knobs
73
+ for prop in enabled maxFindings model stepTimeoutSec; do
74
+ if jq -e ".properties.global.properties.verifyByTest.properties.${prop}" "$PREFS_SCHEMA" >/dev/null 2>&1; then
75
+ record_pass "prefs schema exposes verifyByTest.${prop}"
76
+ else
77
+ record_fail "prefs schema missing verifyByTest.${prop}"
78
+ fi
79
+ done
80
+
81
+ # 4. Off by default - preserves existing-user baseline
82
+ if jq -e '.properties.global.properties.verifyByTest.properties.enabled
83
+ | has("default") and .default == false' "$PREFS_SCHEMA" >/dev/null 2>&1; then
84
+ record_pass "verifyByTest.enabled defaults to false (opt-in)"
85
+ else
86
+ record_fail "verifyByTest.enabled must default to false"
87
+ fi
88
+
89
+ # 5. Triage schema version + verification enum
90
+ schema_version=$(jq -r '.version // empty' "$TRIAGE_SCHEMA" 2>/dev/null || true)  # tolerate a missing/invalid schema: report it as a FAIL below instead of aborting under set -e
91
+ if [ "$schema_version" = "3.2.0" ]; then
92
+ record_pass "triage-output schema version is 3.2.0"
93
+ else
94
+ record_fail "triage-output schema version should be 3.2.0 (was: ${schema_version:-missing})"
95
+ fi
96
+ if jq -e '.["$defs"].verification.properties.result.enum
97
+ | (index("confirmed") != null
98
+ and index("not-reproduced") != null
99
+ and index("inconclusive") != null)' "$TRIAGE_SCHEMA" >/dev/null 2>&1; then
100
+ record_pass "schema verification.result enum complete"
101
+ else
102
+ record_fail "schema \$defs.verification.result enum must be confirmed/not-reproduced/inconclusive"
103
+ fi
104
+
105
+ # 6. Behavioral validator round-trips
106
+ valid_fixture='{"accepted":[{"severity":"blocking","file":"Sources/Auth/Login.swift","line":42,"issue":"expired token accepted as valid","fix":"reject tokens past expiry in validateToken()","reviewer":"fable","verification":{"result":"confirmed","testRef":"AuthTests/LoginTests/testExpiredTokenRejected","evidencePath":".pipeline/verify-1.test.log"}}],"deferred":[],"rejected":[],"approved":false}'
107
+ if printf '%s' "$valid_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
108
+ record_pass "validator accepts confirmed verification with testRef+evidencePath"
109
+ else
110
+ record_fail "validator rejected a valid confirmed verification"
111
+ fi
112
+
113
+ bad_result_fixture='{"accepted":[{"severity":"blocking","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"fable","verification":{"result":"maybe"}}],"deferred":[],"rejected":[],"approved":false}'
114
+ if printf '%s' "$bad_result_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
115
+ record_fail "validator must reject verification.result 'maybe'"
116
+ else
117
+ record_pass "validator rejects bad verification.result"
118
+ fi
119
+
120
+ missing_ref_fixture='{"accepted":[{"severity":"blocking","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"fable","verification":{"result":"confirmed"}}],"deferred":[],"rejected":[],"approved":false}'
121
+ if printf '%s' "$missing_ref_fixture" | node "$VALIDATOR" - >/dev/null 2>&1; then
122
+ record_fail "validator must reject confirmed verification without testRef/evidencePath"
123
+ else
124
+ record_pass "validator rejects confirmed verification lacking testRef/evidencePath"
125
+ fi
126
+
127
+ # Reviewer enum parity: fable (Claude Code default) and gpt (Copilot CLI) must BOTH be accepted; %s is filled in per reviewer
128
+ reviewer_fixture='{"accepted":[{"severity":"important","file":"a.swift","line":1,"issue":"some issue here","fix":"do the fix","reviewer":"%s"}],"deferred":[],"rejected":[],"approved":true}'
129
+ if printf "$reviewer_fixture" fable | node "$VALIDATOR" - >/dev/null 2>&1 && printf "$reviewer_fixture" gpt | node "$VALIDATOR" - >/dev/null 2>&1; then
130
+ record_pass "validator accepts schema-allowed reviewers (fable/gpt)"
131
+ else
132
+ record_fail "validator must accept reviewer values fable and gpt (schema v3.2.0 parity)"
133
+ fi
134
+
135
+ # 7. Phase 3 doc documents the red-test rework re-entry
136
+ if grep -qF "verifyByTest.redTests" "$PHASE3_DOC"; then
137
+ record_pass "phase-3-dev.md documents redTests rework re-entry"
138
+ else
139
+ record_fail "phase-3-dev.md must document verifyByTest.redTests re-entry"
140
+ fi
141
+
142
+ printf '\n══ verify-by-test smoke: %d passed, %d failed ══\n' "$pass" "$fail"
143
+ if [ "$fail" -gt 0 ]; then
144
+ printf '\nFailures:\n'
145
+ for msg in "${failures[@]}"; do printf ' - %s\n' "$msg"; done
146
+ exit 1
147
+ fi
148
+ exit 0