@event4u/agent-config 2.11.0 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/.agent-src/commands/council/analysis.md +142 -0
  2. package/.agent-src/commands/council/debate.md +129 -0
  3. package/.agent-src/commands/council/default.md +8 -0
  4. package/.agent-src/commands/council/design.md +16 -12
  5. package/.agent-src/commands/council/optimize.md +16 -15
  6. package/.agent-src/commands/council/pr.md +12 -12
  7. package/.agent-src/commands/council.md +48 -2
  8. package/.agent-src/personas/advisors/contrarian.md +95 -0
  9. package/.agent-src/personas/advisors/executor.md +99 -0
  10. package/.agent-src/personas/advisors/expansionist.md +98 -0
  11. package/.agent-src/personas/advisors/first-principles.md +98 -0
  12. package/.agent-src/personas/advisors/outsider.md +102 -0
  13. package/.agent-src/rules/copilot-routing.md +19 -0
  14. package/.agent-src/rules/devcontainer-routing.md +20 -0
  15. package/.agent-src/rules/laravel-routing.md +20 -0
  16. package/.agent-src/rules/symfony-routing.md +20 -0
  17. package/.agent-src/skills/ai-council/SKILL.md +180 -2
  18. package/.agent-src/skills/canvas-design/SKILL.md +132 -0
  19. package/.agent-src/skills/canvas-design/evals/triggers.json +16 -0
  20. package/.agent-src/skills/copilot-config/SKILL.md +1 -1
  21. package/.agent-src/skills/devcontainer/SKILL.md +1 -1
  22. package/.agent-src/skills/doc-coauthoring/SKILL.md +129 -0
  23. package/.agent-src/skills/doc-coauthoring/evals/triggers.json +16 -0
  24. package/.agent-src/skills/laravel/SKILL.md +1 -1
  25. package/.agent-src/skills/project-analysis-core/SKILL.md +1 -1
  26. package/.agent-src/skills/project-analyzer/SKILL.md +1 -1
  27. package/.agent-src/skills/skill-writing/SKILL.md +101 -16
  28. package/.agent-src/skills/sql-writing/SKILL.md +1 -1
  29. package/.agent-src/skills/symfony-workflow/SKILL.md +1 -1
  30. package/.agent-src/skills/universal-project-analysis/SKILL.md +1 -1
  31. package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
  32. package/.claude-plugin/marketplace.json +5 -1
  33. package/AGENTS.md +1 -1
  34. package/CHANGELOG.md +78 -0
  35. package/CONTRIBUTING.md +5 -0
  36. package/README.md +3 -3
  37. package/config/agent-settings.template.yml +5 -84
  38. package/docs/architecture/multi-tool-projection.md +53 -0
  39. package/docs/architecture/{compression.md → source-projection.md} +21 -3
  40. package/docs/architecture.md +6 -6
  41. package/docs/catalog.md +21 -11
  42. package/docs/contracts/adr-architectural-consensus-mechanism.md +67 -0
  43. package/docs/contracts/adr-level-6-productization.md +2 -2
  44. package/docs/contracts/ai-council-config.md +186 -0
  45. package/docs/contracts/command-clusters.md +57 -1
  46. package/docs/contracts/multi-tool-projection-fidelity.md +109 -0
  47. package/docs/getting-started.md +2 -2
  48. package/package.json +1 -1
  49. package/scripts/_archive/README.md +59 -0
  50. package/scripts/ai_council/_default_prices.py +10 -1
  51. package/scripts/ai_council/advisors.py +148 -0
  52. package/scripts/ai_council/clients.py +189 -4
  53. package/scripts/ai_council/config.py +368 -0
  54. package/scripts/ai_council/consensus.py +290 -0
  55. package/scripts/ai_council/orchestrator.py +634 -16
  56. package/scripts/ai_council/prompts.py +335 -0
  57. package/scripts/check_compressed_paths.py +6 -1
  58. package/scripts/check_references.py +25 -0
  59. package/scripts/ci_time_ratio.py +168 -0
  60. package/scripts/council_cli.py +1007 -32
  61. package/scripts/measure_projection_bytes.py +159 -0
  62. package/scripts/measure_roadmap_trajectory.py +112 -0
  63. package/scripts/probe_projection_fidelity.py +202 -0
  64. package/scripts/run_skill_evals.py +185 -0
  65. package/scripts/schemas/skill.schema.json +4 -0
  66. package/scripts/score_skill_selection.py +198 -0
  67. package/scripts/skill_collision_clusters.py +162 -0
  68. package/scripts/skill_linter.py +71 -1
  69. package/scripts/{_backfill_skill_domains.py → _archive/_backfill_skill_domains.py} +0 -0
  70. package/scripts/{_bootstrap_tier_frontmatter.py → _archive/_bootstrap_tier_frontmatter.py} +0 -0
  71. package/scripts/{_p43_bodies.py → _archive/_p43_bodies.py} +0 -0
  72. package/scripts/{_p43_compress.py → _archive/_p43_compress.py} +0 -0
  73. package/scripts/{_p4_migrate.py → _archive/_p4_migrate.py} +0 -0
  74. package/scripts/{_phase2_shim_helper.py → _archive/_phase2_shim_helper.py} +0 -0
  75. package/scripts/{_pilot_council_question.py → _archive/_pilot_council_question.py} +0 -0
@@ -27,7 +27,23 @@ from scripts.ai_council.budget_guard import (
27
27
  today_spend_usd as _today_spend_usd,
28
28
  would_exceed as _would_exceed_daily,
29
29
  )
30
- from scripts.ai_council.clients import CouncilResponse, ExternalAIClient
30
+ from scripts.ai_council.clients import (
31
+ DEFAULT_MAX_TOKENS,
32
+ CouncilResponse,
33
+ ExternalAIClient,
34
+ )
35
+ from scripts.ai_council.consensus import (
36
+ ConsensusBucket,
37
+ ConsensusMetadata,
38
+ Finding,
39
+ FindingScore,
40
+ aggregate_scores,
41
+ anonymize_findings,
42
+ anonymize_responses,
43
+ bucket_by_threshold,
44
+ parse_findings_response,
45
+ parse_scores_response,
46
+ )
31
47
  from scripts.ai_council.pricing import (
32
48
  CostEstimate,
33
49
  PriceTable,
@@ -35,7 +51,16 @@ from scripts.ai_council.pricing import (
35
51
  estimate_input_tokens,
36
52
  )
37
53
  from scripts.ai_council.project_context import ProjectContext
38
- from scripts.ai_council.prompts import system_prompt_for
54
+ from scripts.ai_council.advisors import AdvisorPlan
55
+ from scripts.ai_council.prompts import (
56
+ advisor_system_prompt,
57
+ build_extraction_user_prompt,
58
+ build_peer_review_user_prompt,
59
+ build_scoring_user_prompt,
60
+ peer_review_synthesis_addendum,
61
+ synthesis_template,
62
+ system_prompt_for,
63
+ )
39
64
 
40
65
 
41
66
  @dataclass
@@ -51,7 +76,7 @@ class CostBudget:
51
76
  class CouncilQuestion:
52
77
  mode: str # one of: prompt, roadmap, diff, files
53
78
  user_prompt: str # bundled artefact text
54
- max_tokens: int = 1024
79
+ max_tokens: int = DEFAULT_MAX_TOKENS
55
80
 
56
81
 
57
82
  @dataclass
@@ -81,21 +106,41 @@ def estimate(
81
106
  *,
82
107
  project: ProjectContext | None = None,
83
108
  original_ask: str = "",
109
+ advisor_plans: dict[str, AdvisorPlan] | None = None,
84
110
  ) -> list[CostEstimate]:
85
111
  """Return a pre-call cost estimate per member, in input order.
86
112
 
87
113
  `project` and `original_ask` are passed through to
88
114
  `system_prompt_for()` so the estimate covers the handoff preamble
89
115
  bytes too. Both default to v1-shape (no preamble extension).
116
+
117
+ `advisor_plans` (Phase 6) — when a member's name has a plan, the
118
+ estimate uses the advisor persona system prompt (typically larger
119
+ than the bare mode addendum). The cost estimator must mirror
120
+ `_run_round` exactly so the pre-call preview never under-states
121
+ the advisor-mode bill.
90
122
  """
91
- sys_prompt = system_prompt_for(
123
+ plans = advisor_plans or {}
124
+ base_user_tokens = estimate_input_tokens(question.user_prompt)
125
+ base_sys = system_prompt_for(
92
126
  question.mode, project=project, original_ask=original_ask,
93
127
  )
94
- input_tokens = estimate_input_tokens(question.user_prompt) + estimate_input_tokens(sys_prompt)
95
- return [
96
- estimate_cost(m.name, m.model, input_tokens, question.max_tokens, table)
97
- for m in members
98
- ]
128
+ base_sys_tokens = estimate_input_tokens(base_sys)
129
+ estimates: list[CostEstimate] = []
130
+ for m in members:
131
+ plan = plans.get(m.name)
132
+ if plan is None:
133
+ sys_tokens = base_sys_tokens
134
+ else:
135
+ sys_prompt = advisor_system_prompt(
136
+ plan.persona_text, project=project, original_ask=original_ask,
137
+ )
138
+ sys_tokens = estimate_input_tokens(sys_prompt)
139
+ input_tokens = base_user_tokens + sys_tokens
140
+ estimates.append(
141
+ estimate_cost(m.name, m.model, input_tokens, question.max_tokens, table),
142
+ )
143
+ return estimates
99
144
 
100
145
 
101
146
  def consult(
@@ -109,6 +154,7 @@ def consult(
109
154
  original_ask: str = "",
110
155
  rounds: int = 1,
111
156
  on_round_complete: Callable[[int, list[CouncilResponse]], None] | None = None,
157
+ advisor_plans: dict[str, AdvisorPlan] | None = None,
112
158
  ) -> list[CouncilResponse]:
113
159
  """Sequentially fan out `question` to every enabled member.
114
160
 
@@ -129,6 +175,9 @@ def consult(
129
175
  accumulate across rounds. Returns the FINAL round's responses;
130
176
  use `on_round_complete(round_idx, responses)` to capture
131
177
  intermediate rounds.
178
+ - `advisor_plans` (Phase 6) keyed by provider name swaps the
179
+ member's system prompt for the advisor persona via
180
+ `advisor_system_prompt()`. Replace-mode: no extra calls.
132
181
  """
133
182
  if rounds < 1:
134
183
  raise ValueError(f"rounds must be >= 1 (got {rounds})")
@@ -158,6 +207,7 @@ def consult(
158
207
  members, round_question, budget, spent,
159
208
  table=table, on_overrun=on_overrun,
160
209
  project=project, original_ask=original_ask,
210
+ advisor_plans=advisor_plans,
161
211
  )
162
212
  if on_round_complete is not None:
163
213
  on_round_complete(round_idx, last_results)
@@ -179,14 +229,29 @@ def _run_round(
179
229
  on_overrun: OnOverrunCallback | None,
180
230
  project: ProjectContext | None,
181
231
  original_ask: str,
232
+ advisor_plans: dict[str, AdvisorPlan] | None = None,
182
233
  ) -> list[CouncilResponse]:
183
234
  """Run a single round; mutate `spent` with cumulative totals."""
184
- system_prompt = system_prompt_for(
235
+ plans = advisor_plans or {}
236
+ base_system_prompt = system_prompt_for(
185
237
  question.mode, project=project, original_ask=original_ask,
186
238
  )
239
+
240
+ def _system_prompt_for_member(m: ExternalAIClient) -> str:
241
+ plan = plans.get(m.name)
242
+ if plan is None:
243
+ return base_system_prompt
244
+ return advisor_system_prompt(
245
+ plan.persona_text, project=project, original_ask=original_ask,
246
+ )
247
+
187
248
  results: list[CouncilResponse] = []
188
249
  estimates = (
189
- estimate(question, members, table, project=project, original_ask=original_ask)
250
+ estimate(
251
+ question, members, table,
252
+ project=project, original_ask=original_ask,
253
+ advisor_plans=advisor_plans,
254
+ )
190
255
  if table is not None
191
256
  else None
192
257
  )
@@ -198,7 +263,10 @@ def _run_round(
198
263
  # observability, but no projection / budget breach can apply.
199
264
  if not getattr(member, "billable", True):
200
265
  try:
201
- response = member.ask(system_prompt, question.user_prompt, question.max_tokens)
266
+ response = member.ask(
267
+ _system_prompt_for_member(member),
268
+ question.user_prompt, question.max_tokens,
269
+ )
202
270
  except Exception as exc: # noqa: BLE001 - last-resort safety net
203
271
  response = CouncilResponse(
204
272
  provider=member.name, model=member.model, text="",
@@ -261,7 +329,10 @@ def _run_round(
261
329
 
262
330
  # ── actual call ──────────────────────────────────────────────
263
331
  try:
264
- response = member.ask(system_prompt, question.user_prompt, question.max_tokens)
332
+ response = member.ask(
333
+ _system_prompt_for_member(member),
334
+ question.user_prompt, question.max_tokens,
335
+ )
265
336
  except Exception as exc: # noqa: BLE001 - last-resort safety net
266
337
  response = CouncilResponse(
267
338
  provider=member.name, model=member.model, text="",
@@ -333,9 +404,491 @@ def _augment_for_next_round(
333
404
  )
334
405
 
335
406
 
336
- def render(responses: list[CouncilResponse]) -> str:
337
- """Render stacked sections + a Convergence/Divergence summary slot."""
407
+ @dataclass
408
+ class DebateCheckpoint:
409
+ """Snapshot passed to the continue-prompt callback between rounds.
410
+
411
+ Phase 7 progressive-disclosure contract — the orchestrator pauses
412
+ after each completed round, builds this checkpoint, and asks the
413
+ caller whether to continue. Returning False stops the debate
414
+ gracefully (caller receives every completed round).
415
+ """
416
+
417
+ completed_round: int # 1-based index of the round just finished
418
+ total_planned_rounds: int
419
+ cost_so_far_usd: float
420
+ next_round_estimate_usd: float
421
+ last_round_responses: list[CouncilResponse]
422
+
423
+
424
+ class DebateCapExceeded(RuntimeError):
425
+ """Raised when projected next-round spend would breach the budget cap.
426
+
427
+ The CLI catches this *after* writing the partial artefact, so the
428
+ user always has a recoverable trail of the rounds that completed
429
+ before the cap fired.
430
+ """
431
+
432
+ def __init__(
433
+ self, *,
434
+ completed_round: int,
435
+ cost_so_far: float,
436
+ next_estimate: float,
437
+ cap: float,
438
+ ) -> None:
439
+ self.completed_round = completed_round
440
+ self.cost_so_far = cost_so_far
441
+ self.next_estimate = next_estimate
442
+ self.cap = cap
443
+ super().__init__(
444
+ f"Debate hard-cap: round {completed_round + 1} would push spend "
445
+ f"to ${cost_so_far + next_estimate:.4f} (cap=${cap:.4f}); "
446
+ f"stopping after round {completed_round}."
447
+ )
448
+
449
+
450
+ # Continue-prompt callback. Receives a DebateCheckpoint, returns True to
451
+ # proceed with the next round, False to stop gracefully.
452
+ DebateContinuePrompt = Callable[[DebateCheckpoint], bool]
453
+
454
+
455
+ def _augment_for_debate_round(
456
+ original_prompt: str,
457
+ prior_responses: list[CouncilResponse],
458
+ next_round_number: int,
459
+ ) -> str:
460
+ """Build the round-N user prompt for a debate — rebuttal framing.
461
+
462
+ Same anonymisation rules as `_augment_for_next_round` (Iron Law of
463
+ Neutrality § multi-round): provider/model identifiers stripped,
464
+ "Reviewer A / B / C…" labels assigned in input order, errors
465
+ skipped. The instruction block is debate-specific: each reviewer
466
+ is asked to identify the strongest opposing position and write a
467
+ rebuttal, NOT to find common ground.
468
+ """
469
+ blocks: list[str] = []
470
+ label_idx = 0
471
+ for r in prior_responses:
472
+ if r.error or not r.text.strip():
473
+ continue
474
+ label = chr(ord("A") + label_idx)
475
+ label_idx += 1
476
+ blocks.append(f"### Reviewer {label}\n\n{r.text.strip()}")
477
+ if not blocks:
478
+ return original_prompt
479
+ prior_block = "\n\n".join(blocks)
480
+ return (
481
+ f"{original_prompt}\n\n"
482
+ f"---\n\n"
483
+ f"## Prior round positions (round {next_round_number - 1})\n\n"
484
+ f"You are now in round {next_round_number} of a structured\n"
485
+ f"debate. Below are anonymised positions from independent\n"
486
+ f"reviewers in the previous round. You do NOT know which model\n"
487
+ f"produced which position.\n\n"
488
+ f"Identify the SINGLE strongest opposing position and write a\n"
489
+ f"rebuttal addressed at its strongest steel-manned form. Do NOT\n"
490
+ f"search for common ground — name the load-bearing flaw the\n"
491
+ f"opposing reviewer missed and state the evidence behind your\n"
492
+ f"counter-position.\n\n"
493
+ f"{prior_block}"
494
+ )
495
+
496
+
497
+ def run_debate(
498
+ members: list[ExternalAIClient],
499
+ question: CouncilQuestion,
500
+ *,
501
+ budget: CostBudget | None = None,
502
+ table: PriceTable | None = None,
503
+ on_overrun: OnOverrunCallback | None = None,
504
+ project: ProjectContext | None = None,
505
+ original_ask: str = "",
506
+ max_rounds: int = 2,
507
+ on_round_complete: Callable[[int, list[CouncilResponse]], None] | None = None,
508
+ on_continue: DebateContinuePrompt | None = None,
509
+ advisor_plans: dict[str, AdvisorPlan] | None = None,
510
+ seed_round_1: list[CouncilResponse] | None = None,
511
+ ) -> list[list[CouncilResponse]]:
512
+ """Run a structured multi-round debate with progressive disclosure.
513
+
514
+ Returns every completed round in order — caller persists each
515
+ round incrementally via `on_round_complete` for crash safety.
516
+
517
+ Round 1: each member produces an initial position. When
518
+ `seed_round_1` is provided, it is reused verbatim (no calls) so
519
+ `/council debate --continue-as-debate` can pivot from an existing
520
+ `/council default` session.
521
+
522
+ Round 2+: `_augment_for_debate_round` wraps the original prompt
523
+ with anonymised prior positions and asks each member for a
524
+ rebuttal addressed at the strongest opposing view.
525
+
526
+ Between rounds: `on_continue(checkpoint)` is consulted. Returning
527
+ False stops the debate; the caller receives every completed round.
528
+ `None` (the default) auto-continues — the CLI wires its
529
+ interactive y/N prompt here, `--auto-continue` passes `None`.
530
+
531
+ Hard cap: before kicking off round N+1, the orchestrator compares
532
+ `spent_usd + next_round_estimate` to `budget.max_total_usd`. A
533
+ projected breach raises `DebateCapExceeded`; the CLI catches it
534
+ after persisting the partial debate.
535
+ """
536
+ if max_rounds < 1:
537
+ raise ValueError(f"max_rounds must be >= 1 (got {max_rounds})")
538
+ if not members:
539
+ return []
540
+ budget = budget or CostBudget()
541
+ if len(members) > budget.max_calls:
542
+ raise ValueError(
543
+ f"Debate has {len(members)} members but budget caps at "
544
+ f"{budget.max_calls} calls."
545
+ )
546
+
547
+ spent: dict[str, float] = {"input": 0, "output": 0, "usd": 0.0}
548
+ all_rounds: list[list[CouncilResponse]] = []
549
+ current_user_prompt = question.user_prompt
550
+
551
+ for round_idx in range(max_rounds):
552
+ round_number = round_idx + 1
553
+ if round_idx == 0 and seed_round_1 is not None:
554
+ # Pivot from /council default — reuse the existing round 1
555
+ # verbatim. No calls billed; spend stays at $0 until round 2.
556
+ results = list(seed_round_1)
557
+ else:
558
+ round_question = (
559
+ question if round_idx == 0
560
+ else CouncilQuestion(
561
+ mode=question.mode,
562
+ user_prompt=current_user_prompt,
563
+ max_tokens=question.max_tokens,
564
+ )
565
+ )
566
+ results = _run_round(
567
+ members, round_question, budget, spent,
568
+ table=table, on_overrun=on_overrun,
569
+ project=project, original_ask=original_ask,
570
+ advisor_plans=advisor_plans,
571
+ )
572
+
573
+ all_rounds.append(results)
574
+ if on_round_complete is not None:
575
+ on_round_complete(round_number, results)
576
+
577
+ # Prep the user-prompt for the next round so the cost estimate
578
+ # below covers the augmented bytes.
579
+ if round_idx + 1 < max_rounds:
580
+ current_user_prompt = _augment_for_debate_round(
581
+ question.user_prompt, results, round_number + 1,
582
+ )
583
+ # Hard-cap + continue-prompt gating before kicking off N+1.
584
+ if table is not None:
585
+ next_question = CouncilQuestion(
586
+ mode=question.mode,
587
+ user_prompt=current_user_prompt,
588
+ max_tokens=question.max_tokens,
589
+ )
590
+ next_estimates = estimate(
591
+ next_question, members, table,
592
+ project=project, original_ask=original_ask,
593
+ advisor_plans=advisor_plans,
594
+ )
595
+ next_round_usd = sum(e.total_usd for e in next_estimates)
596
+ else:
597
+ next_round_usd = 0.0
598
+
599
+ if (
600
+ budget.max_total_usd > 0
601
+ and spent["usd"] + next_round_usd > budget.max_total_usd
602
+ ):
603
+ raise DebateCapExceeded(
604
+ completed_round=round_number,
605
+ cost_so_far=spent["usd"],
606
+ next_estimate=next_round_usd,
607
+ cap=budget.max_total_usd,
608
+ )
609
+
610
+ if on_continue is not None:
611
+ checkpoint = DebateCheckpoint(
612
+ completed_round=round_number,
613
+ total_planned_rounds=max_rounds,
614
+ cost_so_far_usd=spent["usd"],
615
+ next_round_estimate_usd=next_round_usd,
616
+ last_round_responses=results,
617
+ )
618
+ if not on_continue(checkpoint):
619
+ return all_rounds
620
+
621
+ return all_rounds
622
+
623
+
624
+ @dataclass
625
+ class PeerReviewResult:
626
+ """Bundle returned by `run_peer_review()` (Phase 5 / F1).
627
+
628
+ `responses` carries the per-reviewer critiques. `label_to_source`
629
+ is the anonymisation map captured server-side so the audit-trail
630
+ JSON can rehydrate it without leaking provider identity to the
631
+ member at prompt time.
632
+
633
+ `persona_labels` is the (optional) Phase 6 / Step 3a wiring: when
634
+ the deliberation was an advisor-mode run, the source → persona
635
+ map flows through to the renderer so peer-review output can render
636
+ as `Response A (Contrarian)`. Plain-member runs leave it empty.
637
+ """
638
+
639
+ responses: list[CouncilResponse]
640
+ label_to_source: dict[str, str]
641
+ persona_labels: dict[str, str]
642
+
643
+
644
+ def run_peer_review(
645
+ members: list[ExternalAIClient],
646
+ deliberation_responses: list[CouncilResponse],
647
+ *,
648
+ budget: CostBudget | None = None,
649
+ table: PriceTable | None = None,
650
+ on_overrun: OnOverrunCallback | None = None,
651
+ project: ProjectContext | None = None,
652
+ original_ask: str = "",
653
+ max_tokens: int = DEFAULT_MAX_TOKENS,
654
+ persona_labels: dict[str, str] | None = None,
655
+ ) -> PeerReviewResult:
656
+ """Karpathy peer-review pass (Phase 5 / F1).
657
+
658
+ After the final deliberation round, each member sees the OTHER
659
+ members' deliberation outputs under neutral `Response-A` labels
660
+ (provider identity stripped; advisor persona labels preserved per
661
+ Phase 6 Step 3a) and emits a Karpathy-style critique:
662
+ strongest / weakest blind spot / what all missed / refinement.
663
+
664
+ Members never see their own response — the orchestrator filters
665
+ self before building the anonymised prompt. Errors in one member's
666
+ pass tag that member but never abort the round.
667
+
668
+ Cost gates flow through `consult([member], ...)`, so the same
669
+ budget + daily-ledger semantics as deliberation apply.
670
+ """
671
+ if not members or not deliberation_responses:
672
+ return PeerReviewResult(
673
+ responses=[], label_to_source={}, persona_labels={},
674
+ )
675
+
676
+ member_by_name = {m.name: m for m in members}
677
+ # ── source map: deliberation responses keyed by `provider:model` ─
678
+ # Errors and empty bodies are skipped — they leak nothing useful
679
+ # and would clutter the anonymised prompt with blanks.
680
+ by_source: dict[str, CouncilResponse] = {}
681
+ for r in deliberation_responses:
682
+ if r.error or not r.text.strip():
683
+ continue
684
+ source = f"{r.provider}:{r.model}"
685
+ by_source[source] = r
686
+
687
+ if len(by_source) < 2:
688
+ # Peer-review needs ≥ 2 distinct deliberation outputs (a
689
+ # reviewer with nothing else to review is a no-op).
690
+ return PeerReviewResult(
691
+ responses=[], label_to_source={}, persona_labels={},
692
+ )
693
+
694
+ persona_labels = dict(persona_labels or {})
695
+ review_responses: list[CouncilResponse] = []
696
+ # ── final label_to_source map captured from the LAST member call
697
+ # so the renderer / JSON dump has the deterministic A/B mapping.
698
+ # Each member sees a different N-1 subset (self filtered), but the
699
+ # ordering of `by_source` stays stable, so the label assignment is
700
+ # deterministic per artefact run.
701
+ last_label_to_source: dict[str, str] = {}
702
+
703
+ for reviewer in members:
704
+ scorer = f"{reviewer.name}:{reviewer.model}"
705
+ if reviewer.name not in member_by_name:
706
+ continue
707
+ others_pairs = [
708
+ (src, resp.text) for src, resp in by_source.items() if src != scorer
709
+ ]
710
+ if len(others_pairs) == 0:
711
+ continue
712
+ anon_text, label_to_source = anonymize_responses(
713
+ others_pairs, persona_labels=persona_labels,
714
+ )
715
+ if not anon_text:
716
+ continue
717
+ last_label_to_source = label_to_source
718
+ question = CouncilQuestion(
719
+ mode="prompt",
720
+ user_prompt=build_peer_review_user_prompt(anon_text),
721
+ max_tokens=max_tokens,
722
+ )
723
+ reviewed = consult(
724
+ [reviewer], question,
725
+ budget=budget, table=table, on_overrun=on_overrun,
726
+ project=project, original_ask=original_ask,
727
+ )
728
+ review_responses.extend(reviewed)
729
+
730
+ return PeerReviewResult(
731
+ responses=review_responses,
732
+ label_to_source=last_label_to_source,
733
+ persona_labels=persona_labels,
734
+ )
735
+
736
+
737
+ @dataclass
738
+ class ConsensusResult:
739
+ """Bundle returned by `run_consensus_scoring()`.
740
+
741
+ `bucket` is renderer-ready; `findings`, `scores`, and `metadata`
742
+ are kept for audit-trail JSON (council-sessions/*.json).
743
+ """
744
+
745
+ bucket: ConsensusBucket
746
+ findings: list[Finding]
747
+ scores: list[FindingScore]
748
+ metadata: dict[str, ConsensusMetadata]
749
+ extraction_responses: list[CouncilResponse]
750
+ scoring_responses: list[CouncilResponse]
751
+
752
+
753
+ def run_consensus_scoring(
754
+ members: list[ExternalAIClient],
755
+ deliberation_responses: list[CouncilResponse],
756
+ *,
757
+ budget: CostBudget | None = None,
758
+ table: PriceTable | None = None,
759
+ on_overrun: OnOverrunCallback | None = None,
760
+ project: ProjectContext | None = None,
761
+ original_ask: str = "",
762
+ max_tokens: int = DEFAULT_MAX_TOKENS,
763
+ strong_threshold: float = 0.7,
764
+ minority_threshold: float = 0.4,
765
+ ) -> ConsensusResult:
766
+ """Two-pass consensus round (Phase 4 / F3).
767
+
768
+ Pass 1 — extraction: each member re-emits its own deliberation as
769
+ a JSON array of `{id, text}` findings. Pass 2 — scoring: each
770
+ member sees the *other* members' findings under anonymous labels
771
+ and rates them 1-10 + agree/disagree + reason.
772
+
773
+ The cost budget is shared across both passes; the daily ledger
774
+ receives both. Errors in one member's extraction or scoring tag
775
+ that member but never abort the round.
776
+ """
777
+ if not members or not deliberation_responses:
778
+ return ConsensusResult(
779
+ bucket=ConsensusBucket(), findings=[], scores=[], metadata={},
780
+ extraction_responses=[], scoring_responses=[],
781
+ )
782
+
783
+ # ── Pass 1: extraction ──────────────────────────────────────────
784
+ member_by_name = {m.name: m for m in members}
785
+ extraction_responses: list[CouncilResponse] = []
786
+ all_findings: list[Finding] = []
787
+ for resp in deliberation_responses:
788
+ member = member_by_name.get(resp.provider)
789
+ if member is None or resp.error or not resp.text.strip():
790
+ continue
791
+ question = CouncilQuestion(
792
+ mode="prompt",
793
+ user_prompt=build_extraction_user_prompt(resp.text),
794
+ max_tokens=max_tokens,
795
+ )
796
+ extracted = consult(
797
+ [member], question,
798
+ budget=budget, table=table, on_overrun=on_overrun,
799
+ project=project, original_ask=original_ask,
800
+ )
801
+ extraction_responses.extend(extracted)
802
+ if not extracted or extracted[0].error:
803
+ continue
804
+ source = f"{member.name}:{member.model}"
805
+ all_findings.extend(
806
+ parse_findings_response(extracted[0].text, source=source),
807
+ )
808
+
809
+ if not all_findings:
810
+ return ConsensusResult(
811
+ bucket=ConsensusBucket(), findings=[], scores=[], metadata={},
812
+ extraction_responses=extraction_responses, scoring_responses=[],
813
+ )
814
+
815
+ # ── Pass 2: scoring (each member rates the OTHERS' findings) ────
816
+ scoring_responses: list[CouncilResponse] = []
817
+ all_scores: list[FindingScore] = []
818
+ for member in members:
819
+ scorer = f"{member.name}:{member.model}"
820
+ others = [f for f in all_findings if f.source != scorer]
821
+ if not others:
822
+ continue
823
+ anon = anonymize_findings(others)
824
+ label_to_id = {label: f.id for label, f in anon.items()}
825
+ anon_text = {label: f.text for label, f in anon.items()}
826
+ question = CouncilQuestion(
827
+ mode="prompt",
828
+ user_prompt=build_scoring_user_prompt(anon_text),
829
+ max_tokens=max_tokens,
830
+ )
831
+ scored = consult(
832
+ [member], question,
833
+ budget=budget, table=table, on_overrun=on_overrun,
834
+ project=project, original_ask=original_ask,
835
+ )
836
+ scoring_responses.extend(scored)
837
+ if not scored or scored[0].error:
838
+ continue
839
+ for s in parse_scores_response(scored[0].text, scorer=scorer):
840
+ real_id = label_to_id.get(s.finding_id)
841
+ if real_id is None:
842
+ continue
843
+ all_scores.append(FindingScore(
844
+ finding_id=real_id, scorer=s.scorer, score=s.score,
845
+ agree=s.agree, reason=s.reason,
846
+ ))
847
+
848
+ metadata = aggregate_scores(all_findings, all_scores)
849
+ bucket = bucket_by_threshold(
850
+ all_findings, metadata,
851
+ strong=strong_threshold, minority=minority_threshold,
852
+ )
853
+ return ConsensusResult(
854
+ bucket=bucket, findings=all_findings, scores=all_scores,
855
+ metadata=metadata, extraction_responses=extraction_responses,
856
+ scoring_responses=scoring_responses,
857
+ )
858
+
859
+
860
+ def render(
861
+ responses: list[CouncilResponse],
862
+ *,
863
+ mode: str | None = None,
864
+ prose_synthesis: bool | None = None,
865
+ consensus: ConsensusResult | None = None,
866
+ peer_review: PeerReviewResult | None = None,
867
+ ) -> str:
868
+ """Render stacked sections + a lens-aware synthesis prompt slot.
869
+
870
+ `mode` selects the synthesis template from `prompts.synthesis_template`.
871
+ `None` collapses to the default decision-lens template (back-compat).
872
+
873
+ `prose_synthesis` is the R4 Q4 escape hatch:
874
+ - `True` → force creative-lens passthrough (bare slot) regardless of mode
875
+ - `False` → force decision-lens default template even on creative lenses
876
+ - `None` → honour the lens default from the table
877
+
878
+ `consensus` (Phase 4 / F3) prepends Strong Consensus / Findings /
879
+ Minority Views sections when the analysis lens scored its findings.
880
+
881
+ `peer_review` (Phase 5 / F1) appends a Peer-Review block listing
882
+ each member's critique (under Reviewer-A / Reviewer-B labels, in
883
+ member input order so the audit trail is deterministic) and
884
+ extends the synthesis template with the
885
+ `Peer-Review-Surfaced Blind Spots` addendum.
886
+ """
338
887
  blocks: list[str] = []
888
+ if consensus is not None and (
889
+ consensus.bucket.strong or consensus.bucket.findings or consensus.bucket.minority
890
+ ):
891
+ blocks.append(_render_consensus(consensus.bucket))
339
892
  for r in responses:
340
893
  header = f"## {r.provider} · {r.model}"
341
894
  if r.error:
@@ -346,5 +899,70 @@ def render(responses: list[CouncilResponse]) -> str:
346
899
  f"{r.latency_ms} ms*"
347
900
  )
348
901
  blocks.append(f"{header}\n\n{meta}\n\n{r.text}")
349
- blocks.append("## Convergence / Divergence\n\n*to be summarised by the host agent*")
902
+ if peer_review is not None and peer_review.responses:
903
+ blocks.append(_render_peer_review(peer_review))
904
+ if prose_synthesis is True:
905
+ template = ""
906
+ elif prose_synthesis is False:
907
+ template = synthesis_template("default")
908
+ else:
909
+ template = synthesis_template(mode)
910
+ if peer_review is not None and peer_review.responses:
911
+ addendum = peer_review_synthesis_addendum()
912
+ template = f"{template}\n{addendum}" if template else addendum.lstrip()
913
+ if template:
914
+ body = template
915
+ else:
916
+ body = "*to be summarised by the host agent*"
917
+ blocks.append(f"## Convergence / Divergence\n\n{body}")
350
918
  return "\n\n---\n\n".join(blocks)
919
+
920
+
921
+ def _render_peer_review(peer_review: PeerReviewResult) -> str:
922
+ """Render the peer-review block under deterministic Reviewer labels.
923
+
924
+ Each successful reviewer gets a `### Reviewer X` sub-section. Errors
925
+ keep their slot (so the audit trail still surfaces the breach) but
926
+ render `ERROR: <tag>` instead of the prompt body.
927
+ """
928
+ lines = ["## Peer-Review (Karpathy)"]
929
+ label_idx = 0
930
+ for r in peer_review.responses:
931
+ label = chr(ord("A") + label_idx)
932
+ label_idx += 1
933
+ if r.error:
934
+ lines.append(f"### Reviewer {label}\n\n*ERROR:* `{r.error}`")
935
+ continue
936
+ lines.append(f"### Reviewer {label}\n\n{r.text.strip()}")
937
+ return "\n\n".join(lines)
938
+
939
+
940
+ def _render_consensus(bucket: ConsensusBucket) -> str:
941
+ """Render Strong / Findings / Minority sections in renderer order."""
942
+ parts: list[str] = []
943
+ if bucket.strong:
944
+ parts.append("## Strong Consensus\n\n" + _render_bucket(bucket.strong))
945
+ if bucket.findings:
946
+ parts.append("## Findings\n\n" + _render_bucket(bucket.findings))
947
+ if bucket.minority:
948
+ parts.append(
949
+ "## Minority Views\n\n"
950
+ "*Sub-threshold by consensus; kept for audit trail.*\n\n"
951
+ + _render_bucket(bucket.minority)
952
+ )
953
+ return "\n\n".join(parts)
954
+
955
+
956
+ def _render_bucket(
957
+ items: list[tuple[Finding, ConsensusMetadata]],
958
+ ) -> str:
959
+ lines: list[str] = []
960
+ for f, m in items:
961
+ badge = (
962
+ f"strength {m.consensus_strength:.2f} · "
963
+ f"mean {m.mean_score:.1f}/10 · "
964
+ f"{len(m.scorers)} scorers · "
965
+ f"{m.dissent_count} dissent"
966
+ )
967
+ lines.append(f"- **{f.id}** — {f.text} \n _{badge}_")
968
+ return "\n".join(lines)