@event4u/agent-config 2.12.0 → 2.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/council/analysis.md +142 -0
- package/.agent-src/commands/council/debate.md +129 -0
- package/.agent-src/commands/council/default.md +8 -0
- package/.agent-src/commands/council/design.md +16 -12
- package/.agent-src/commands/council/optimize.md +16 -15
- package/.agent-src/commands/council/pr.md +12 -12
- package/.agent-src/commands/council.md +48 -2
- package/.agent-src/commands/memory/learn-low-impact.md +143 -0
- package/.agent-src/personas/advisors/contrarian.md +95 -0
- package/.agent-src/personas/advisors/executor.md +99 -0
- package/.agent-src/personas/advisors/expansionist.md +98 -0
- package/.agent-src/personas/advisors/first-principles.md +98 -0
- package/.agent-src/personas/advisors/outsider.md +102 -0
- package/.agent-src/rules/ask-when-uncertain.md +10 -6
- package/.agent-src/rules/copilot-routing.md +19 -0
- package/.agent-src/rules/devcontainer-routing.md +20 -0
- package/.agent-src/rules/external-reference-deep-dive.md +1 -1
- package/.agent-src/rules/fast-path-marker-visibility.md +38 -0
- package/.agent-src/rules/laravel-routing.md +20 -0
- package/.agent-src/rules/low-impact-corpus-privacy-floor.md +74 -0
- package/.agent-src/rules/symfony-routing.md +20 -0
- package/.agent-src/skills/ai-council/SKILL.md +388 -10
- package/.agent-src/skills/copilot-config/SKILL.md +1 -1
- package/.agent-src/skills/devcontainer/SKILL.md +1 -1
- package/.agent-src/skills/laravel/SKILL.md +1 -1
- package/.agent-src/skills/project-analysis-core/SKILL.md +1 -1
- package/.agent-src/skills/project-analyzer/SKILL.md +1 -1
- package/.agent-src/skills/symfony-workflow/SKILL.md +1 -1
- package/.agent-src/skills/universal-project-analysis/SKILL.md +1 -1
- package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
- package/.claude-plugin/marketplace.json +4 -1
- package/AGENTS.md +1 -1
- package/CHANGELOG.md +346 -124
- package/CONTRIBUTING.md +5 -0
- package/README.md +6 -6
- package/config/agent-settings.template.yml +5 -93
- package/config/gitignore-block.txt +6 -0
- package/docs/architecture/multi-tool-projection.md +53 -0
- package/docs/architecture/{compression.md → source-projection.md} +21 -3
- package/docs/architecture.md +15 -15
- package/docs/archive/CHANGELOG-pre-2.11.0.md +141 -0
- package/docs/catalog.md +25 -12
- package/docs/contracts/adr-architectural-consensus-mechanism.md +68 -0
- package/docs/contracts/adr-level-6-productization.md +7 -9
- package/docs/contracts/ai-council-config.md +658 -0
- package/docs/contracts/command-clusters.md +58 -2
- package/docs/contracts/command-surface-tiers.md +3 -2
- package/docs/contracts/cost-profile-defaults.md +5 -0
- package/docs/contracts/decision-engine-gates.md +5 -0
- package/docs/contracts/decision-trace-v1.md +2 -2
- package/docs/contracts/file-ownership-matrix.json +1735 -72
- package/docs/contracts/installed-tools-lockfile.md +2 -1
- package/docs/contracts/low-impact-corpus-format.md +95 -0
- package/docs/contracts/mcp-beta-criteria.md +6 -5
- package/docs/contracts/mcp-cloud-scope.md +5 -4
- package/docs/contracts/multi-tool-projection-fidelity.md +115 -0
- package/docs/contracts/release-trunk-sync.md +4 -3
- package/docs/contracts/tier-3-contrib-plugin.md +5 -6
- package/docs/getting-started.md +2 -2
- package/docs/guidelines/agent-infra/installed-tools-manifest.md +2 -1
- package/docs/installation.md +32 -0
- package/package.json +1 -1
- package/scripts/_archive/README.md +59 -0
- package/scripts/_cli/cmd_doctor.py +134 -0
- package/scripts/ai_council/_default_prices.py +10 -1
- package/scripts/ai_council/advisors.py +148 -0
- package/scripts/ai_council/airgap.py +165 -0
- package/scripts/ai_council/cli_hints.py +123 -0
- package/scripts/ai_council/clients.py +959 -5
- package/scripts/ai_council/compile_corpus.py +178 -0
- package/scripts/ai_council/confidence_gate.py +156 -0
- package/scripts/ai_council/config.py +1364 -0
- package/scripts/ai_council/consensus.py +329 -0
- package/scripts/ai_council/events_log.py +137 -0
- package/scripts/ai_council/learn_low_impact_preview.py +252 -0
- package/scripts/ai_council/low_impact.py +714 -0
- package/scripts/ai_council/low_impact_corpus.py +466 -0
- package/scripts/ai_council/low_impact_intake.py +163 -0
- package/scripts/ai_council/modes.py +6 -1
- package/scripts/ai_council/necessity.py +782 -0
- package/scripts/ai_council/orchestrator.py +872 -20
- package/scripts/ai_council/probation_gate.py +152 -0
- package/scripts/ai_council/prompts.py +335 -0
- package/scripts/ai_council/redact_low_impact_entry.py +155 -0
- package/scripts/ai_council/replay.py +155 -0
- package/scripts/ai_council/session.py +19 -1
- package/scripts/ai_council/shadow_dispatch.py +235 -0
- package/scripts/ai_council/solo_dispatch.py +226 -0
- package/scripts/audit_cloud_compatibility.py +74 -0
- package/scripts/audit_command_surface.py +363 -0
- package/scripts/check_compressed_paths.py +6 -1
- package/scripts/check_council_layout.py +11 -0
- package/scripts/ci_time_ratio.py +168 -0
- package/scripts/council_cli.py +2005 -30
- package/scripts/install.sh +12 -0
- package/scripts/measure_projection_bytes.py +159 -0
- package/scripts/measure_roadmap_trajectory.py +112 -0
- package/scripts/probe_projection_fidelity.py +202 -0
- package/scripts/score_skill_selection.py +198 -0
- package/scripts/skill_collision_clusters.py +162 -0
- /package/scripts/{_backfill_skill_domains.py → _archive/_backfill_skill_domains.py} +0 -0
- /package/scripts/{_bootstrap_tier_frontmatter.py → _archive/_bootstrap_tier_frontmatter.py} +0 -0
- /package/scripts/{_p43_bodies.py → _archive/_p43_bodies.py} +0 -0
- /package/scripts/{_p43_compress.py → _archive/_p43_compress.py} +0 -0
- /package/scripts/{_p4_migrate.py → _archive/_p4_migrate.py} +0 -0
- /package/scripts/{_phase2_shim_helper.py → _archive/_phase2_shim_helper.py} +0 -0
- /package/scripts/{_pilot_council_question.py → _archive/_pilot_council_question.py} +0 -0
|
@@ -20,7 +20,7 @@ CouncilResponse, never raise) is unchanged.
|
|
|
20
20
|
from __future__ import annotations
|
|
21
21
|
|
|
22
22
|
from dataclasses import dataclass
|
|
23
|
-
from typing import Callable
|
|
23
|
+
from typing import Any, Callable
|
|
24
24
|
|
|
25
25
|
from scripts.ai_council.budget_guard import (
|
|
26
26
|
record_spend as _record_daily_spend,
|
|
@@ -32,6 +32,18 @@ from scripts.ai_council.clients import (
|
|
|
32
32
|
CouncilResponse,
|
|
33
33
|
ExternalAIClient,
|
|
34
34
|
)
|
|
35
|
+
from scripts.ai_council.consensus import (
|
|
36
|
+
ConsensusBucket,
|
|
37
|
+
ConsensusMetadata,
|
|
38
|
+
Finding,
|
|
39
|
+
FindingScore,
|
|
40
|
+
aggregate_scores,
|
|
41
|
+
anonymize_findings,
|
|
42
|
+
anonymize_responses,
|
|
43
|
+
bucket_by_threshold,
|
|
44
|
+
parse_findings_response,
|
|
45
|
+
parse_scores_response,
|
|
46
|
+
)
|
|
35
47
|
from scripts.ai_council.pricing import (
|
|
36
48
|
CostEstimate,
|
|
37
49
|
PriceTable,
|
|
@@ -39,7 +51,16 @@ from scripts.ai_council.pricing import (
|
|
|
39
51
|
estimate_input_tokens,
|
|
40
52
|
)
|
|
41
53
|
from scripts.ai_council.project_context import ProjectContext
|
|
42
|
-
from scripts.ai_council.
|
|
54
|
+
from scripts.ai_council.advisors import AdvisorPlan
|
|
55
|
+
from scripts.ai_council.prompts import (
|
|
56
|
+
advisor_system_prompt,
|
|
57
|
+
build_extraction_user_prompt,
|
|
58
|
+
build_peer_review_user_prompt,
|
|
59
|
+
build_scoring_user_prompt,
|
|
60
|
+
peer_review_synthesis_addendum,
|
|
61
|
+
synthesis_template,
|
|
62
|
+
system_prompt_for,
|
|
63
|
+
)
|
|
43
64
|
|
|
44
65
|
|
|
45
66
|
@dataclass
|
|
@@ -78,6 +99,99 @@ class OverrunEvent:
|
|
|
78
99
|
OnOverrunCallback = Callable[[OverrunEvent], bool]
|
|
79
100
|
|
|
80
101
|
|
|
102
|
+
@dataclass(frozen=True)
class DebateCostEstimate:
    """Immutable pre-flight spend projection for a debate (Phase 8).

    The three top-level bounds are rolled up across every billable
    member and every round:

    * ``expected_usd`` - the per-round ``estimate()`` total multiplied
      by ``rounds`` (worst case: every member emits its full
      ``max_output_tokens``).
    * ``low_usd`` - output discounted to 25% of the ceiling, because
      most members do not exhaust their token budget.
    * ``high_usd`` - the expected figure plus a 20% over-run buffer,
      matching the roadmap's ±20% accuracy target.

    ``per_member`` holds one entry per billable member with the same
    bound triple plus the member's transport label (api / cli /
    manual). ``subscription_members`` lists the non-billable members so
    a disclosure block can show them as "covered by subscription"
    without adding them to the USD totals.
    """

    # Number of debate rounds the projection covers.
    rounds: int
    # Rolled-up lower / expected / upper spend bounds in USD.
    low_usd: float
    expected_usd: float
    high_usd: float
    # One dict per billable member (name, model, transport, bounds).
    per_member: list[dict[str, Any]]
    # One dict per non-billable member; never summed into USD totals.
    subscription_members: list[dict[str, str]]
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def estimate_debate_cost(
    question: CouncilQuestion,
    members: list[ExternalAIClient],
    table: PriceTable,
    *,
    rounds: int,
    project: ProjectContext | None = None,
    original_ask: str = "",
    advisor_plans: dict[str, AdvisorPlan] | None = None,
) -> DebateCostEstimate:
    """Project the total spend of a ``rounds``-round debate.

    Each billable member is priced once via :func:`estimate` and the
    per-round figure is multiplied by ``rounds``. Members whose
    ``billable`` attribute is falsy (CLI / manual transports) are kept
    out of every USD total and reported in ``subscription_members``
    instead, so the disclosure block can label them as covered by the
    user's flat-rate plan.

    Raises:
        ValueError: if ``rounds`` is smaller than 1.
    """
    if rounds < 1:
        raise ValueError(f"rounds must be >= 1 (got {rounds!r}).")

    # Split the roster in one pass: billable members get priced,
    # subscription members are only described.
    priced_members: list[ExternalAIClient] = []
    subscription_rows: list[dict[str, str]] = []
    for candidate in members:
        if getattr(candidate, "billable", True):
            priced_members.append(candidate)
            continue
        subscription_rows.append({
            "name": candidate.name,
            "model": candidate.model,
            "transport": getattr(candidate, "transport", "api"),
            "subscription_label": getattr(candidate, "subscription_label", ""),
        })

    round_estimates = estimate(
        question,
        priced_members,
        table,
        project=project,
        original_ask=original_ask,
        advisor_plans=advisor_plans,
    )

    def _floor_usd(single: CostEstimate) -> float:
        # Input cost plus 25% of the output ceiling: output rarely
        # reaches `max_output_tokens` (empirical floor from manual
        # debate traces).
        return single.input_usd + 0.25 * single.output_usd

    expected_total = sum(single.total_usd for single in round_estimates) * rounds
    low_total = sum(_floor_usd(single) for single in round_estimates) * rounds
    # +20% over-run buffer (roadmap ±20% accuracy target).
    high_total = expected_total * 1.20

    member_rows: list[dict[str, Any]] = []
    for participant, single in zip(priced_members, round_estimates):
        participant_expected = single.total_usd * rounds
        member_rows.append({
            "name": participant.name,
            "model": participant.model,
            "transport": getattr(participant, "transport", "api"),
            "low_usd": _floor_usd(single) * rounds,
            "expected_usd": participant_expected,
            "high_usd": participant_expected * 1.20,
        })

    return DebateCostEstimate(
        rounds=rounds,
        low_usd=low_total,
        expected_usd=expected_total,
        high_usd=high_total,
        per_member=member_rows,
        subscription_members=subscription_rows,
    )
|
|
193
|
+
|
|
194
|
+
|
|
81
195
|
def estimate(
|
|
82
196
|
question: CouncilQuestion,
|
|
83
197
|
members: list[ExternalAIClient],
|
|
@@ -85,21 +199,41 @@ def estimate(
|
|
|
85
199
|
*,
|
|
86
200
|
project: ProjectContext | None = None,
|
|
87
201
|
original_ask: str = "",
|
|
202
|
+
advisor_plans: dict[str, AdvisorPlan] | None = None,
|
|
88
203
|
) -> list[CostEstimate]:
|
|
89
204
|
"""Return a pre-call cost estimate per member, in input order.
|
|
90
205
|
|
|
91
206
|
`project` and `original_ask` are passed through to
|
|
92
207
|
`system_prompt_for()` so the estimate covers the handoff preamble
|
|
93
208
|
bytes too. Both default to v1-shape (no preamble extension).
|
|
209
|
+
|
|
210
|
+
`advisor_plans` (Phase 6) — when a member's name has a plan, the
|
|
211
|
+
estimate uses the advisor persona system prompt (typically larger
|
|
212
|
+
than the bare mode addendum). The cost estimator must mirror
|
|
213
|
+
`_run_round` exactly so the pre-call preview never under-states
|
|
214
|
+
the advisor-mode bill.
|
|
94
215
|
"""
|
|
95
|
-
|
|
216
|
+
plans = advisor_plans or {}
|
|
217
|
+
base_user_tokens = estimate_input_tokens(question.user_prompt)
|
|
218
|
+
base_sys = system_prompt_for(
|
|
96
219
|
question.mode, project=project, original_ask=original_ask,
|
|
97
220
|
)
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
221
|
+
base_sys_tokens = estimate_input_tokens(base_sys)
|
|
222
|
+
estimates: list[CostEstimate] = []
|
|
223
|
+
for m in members:
|
|
224
|
+
plan = plans.get(m.name)
|
|
225
|
+
if plan is None:
|
|
226
|
+
sys_tokens = base_sys_tokens
|
|
227
|
+
else:
|
|
228
|
+
sys_prompt = advisor_system_prompt(
|
|
229
|
+
plan.persona_text, project=project, original_ask=original_ask,
|
|
230
|
+
)
|
|
231
|
+
sys_tokens = estimate_input_tokens(sys_prompt)
|
|
232
|
+
input_tokens = base_user_tokens + sys_tokens
|
|
233
|
+
estimates.append(
|
|
234
|
+
estimate_cost(m.name, m.model, input_tokens, question.max_tokens, table),
|
|
235
|
+
)
|
|
236
|
+
return estimates
|
|
103
237
|
|
|
104
238
|
|
|
105
239
|
def consult(
|
|
@@ -113,6 +247,7 @@ def consult(
|
|
|
113
247
|
original_ask: str = "",
|
|
114
248
|
rounds: int = 1,
|
|
115
249
|
on_round_complete: Callable[[int, list[CouncilResponse]], None] | None = None,
|
|
250
|
+
advisor_plans: dict[str, AdvisorPlan] | None = None,
|
|
116
251
|
) -> list[CouncilResponse]:
|
|
117
252
|
"""Sequentially fan out `question` to every enabled member.
|
|
118
253
|
|
|
@@ -133,6 +268,9 @@ def consult(
|
|
|
133
268
|
accumulate across rounds. Returns the FINAL round's responses;
|
|
134
269
|
use `on_round_complete(round_idx, responses)` to capture
|
|
135
270
|
intermediate rounds.
|
|
271
|
+
- `advisor_plans` (Phase 6) keyed by provider name swaps the
|
|
272
|
+
member's system prompt for the advisor persona via
|
|
273
|
+
`advisor_system_prompt()`. Replace-mode: no extra calls.
|
|
136
274
|
"""
|
|
137
275
|
if rounds < 1:
|
|
138
276
|
raise ValueError(f"rounds must be >= 1 (got {rounds})")
|
|
@@ -162,6 +300,7 @@ def consult(
|
|
|
162
300
|
members, round_question, budget, spent,
|
|
163
301
|
table=table, on_overrun=on_overrun,
|
|
164
302
|
project=project, original_ask=original_ask,
|
|
303
|
+
advisor_plans=advisor_plans,
|
|
165
304
|
)
|
|
166
305
|
if on_round_complete is not None:
|
|
167
306
|
on_round_complete(round_idx, last_results)
|
|
@@ -183,14 +322,29 @@ def _run_round(
|
|
|
183
322
|
on_overrun: OnOverrunCallback | None,
|
|
184
323
|
project: ProjectContext | None,
|
|
185
324
|
original_ask: str,
|
|
325
|
+
advisor_plans: dict[str, AdvisorPlan] | None = None,
|
|
186
326
|
) -> list[CouncilResponse]:
|
|
187
327
|
"""Run a single round; mutate `spent` with cumulative totals."""
|
|
188
|
-
|
|
328
|
+
plans = advisor_plans or {}
|
|
329
|
+
base_system_prompt = system_prompt_for(
|
|
189
330
|
question.mode, project=project, original_ask=original_ask,
|
|
190
331
|
)
|
|
332
|
+
|
|
333
|
+
def _system_prompt_for_member(m: ExternalAIClient) -> str:
|
|
334
|
+
plan = plans.get(m.name)
|
|
335
|
+
if plan is None:
|
|
336
|
+
return base_system_prompt
|
|
337
|
+
return advisor_system_prompt(
|
|
338
|
+
plan.persona_text, project=project, original_ask=original_ask,
|
|
339
|
+
)
|
|
340
|
+
|
|
191
341
|
results: list[CouncilResponse] = []
|
|
192
342
|
estimates = (
|
|
193
|
-
estimate(
|
|
343
|
+
estimate(
|
|
344
|
+
question, members, table,
|
|
345
|
+
project=project, original_ask=original_ask,
|
|
346
|
+
advisor_plans=advisor_plans,
|
|
347
|
+
)
|
|
194
348
|
if table is not None
|
|
195
349
|
else None
|
|
196
350
|
)
|
|
@@ -202,12 +356,16 @@ def _run_round(
|
|
|
202
356
|
# observability, but no projection / budget breach can apply.
|
|
203
357
|
if not getattr(member, "billable", True):
|
|
204
358
|
try:
|
|
205
|
-
response = member.ask(
|
|
359
|
+
response = member.ask(
|
|
360
|
+
_system_prompt_for_member(member),
|
|
361
|
+
question.user_prompt, question.max_tokens,
|
|
362
|
+
)
|
|
206
363
|
except Exception as exc: # noqa: BLE001 - last-resort safety net
|
|
207
364
|
response = CouncilResponse(
|
|
208
365
|
provider=member.name, model=member.model, text="",
|
|
209
366
|
error=f"{type(exc).__name__}: {exc}",
|
|
210
367
|
)
|
|
368
|
+
_stamp_transport_metadata(response, member)
|
|
211
369
|
results.append(response)
|
|
212
370
|
spent["input"] += response.input_tokens
|
|
213
371
|
spent["output"] += response.output_tokens
|
|
@@ -265,7 +423,10 @@ def _run_round(
|
|
|
265
423
|
|
|
266
424
|
# ── actual call ──────────────────────────────────────────────
|
|
267
425
|
try:
|
|
268
|
-
response = member.ask(
|
|
426
|
+
response = member.ask(
|
|
427
|
+
_system_prompt_for_member(member),
|
|
428
|
+
question.user_prompt, question.max_tokens,
|
|
429
|
+
)
|
|
269
430
|
except Exception as exc: # noqa: BLE001 - last-resort safety net
|
|
270
431
|
response = CouncilResponse(
|
|
271
432
|
provider=member.name, model=member.model, text="",
|
|
@@ -274,6 +435,7 @@ def _run_round(
|
|
|
274
435
|
results.append(response)
|
|
275
436
|
spent["input"] += response.input_tokens
|
|
276
437
|
spent["output"] += response.output_tokens
|
|
438
|
+
actual_usd: float | None = None
|
|
277
439
|
if estimates is not None and table is not None:
|
|
278
440
|
# Bill the actual output against the budget using the
|
|
279
441
|
# member's per-1M output rate. Re-use estimate_cost with
|
|
@@ -282,6 +444,7 @@ def _run_round(
|
|
|
282
444
|
member.name, member.model,
|
|
283
445
|
response.input_tokens, response.output_tokens, table,
|
|
284
446
|
)
|
|
447
|
+
actual_usd = actual.total_usd
|
|
285
448
|
spent["usd"] += actual.total_usd
|
|
286
449
|
# Persist to the rolling 24h ledger when the daily cap is
|
|
287
450
|
# active. Errors are swallowed inside record_spend.
|
|
@@ -289,14 +452,44 @@ def _run_round(
|
|
|
289
452
|
_record_daily_spend(
|
|
290
453
|
actual.total_usd, member.name, member.model,
|
|
291
454
|
)
|
|
455
|
+
_stamp_transport_metadata(response, member, cost_usd=actual_usd)
|
|
292
456
|
|
|
293
457
|
return results
|
|
294
458
|
|
|
295
459
|
|
|
296
460
|
def _aborted(member: ExternalAIClient, reason: str) -> CouncilResponse:
    """Build the placeholder response for a member whose call was skipped.

    The response carries the member's provider / model, an empty text
    body and ``reason`` as its error, and is stamped with the member's
    transport metadata before being returned.
    """
    placeholder = CouncilResponse(
        provider=member.name,
        model=member.model,
        text="",
        error=reason,
    )
    _stamp_transport_metadata(placeholder, member)
    return placeholder
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def _stamp_transport_metadata(
|
|
469
|
+
response: CouncilResponse,
|
|
470
|
+
member: ExternalAIClient,
|
|
471
|
+
*,
|
|
472
|
+
cost_usd: float | None = None,
|
|
473
|
+
) -> None:
|
|
474
|
+
"""Annotate `response.metadata` with transport / billable / cost info.
|
|
475
|
+
|
|
476
|
+
Phase 5 / Step 1 — the session writer and orchestrator renderer key
|
|
477
|
+
off these fields to format the cost line as either
|
|
478
|
+
``cost: subscription (claude-pro)`` (non-billable vendor CLI) or
|
|
479
|
+
``cost: $0.NNNN (… in / … out)`` (billable api or community CLI).
|
|
480
|
+
Stamped here (and not in each client) so the writer stays decoupled
|
|
481
|
+
from the client class hierarchy.
|
|
482
|
+
"""
|
|
483
|
+
meta = dict(response.metadata or {})
|
|
484
|
+
transport = getattr(member, "transport", "api")
|
|
485
|
+
meta.setdefault("transport", transport)
|
|
486
|
+
meta.setdefault("billable", bool(getattr(member, "billable", True)))
|
|
487
|
+
label = getattr(member, "subscription_label", "") or ""
|
|
488
|
+
if label and not meta.get("billable", True):
|
|
489
|
+
meta.setdefault("subscription_label", label)
|
|
490
|
+
if cost_usd is not None:
|
|
491
|
+
meta["cost_usd"] = float(cost_usd)
|
|
492
|
+
response.metadata = meta
|
|
300
493
|
|
|
301
494
|
|
|
302
495
|
def _augment_for_next_round(
|
|
@@ -337,18 +530,677 @@ def _augment_for_next_round(
|
|
|
337
530
|
)
|
|
338
531
|
|
|
339
532
|
|
|
340
|
-
|
|
341
|
-
|
|
533
|
+
@dataclass
class DebateCheckpoint:
    """State handed to the continue-prompt callback between debate rounds.

    Phase 7 progressive-disclosure contract: once a round has finished,
    the orchestrator builds this checkpoint and asks the caller whether
    to go on. A ``False`` answer ends the debate gracefully and the
    caller still receives every round that completed.
    """

    # 1-based index of the round just finished.
    completed_round: int
    # Number of rounds the debate was asked to run in total.
    total_planned_rounds: int
    # USD spent so far across the completed rounds.
    cost_so_far_usd: float
    # Projected USD cost of the round that would run next.
    next_round_estimate_usd: float
    # Responses produced by the round that just finished.
    last_round_responses: list[CouncilResponse]
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
class DebateCapExceeded(RuntimeError):
    """Signal that the next debate round is projected to breach the budget cap.

    The CLI catches this *after* writing the partial artefact, so the
    user always keeps a recoverable trail of the rounds that completed
    before the cap fired.

    Attributes:
        completed_round: 1-based index of the last round that finished.
        cost_so_far: USD already spent when the cap fired.
        next_estimate: projected USD cost of the round that was blocked.
        cap: the USD budget ceiling that would have been exceeded.
    """

    def __init__(
        self,
        *,
        completed_round: int,
        cost_so_far: float,
        next_estimate: float,
        cap: float,
    ) -> None:
        blocked_round = completed_round + 1
        projected_total = cost_so_far + next_estimate
        message = (
            f"Debate hard-cap: round {blocked_round} would push spend "
            f"to ${projected_total:.4f} (cap=${cap:.4f}); "
            f"stopping after round {completed_round}."
        )
        super().__init__(message)
        # Keep the raw figures so handlers can report them without
        # parsing the message.
        self.completed_round = completed_round
        self.cost_so_far = cost_so_far
        self.next_estimate = next_estimate
        self.cap = cap
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
# Continue-prompt callback. Receives a DebateCheckpoint, returns True to
|
|
577
|
+
# proceed with the next round, False to stop gracefully.
|
|
578
|
+
DebateContinuePrompt = Callable[[DebateCheckpoint], bool]
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
def _augment_for_debate_round(
|
|
582
|
+
original_prompt: str,
|
|
583
|
+
prior_responses: list[CouncilResponse],
|
|
584
|
+
next_round_number: int,
|
|
585
|
+
) -> str:
|
|
586
|
+
"""Build the round-N user prompt for a debate — rebuttal framing.
|
|
587
|
+
|
|
588
|
+
Same anonymisation rules as `_augment_for_next_round` (Iron Law of
|
|
589
|
+
Neutrality § multi-round): provider/model identifiers stripped,
|
|
590
|
+
"Reviewer A / B / C…" labels assigned in input order, errors
|
|
591
|
+
skipped. The instruction block is debate-specific: each reviewer
|
|
592
|
+
is asked to identify the strongest opposing position and write a
|
|
593
|
+
rebuttal, NOT to find common ground.
|
|
594
|
+
"""
|
|
342
595
|
blocks: list[str] = []
|
|
596
|
+
label_idx = 0
|
|
597
|
+
for r in prior_responses:
|
|
598
|
+
if r.error or not r.text.strip():
|
|
599
|
+
continue
|
|
600
|
+
label = chr(ord("A") + label_idx)
|
|
601
|
+
label_idx += 1
|
|
602
|
+
blocks.append(f"### Reviewer {label}\n\n{r.text.strip()}")
|
|
603
|
+
if not blocks:
|
|
604
|
+
return original_prompt
|
|
605
|
+
prior_block = "\n\n".join(blocks)
|
|
606
|
+
return (
|
|
607
|
+
f"{original_prompt}\n\n"
|
|
608
|
+
f"---\n\n"
|
|
609
|
+
f"## Prior round positions (round {next_round_number - 1})\n\n"
|
|
610
|
+
f"You are now in round {next_round_number} of a structured\n"
|
|
611
|
+
f"debate. Below are anonymised positions from independent\n"
|
|
612
|
+
f"reviewers in the previous round. You do NOT know which model\n"
|
|
613
|
+
f"produced which position.\n\n"
|
|
614
|
+
f"Identify the SINGLE strongest opposing position and write a\n"
|
|
615
|
+
f"rebuttal addressed at its strongest steel-manned form. Do NOT\n"
|
|
616
|
+
f"search for common ground — name the load-bearing flaw the\n"
|
|
617
|
+
f"opposing reviewer missed and state the evidence behind your\n"
|
|
618
|
+
f"counter-position.\n\n"
|
|
619
|
+
f"{prior_block}"
|
|
620
|
+
)
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def run_debate(
    members: list[ExternalAIClient],
    question: CouncilQuestion,
    *,
    budget: CostBudget | None = None,
    table: PriceTable | None = None,
    on_overrun: OnOverrunCallback | None = None,
    project: ProjectContext | None = None,
    original_ask: str = "",
    max_rounds: int = 2,
    on_round_complete: Callable[[int, list[CouncilResponse]], None] | None = None,
    on_continue: DebateContinuePrompt | None = None,
    advisor_plans: dict[str, AdvisorPlan] | None = None,
    seed_round_1: list[CouncilResponse] | None = None,
) -> list[list[CouncilResponse]]:
    """Run a structured multi-round debate with progressive disclosure.

    Returns every completed round in order. Callers that need crash
    safety should persist each round from `on_round_complete`, which is
    invoked with the 1-based round number and that round's responses.

    Round 1: each member produces an initial position. When
    `seed_round_1` is provided it is reused verbatim (no calls are
    made), so `/council debate --continue-as-debate` can pivot from an
    existing `/council default` session.

    Round 2+: `_augment_for_debate_round` wraps the original prompt
    with the anonymised positions of the previous round and asks each
    member for a rebuttal of the strongest opposing view.

    Between rounds: the next round's cost is projected (only when a
    price `table` is given; otherwise the projection is 0.0). If
    `budget.max_total_usd` is positive and `spent + projection` exceeds
    it, `DebateCapExceeded` is raised. Otherwise `on_continue(checkpoint)`
    is consulted when set; returning False stops the debate and the
    completed rounds are returned. `None` (the default) auto-continues.

    The projection covers billable members only. Members whose
    `billable` attribute is falsy are excluded, mirroring
    `estimate_debate_cost`, so subscription-covered members cannot
    trigger a spurious hard-cap stop.

    Raises:
        ValueError: if `max_rounds` < 1, or if there are more members
            than `budget.max_calls`.
        DebateCapExceeded: if the projected next round would push
            spend past `budget.max_total_usd`.
    """
    if max_rounds < 1:
        raise ValueError(f"max_rounds must be >= 1 (got {max_rounds})")
    if not members:
        return []
    budget = budget or CostBudget()
    if len(members) > budget.max_calls:
        raise ValueError(
            f"Debate has {len(members)} members but budget caps at "
            f"{budget.max_calls} calls."
        )

    # Only these members are priced when projecting the next round.
    billable_members = [m for m in members if getattr(m, "billable", True)]

    spent: dict[str, float] = {"input": 0, "output": 0, "usd": 0.0}
    all_rounds: list[list[CouncilResponse]] = []
    current_user_prompt = question.user_prompt

    for round_idx in range(max_rounds):
        round_number = round_idx + 1
        if round_idx == 0 and seed_round_1 is not None:
            # Pivot from /council default — reuse the existing round 1
            # verbatim. No calls billed; spend stays at $0 until round 2.
            results = list(seed_round_1)
        else:
            round_question = (
                question if round_idx == 0
                else CouncilQuestion(
                    mode=question.mode,
                    user_prompt=current_user_prompt,
                    max_tokens=question.max_tokens,
                )
            )
            results = _run_round(
                members, round_question, budget, spent,
                table=table, on_overrun=on_overrun,
                project=project, original_ask=original_ask,
                advisor_plans=advisor_plans,
            )

        all_rounds.append(results)
        if on_round_complete is not None:
            on_round_complete(round_number, results)

        # Nothing to gate after the final planned round.
        if round_idx + 1 >= max_rounds:
            continue

        # Prep the user-prompt for the next round so the cost estimate
        # below covers the augmented bytes.
        current_user_prompt = _augment_for_debate_round(
            question.user_prompt, results, round_number + 1,
        )

        # Hard-cap + continue-prompt gating before kicking off N+1.
        if table is not None:
            next_question = CouncilQuestion(
                mode=question.mode,
                user_prompt=current_user_prompt,
                max_tokens=question.max_tokens,
            )
            next_estimates = estimate(
                next_question, billable_members, table,
                project=project, original_ask=original_ask,
                advisor_plans=advisor_plans,
            )
            # Explicit 0.0 start keeps the result a float even when
            # there are no billable members.
            next_round_usd = sum((e.total_usd for e in next_estimates), 0.0)
        else:
            next_round_usd = 0.0

        if (
            budget.max_total_usd > 0
            and spent["usd"] + next_round_usd > budget.max_total_usd
        ):
            raise DebateCapExceeded(
                completed_round=round_number,
                cost_so_far=spent["usd"],
                next_estimate=next_round_usd,
                cap=budget.max_total_usd,
            )

        if on_continue is not None:
            checkpoint = DebateCheckpoint(
                completed_round=round_number,
                total_planned_rounds=max_rounds,
                cost_so_far_usd=spent["usd"],
                next_round_estimate_usd=next_round_usd,
                last_round_responses=results,
            )
            if not on_continue(checkpoint):
                return all_rounds

    return all_rounds
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
@dataclass
class PeerReviewResult:
    """Outcome bundle of `run_peer_review()` (Phase 5 / F1).

    Besides the per-reviewer critiques, the bundle keeps the
    anonymisation map that was captured server-side, so the audit-trail
    JSON can rehydrate it without provider identity ever reaching a
    member at prompt time.

    The persona map is the optional Phase 6 / Step 3a wiring: for an
    advisor-mode deliberation the source → persona mapping flows through
    to the renderer so peer-review output can render as
    `Response A (Contrarian)`. Plain-member runs leave it empty.
    """

    # Per-reviewer critiques.
    responses: list[CouncilResponse]
    # Anonymisation map: neutral label -> original source.
    label_to_source: dict[str, str]
    # Source -> advisor persona; empty for plain-member runs.
    persona_labels: dict[str, str]
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
def run_peer_review(
    members: list[ExternalAIClient],
    deliberation_responses: list[CouncilResponse],
    *,
    budget: CostBudget | None = None,
    table: PriceTable | None = None,
    on_overrun: OnOverrunCallback | None = None,
    project: ProjectContext | None = None,
    original_ask: str = "",
    max_tokens: int = DEFAULT_MAX_TOKENS,
    persona_labels: dict[str, str] | None = None,
) -> PeerReviewResult:
    """Karpathy peer-review pass (Phase 5 / F1).

    After the final deliberation round, each member sees the OTHER
    members' deliberation outputs under neutral `Response-A` labels
    (provider identity stripped; advisor persona labels preserved per
    Phase 6 Step 3a) and emits a Karpathy-style critique:
    strongest / weakest blind spot / what all missed / refinement.

    Members never see their own response: the deliberation entry whose
    `provider:model` key equals the reviewer's `name:model` is dropped
    before the anonymised prompt is built.

    Every reviewer is consulted through its own `consult([reviewer], ...)`
    call, so the same budget / price-table / overrun handling as
    deliberation applies to each call.

    Args:
        members: Clients asked to review, consulted in input order.
        deliberation_responses: Final-round outputs to be reviewed.
            Entries with an error or a blank body are ignored.
        budget, table, on_overrun, project, original_ask: Forwarded
            unchanged to `consult()`.
        max_tokens: Token cap placed on each review question.
        persona_labels: Optional source → persona map handed to
            `anonymize_responses()`; copied, never mutated.

    Returns:
        A `PeerReviewResult`. It is empty (all three fields empty) when
        there are no members, no deliberation responses, or fewer than
        two usable deliberation outputs. Otherwise `responses` holds the
        collected reviews, `persona_labels` the copied map, and
        `label_to_source` the anonymisation map built for the LAST
        reviewer whose prompt was produced.
    """
    if not members or not deliberation_responses:
        return PeerReviewResult(
            responses=[], label_to_source={}, persona_labels={},
        )

    # Usable deliberation outputs keyed by `provider:model`. Errors and
    # empty bodies are skipped: they carry nothing worth reviewing and
    # would only add blanks to the anonymised prompt. A later response
    # with the same key overwrites an earlier one.
    by_source: dict[str, CouncilResponse] = {}
    for r in deliberation_responses:
        if r.error or not r.text.strip():
            continue
        by_source[f"{r.provider}:{r.model}"] = r

    if len(by_source) < 2:
        # Peer review needs at least two distinct outputs; a reviewer
        # with nothing else to review is a no-op.
        return PeerReviewResult(
            responses=[], label_to_source={}, persona_labels={},
        )

    persona_labels = dict(persona_labels or {})
    review_responses: list[CouncilResponse] = []
    # Only the map from the last prompt that was built is returned.
    # NOTE(review): each reviewer sees a different N-1 subset, so the
    # label → source assignment may differ between reviewers depending
    # on how `anonymize_responses()` hands out labels — confirm before
    # using this map to decode labels quoted in earlier reviews.
    last_label_to_source: dict[str, str] = {}

    for reviewer in members:
        # Self-filter key. NOTE(review): assumes `reviewer.name` matches
        # the `provider` recorded on that member's response — confirm.
        scorer = f"{reviewer.name}:{reviewer.model}"
        others_pairs = [
            (src, resp.text) for src, resp in by_source.items() if src != scorer
        ]
        if not others_pairs:
            continue
        anon_text, label_to_source = anonymize_responses(
            others_pairs, persona_labels=persona_labels,
        )
        if not anon_text:
            continue
        last_label_to_source = label_to_source
        question = CouncilQuestion(
            mode="prompt",
            user_prompt=build_peer_review_user_prompt(anon_text),
            max_tokens=max_tokens,
        )
        reviewed = consult(
            [reviewer], question,
            budget=budget, table=table, on_overrun=on_overrun,
            project=project, original_ask=original_ask,
        )
        review_responses.extend(reviewed)

    return PeerReviewResult(
        responses=review_responses,
        label_to_source=last_label_to_source,
        persona_labels=persona_labels,
    )
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
@dataclass
class ConsensusResult:
    """Everything `run_consensus_scoring()` hands back to its caller.

    Fields:
        bucket: The renderer-ready grouping of findings.
        findings, scores, metadata: Kept for the audit-trail JSON
            (council-sessions/*.json).
        extraction_responses: Raw member replies from the extraction
            pass.
        scoring_responses: Raw member replies from the scoring pass.
    """

    bucket: ConsensusBucket
    findings: list[Finding]
    scores: list[FindingScore]
    metadata: dict[str, ConsensusMetadata]
    extraction_responses: list[CouncilResponse]
    scoring_responses: list[CouncilResponse]
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
def run_consensus_scoring(
    members: list[ExternalAIClient],
    deliberation_responses: list[CouncilResponse],
    *,
    budget: CostBudget | None = None,
    table: PriceTable | None = None,
    on_overrun: OnOverrunCallback | None = None,
    project: ProjectContext | None = None,
    original_ask: str = "",
    max_tokens: int = DEFAULT_MAX_TOKENS,
    strong_threshold: float = 0.7,
    minority_threshold: float = 0.4,
) -> ConsensusResult:
    """Two-pass consensus round (Phase 4 / F3).

    Pass 1 (extraction): every member whose deliberation response is
    usable is asked to re-emit it as a JSON array of `{id, text}`
    findings. Pass 2 (scoring): every member is shown the findings that
    came from the *other* members, under anonymous labels, and rates
    them 1-10 + agree/disagree + reason.

    Both passes go through `consult()` with the same `budget`, `table`
    and `on_overrun`. A member whose extraction or scoring reply is
    missing or carries an error is skipped for that pass; the round
    continues with the remaining members.

    Returns:
        A `ConsensusResult`. With no members, no deliberation responses,
        or no extracted findings, the bucket is a fresh
        `ConsensusBucket()` and findings / scores / metadata are empty
        (extraction replies collected so far are still included).
    """

    def _nothing_scored(extractions: list[CouncilResponse]) -> ConsensusResult:
        # Shared shape for both "nothing to score" exits.
        return ConsensusResult(
            bucket=ConsensusBucket(), findings=[], scores=[], metadata={},
            extraction_responses=extractions, scoring_responses=[],
        )

    if not members or not deliberation_responses:
        return _nothing_scored([])

    consult_options = {
        "budget": budget, "table": table, "on_overrun": on_overrun,
        "project": project, "original_ask": original_ask,
    }

    # ── Pass 1: extraction ──────────────────────────────────────────
    clients_by_name = {client.name: client for client in members}
    extraction_replies: list[CouncilResponse] = []
    findings: list[Finding] = []
    for deliberation in deliberation_responses:
        # Responses are matched to members through the provider field.
        client = clients_by_name.get(deliberation.provider)
        if client is None or deliberation.error or not deliberation.text.strip():
            continue
        reply = consult(
            [client],
            CouncilQuestion(
                mode="prompt",
                user_prompt=build_extraction_user_prompt(deliberation.text),
                max_tokens=max_tokens,
            ),
            **consult_options,
        )
        extraction_replies.extend(reply)
        if reply and not reply[0].error:
            origin = f"{client.name}:{client.model}"
            findings.extend(parse_findings_response(reply[0].text, source=origin))

    if not findings:
        return _nothing_scored(extraction_replies)

    # ── Pass 2: scoring (each member rates the OTHERS' findings) ────
    scoring_replies: list[CouncilResponse] = []
    scores: list[FindingScore] = []
    for client in members:
        scorer_key = f"{client.name}:{client.model}"
        foreign = [item for item in findings if item.source != scorer_key]
        if not foreign:
            continue
        labelled = anonymize_findings(foreign)
        # The member only ever sees labels; keep the way back to real ids.
        id_for_label = {}
        text_for_label = {}
        for tag, item in labelled.items():
            id_for_label[tag] = item.id
            text_for_label[tag] = item.text
        reply = consult(
            [client],
            CouncilQuestion(
                mode="prompt",
                user_prompt=build_scoring_user_prompt(text_for_label),
                max_tokens=max_tokens,
            ),
            **consult_options,
        )
        scoring_replies.extend(reply)
        if not reply or reply[0].error:
            continue
        for rating in parse_scores_response(reply[0].text, scorer=scorer_key):
            finding_id = id_for_label.get(rating.finding_id)
            if finding_id is not None:
                # Ratings that cite an unknown label are dropped.
                scores.append(FindingScore(
                    finding_id=finding_id, scorer=rating.scorer,
                    score=rating.score, agree=rating.agree,
                    reason=rating.reason,
                ))

    metadata = aggregate_scores(findings, scores)
    bucket = bucket_by_threshold(
        findings, metadata,
        strong=strong_threshold, minority=minority_threshold,
    )
    return ConsensusResult(
        bucket=bucket, findings=findings, scores=scores,
        metadata=metadata, extraction_responses=extraction_replies,
        scoring_responses=scoring_replies,
    )
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
def _render_response_meta(r: CouncilResponse) -> str:
    """Build the italic per-member meta line: cost, tokens, latency.

    Phase 5 / Step 1. A response whose metadata marks it as not
    billable renders ``cost: subscription (<label>)`` (label defaults
    to ``flat-rate``) and omits token counts. Any other response
    renders ``cost: $X.XXXX`` when a numeric ``cost_usd`` is present,
    followed by the token counts; those get a ``~`` prefix when the
    metadata flags them as ``tokens_estimated``. Latency in ms always
    closes the line. Missing metadata is treated as billable and not
    estimated.
    """
    info = r.metadata or {}
    if bool(info.get("billable", True)):
        segments: list[str] = []
        usd = info.get("cost_usd")
        if isinstance(usd, (int, float)):
            segments.append(f"cost: ${usd:.4f}")
        tilde = "~" if bool(info.get("tokens_estimated", False)) else ""
        segments.append(
            f"tokens: {tilde}{r.input_tokens} in / {tilde}{r.output_tokens} out"
        )
    else:
        plan = info.get("subscription_label") or "flat-rate"
        segments = [f"cost: subscription ({plan})"]
    segments.append(f"{r.latency_ms} ms")
    joined = " · ".join(segments)
    return f"*{joined}*"
|
|
1013
|
+
|
|
1014
|
+
|
|
1015
|
+
# Lenses whose consensus output carries the Phase 9
# confidence-explanation badge unless the caller says otherwise. The PR
# lens is left out on purpose: its "Must-fix / Nice-to-have" structure
# should not be drowned in scorer prose. Creative lenses
# (design/optimize) skip consensus scoring entirely and therefore never
# reach this code path.
_DEFAULT_EXPLAIN_LENSES: frozenset[str] = frozenset(
    ("default", "analysis", "debate", "prompt", "roadmap", "diff", "files")
)


def _default_explain_confidence(mode: str | None) -> bool:
    """Return the default on/off state of the confidence-explanation badge.

    Kept as a helper so the CLI ``--explain-confidence`` /
    ``--no-explain-confidence`` flags and the lens override path share
    one truth source. No mode at all means "on"; otherwise the badge is
    on exactly for the lenses in ``_DEFAULT_EXPLAIN_LENSES``.
    """
    return mode is None or mode in _DEFAULT_EXPLAIN_LENSES
|
|
1035
|
+
|
|
1036
|
+
|
|
1037
|
+
def render(
    responses: list[CouncilResponse],
    *,
    mode: str | None = None,
    prose_synthesis: bool | None = None,
    consensus: ConsensusResult | None = None,
    peer_review: PeerReviewResult | None = None,
    explain_confidence: bool | None = None,
) -> str:
    """Render stacked sections plus a lens-aware synthesis prompt slot.

    Sections are emitted in this order, separated by horizontal rules:
    consensus (if any), one section per member response, the
    peer-review block (if any), and finally the
    ``Convergence / Divergence`` slot.

    `mode` selects the synthesis template via `synthesis_template`;
    `None` is passed through unchanged (back-compat default lens).

    `prose_synthesis` is the R4 Q4 escape hatch:
      - `True`  → bare slot, no template, regardless of mode
      - `False` → the `"default"` template even on creative lenses
      - `None`  → whatever the table yields for `mode`

    `consensus` (Phase 4 / F3) prepends the Strong Consensus /
    Findings / Minority Views sections when at least one of its
    buckets is non-empty.

    `peer_review` (Phase 5 / F1), when it holds responses, appends the
    Peer-Review block and extends the synthesis template with the
    `Peer-Review-Surfaced Blind Spots` addendum.

    `explain_confidence` overrides the lens default for the
    confidence-explanation badge; `None` defers to
    `_default_explain_confidence(mode)`.
    """
    sections: list[str] = []

    if explain_confidence is None:
        show_explanation = _default_explain_confidence(mode)
    else:
        show_explanation = explain_confidence

    if consensus is not None:
        scored = consensus.bucket
        if scored.strong or scored.findings or scored.minority:
            sections.append(_render_consensus(scored, explain=show_explanation))

    for member_reply in responses:
        heading = f"## {member_reply.provider} · {member_reply.model}"
        if member_reply.error:
            # Failed members keep their slot but show only the error tag.
            detail = f"*ERROR:* `{member_reply.error}`"
        else:
            detail = f"{_render_response_meta(member_reply)}\n\n{member_reply.text}"
        sections.append(f"{heading}\n\n{detail}")

    has_reviews = peer_review is not None and bool(peer_review.responses)
    if has_reviews:
        sections.append(_render_peer_review(peer_review))

    if prose_synthesis is True:
        synthesis = ""
    else:
        lens = "default" if prose_synthesis is False else mode
        synthesis = synthesis_template(lens)

    if has_reviews:
        extra = peer_review_synthesis_addendum()
        # With no base template the addendum stands alone, minus its
        # leading whitespace.
        synthesis = f"{synthesis}\n{extra}" if synthesis else extra.lstrip()

    slot = synthesis if synthesis else "*to be summarised by the host agent*"
    sections.append(f"## Convergence / Divergence\n\n{slot}")
    return "\n\n---\n\n".join(sections)
|
|
1099
|
+
|
|
1100
|
+
|
|
1101
|
+
def _render_peer_review(peer_review: PeerReviewResult) -> str:
    """Render the peer-review block under deterministic Reviewer labels.

    Every entry in `peer_review.responses` gets a `### Reviewer X`
    sub-section, labelled by position: A..Z, then AA, AB, ... so the
    label stays alphabetic however many reviewers there are. An entry
    with an error keeps its slot (so the audit trail still surfaces
    the failure) but shows `*ERROR:* <tag>` instead of the review text.

    Returns:
        The Markdown block, starting with the `## Peer-Review (Karpathy)`
        heading; sub-sections are separated by blank lines.
    """
    sections = ["## Peer-Review (Karpathy)"]
    for position, review in enumerate(peer_review.responses):
        # Bijective base-26: 0 → A, 25 → Z, 26 → AA, 27 → AB, ...
        label = ""
        remaining = position + 1
        while remaining:
            remaining, digit = divmod(remaining - 1, 26)
            label = chr(ord("A") + digit) + label
        if review.error:
            content = f"*ERROR:* `{review.error}`"
        else:
            content = review.text.strip()
        sections.append(f"### Reviewer {label}\n\n{content}")
    return "\n\n".join(sections)
|
|
1118
|
+
|
|
1119
|
+
|
|
1120
|
+
def _render_consensus(bucket: ConsensusBucket, *, explain: bool = True) -> str:
    """Render the Strong / Findings / Minority sections, in that order.

    Empty buckets are left out entirely; the remaining sections are
    separated by a blank line. ``explain`` is passed straight to
    ``_render_bucket`` and toggles the Phase 9 confidence-explanation
    badge: with ``False`` only the terse Phase 4 badge is shown, which
    keeps the PR lens (and any caller passing
    ``--no-explain-confidence``) compact.
    """
    layout = (
        ("## Strong Consensus\n\n", bucket.strong),
        ("## Findings\n\n", bucket.findings),
        (
            "## Minority Views\n\n"
            "*Sub-threshold by consensus; kept for audit trail.*\n\n",
            bucket.minority,
        ),
    )
    return "\n\n".join(
        heading + _render_bucket(entries, explain=explain)
        for heading, entries in layout
        if entries
    )
|
|
1146
|
+
|
|
1147
|
+
|
|
1148
|
+
def _truncate_reason(reason: str, *, limit: int = 120) -> str:
    """Collapse a multi-line scorer reason to a single ≤``limit``-char line.

    Phase 9: the dissent summary must fit on one line. All runs of
    whitespace (including newlines) are folded into single spaces. A
    result longer than ``limit`` is cut to ``limit - 1`` characters,
    right-stripped, and suffixed with an ellipsis.

    Args:
        reason: Raw scorer rationale; may be empty or ``None``-like.
        limit: Maximum length of the returned text. Values below 1
            leave room for nothing but the ellipsis itself.

    Returns:
        The flattened (and possibly truncated) reason, or the fixed
        text ``no rationale`` when nothing but whitespace was supplied
        (that placeholder is returned regardless of ``limit``).
    """
    flat = " ".join(reason.split()) if reason else ""
    if not flat:
        return "no rationale"
    if len(flat) <= limit:
        return flat
    # Clamp at 0: a negative slice stop would count from the END of the
    # string and keep almost all of it instead of truncating.
    keep = max(limit - 1, 0)
    return flat[:keep].rstrip() + "…"
|
|
1161
|
+
|
|
1162
|
+
|
|
1163
|
+
def _render_bucket(
    items: list[tuple[Finding, ConsensusMetadata]],
    *,
    explain: bool = True,
) -> str:
    """Render one bucket of (finding, metadata) pairs as a Markdown list.

    Each finding becomes one bullet whose first continuation line is
    the Phase 4 terse badge (``strength · mean · scorers · dissent``).
    When ``explain`` is true *and* the finding has at least one scorer,
    Phase 9 adds a second continuation line: how many members concur,
    the first dissent (author plus shortened reason) with a count of
    any further dissents, or ``no dissent``, and the mean
    evidence-quality value.
    """
    # NOTE(review): the continuation separator is reproduced as seen,
    # with a single space on each side of the newline; a Markdown hard
    # break would need two trailing spaces — confirm against upstream.
    rendered: list[str] = []
    for finding, meta in items:
        badge = " · ".join((
            f"strength {meta.consensus_strength:.2f}",
            f"mean {meta.mean_score:.1f}/10",
            f"{len(meta.scorers)} scorers",
            f"{meta.dissent_count} dissent",
        ))
        entry = f"- **{finding.id}** — {finding.text} \n _{badge}_"
        if explain and meta.scorers:
            voters = meta.concur_count + meta.dissent_count
            if voters <= 0:
                # No agree/disagree tally recorded: fall back to the
                # number of scorers as the denominator.
                voters = len(meta.scorers)
            notes = [f"{meta.concur_count}/{voters} members concur"]
            if not meta.dissent_reasons:
                notes.append("no dissent")
            else:
                lead = meta.dissent_reasons[0]
                notes.append(
                    f"{lead[0]} dissented citing {_truncate_reason(lead[1])}"
                )
                remaining = len(meta.dissent_reasons) - 1
                if remaining > 0:
                    notes.append(f"{remaining} other dissent(s)")
            notes.append(f"mean evidence-quality {meta.evidence_quality}")
            entry = f"{entry} \n _{'; '.join(notes)}_"
        rendered.append(entry)
    return "\n".join(rendered)
|