claude-sql 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,459 @@
+ """Multi-provider judge worker: runs a panel of Bedrock models over sessions.
+
+ Uses the Bedrock **Converse API** (``bedrock-runtime.converse``), which is
+ the only path that works uniformly across Anthropic, Moonshot, DeepSeek,
+ MiniMax, Mistral, Z.AI, Qwen, Writer, and NVIDIA Nemotron on Bedrock.
+
+ For each (session, axis) pair the worker dispatches one call per judge and
+ writes a parquet row with the score and free-text rationale. No tool_use
+ machinery — the prompt instructs each judge to output
+ ``score=<int>\\nrationale=<text>`` and we parse it. Stable across model
+ families; the tradeoff is we rely on rubric-prompted discipline rather
+ than schema enforcement. That is deliberate: a structured-output API tied
+ to one provider would leak within-family bias back into the ensemble.
+
+ Cost guard: defaults to ``dry_run=True`` per the project convention —
+ returns a ``GradePlan`` (call count plus token and dollar estimates)
+ rather than spending.
+ """
+
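+ # Illustrative usage (a sketch, not shipped code): the import path is an
+ # assumption (this diff does not name the file), and the session text,
+ # rubric path, and freeze SHA below are placeholders.
+ #
+ #     from pathlib import Path
+ #     from claude_sql import grade_worker  # hypothetical module name
+ #
+ #     result = grade_worker.run(
+ #         sessions=[("sess-001", "<transcript text>")],
+ #         panel_shortnames=["kimi-k2.5", "deepseek-v3.2"],
+ #         rubric_yaml_path=Path("rubric.yaml"),
+ #         freeze_sha="abc123",
+ #         out_parquet=Path("scores.parquet"),
+ #     )  # dry_run=True by default: returns a GradePlan, spends nothing
+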
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import re
+ from concurrent.futures import ThreadPoolExecutor
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any
+
+ import boto3
+ import polars as pl
+ from botocore.config import Config as BotoConfig
+ from botocore.exceptions import (
+     ClientError,
+     ConnectionError as BotoConnectionError,
+     EndpointConnectionError,
+     ReadTimeoutError,
+     SSLError,
+ )
+ from loguru import logger
+ from tenacity import (
+     retry,
+     retry_if_exception,
+     stop_after_attempt,
+     wait_exponential,
+ )
+
+ from claude_sql import judges as judge_catalog
+ from claude_sql.judges import Judge
+ from claude_sql.logging_setup import loguru_before_sleep
+
+ _RETRY_CODES: set[str] = {
+     "ThrottlingException",
+     "ServiceUnavailableException",
+     "ModelTimeoutException",
+     "ModelErrorException",
+     "InternalServerException",
+ }
+
+
+ def _is_retryable(exc: BaseException) -> bool:
+     """Retry policy for Bedrock Converse calls.
+
+     Matches ``embed_worker`` / ``llm_worker`` convention so throttling
+     behaves the same across the whole CLI.
+     """
+     if isinstance(exc, SSLError | BotoConnectionError | EndpointConnectionError | ReadTimeoutError):
+         return True
+     if isinstance(exc, ClientError):
+         code = exc.response.get("Error", {}).get("Code", "")
+         return code in _RETRY_CODES
+     return False
+
+
+ # Rough per-1M-token cost estimates (USD, input/output). Conservative;
+ # updated as Bedrock publishes list prices. Missing entries fall back to
+ # the 2.00/10.00 default.
+ _JUDGE_PRICING: dict[str, tuple[float, float]] = {
+     "moonshotai.kimi-k2.5": (0.60, 2.50),
+     "moonshot.kimi-k2-thinking": (1.00, 4.00),
+     "deepseek.v3.2": (0.30, 1.20),
+     "minimax.minimax-m2.5": (0.50, 2.00),
+     "zai.glm-5": (0.60, 2.40),
+     "qwen.qwen3-next-80b-a3b": (0.80, 3.20),
+     "mistral.mistral-large-3-675b-instruct": (3.00, 12.00),
+     "mistral.magistral-small-2509": (0.40, 1.60),
+     "writer.palmyra-x5-v1:0": (2.50, 10.00),
+     # The Anthropic judges are dispatched via their global CRIS profile
+     # IDs (see judges.py), so the keys here must carry the ``global.``
+     # prefix or ``plan()`` silently falls back to the 2.00/10.00 default.
+     "global.anthropic.claude-opus-4-7": (15.00, 75.00),
+     "global.anthropic.claude-sonnet-4-6": (3.00, 15.00),
+     "amazon.nova-2-lite-v1:0:256k": (0.06, 0.24),
+ }
+ _FALLBACK_PRICE: tuple[float, float] = (2.00, 10.00)
+
+
+ @dataclass(frozen=True)
+ class GradePlan:
+     """Dry-run plan: what ``run`` would do if ``dry_run=False``."""
+
+     n_sessions: int
+     n_judges: int
+     n_axes: int
+     n_calls: int
+     est_input_tokens: int
+     est_output_tokens: int
+     est_usd: float
+
+
+ @dataclass(frozen=True)
+ class Axis:
+     """One scoring axis from the rubric."""
+
+     name: str
+     description: str
+     levels: dict[int, str]  # score -> level description
+     detector_vs_grader: str  # "detector" or "grader"
+
+
+ def parse_rubric(rubric_yaml: str) -> list[Axis]:
+     """Parse a rubric YAML into ``Axis`` objects.
+
+     Rubrics are agent-authored and shape-constrained: key:value pairs,
+     ``-`` list items, 2-space indent. Parsed with pyyaml (imported
+     lazily below), then shape-checked here.
+
+     Expected shape::
+
+         axes:
+           - name: correction_required
+             description: "..."
+             detector_vs_grader: detector
+             levels:
+               0: "no correction needed"
+               1: "correction needed"
+     """
+     import yaml  # lazy import; make sure pyyaml is declared in the project deps
+
+     data = yaml.safe_load(rubric_yaml) or {}
+     raw = data.get("axes") or []
+     if not isinstance(raw, list):
+         raise TypeError("rubric.axes must be a list")
+     out: list[Axis] = []
+     for i, entry in enumerate(raw):
+         if not isinstance(entry, dict):
+             raise TypeError(f"rubric.axes[{i}] must be a mapping")
+         name = entry.get("name")
+         if not name:
+             raise ValueError(f"rubric.axes[{i}] is missing 'name'")
+         out.append(
+             Axis(
+                 name=str(name),
+                 description=str(entry.get("description", "")),
+                 levels={int(k): str(v) for k, v in (entry.get("levels") or {}).items()},
+                 detector_vs_grader=str(entry.get("detector_vs_grader", "grader")),
+             )
+         )
+     if not out:
+         raise ValueError("rubric has zero axes")
+     return out
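+
+
+ # Illustrative round-trip for ``parse_rubric`` (the rubric content below is
+ # an assumed example, shown for shape only):
+ #
+ #     rubric = """
+ #     axes:
+ #       - name: correction_required
+ #         description: "did the user have to correct the agent?"
+ #         detector_vs_grader: detector
+ #         levels:
+ #           0: "no correction needed"
+ #           1: "correction needed"
+ #     """
+ #     [axis] = parse_rubric(rubric)
+ #     axis.name       # "correction_required"
+ #     axis.levels[1]  # "correction needed"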
+
+
+ def render_prompt(session_text: str, axis: Axis) -> str:
+     """Render the single-turn grading prompt.
+
+     Prompt discipline, not schema enforcement, is what we rely on
+     across non-Anthropic judges. Keep it minimal and explicit.
+     """
+     levels_block = "\n".join(f"  {k}: {v}" for k, v in sorted(axis.levels.items()))
+     return (
+         f"You are scoring an agent transcript on ONE axis.\n"
+         f"\n"
+         f"Axis name: {axis.name}\n"
+         f"Axis description: {axis.description}\n"
+         f"Scoring levels:\n{levels_block}\n"
+         f"\n"
+         f"Respond in EXACTLY this format, nothing else:\n"
+         f"score=<int>\n"
+         f"rationale=<one-paragraph explanation>\n"
+         f"\n"
+         f"=== TRANSCRIPT BEGIN ===\n"
+         f"{session_text}\n"
+         f"=== TRANSCRIPT END ===\n"
+     )
+
+
+ _SCORE_RE = re.compile(r"^\s*score\s*=\s*(-?\d+)\s*$", re.MULTILINE)
+ _RATIONALE_RE = re.compile(r"^\s*rationale\s*=\s*(.*)$", re.MULTILINE | re.DOTALL)
+
+
+ def parse_judge_response(text: str) -> tuple[int | None, str]:
+     """Extract (score, rationale) from a free-form judge response.
+
+     Tolerant: if the format is slightly off (JSON object, extra prose
+     before the fields, etc.), we still try to recover a score. If we
+     cannot parse an integer score, return ``(None, <raw text>)`` so
+     callers can log the refusal and continue.
+     """
+     m = _SCORE_RE.search(text)
+     score: int | None = int(m.group(1)) if m else None
+     if score is None:
+         # Try JSON-shaped fallback
+         try:
+             obj = json.loads(text)
+             if isinstance(obj, dict) and "score" in obj:
+                 score = int(obj["score"])
+         except (ValueError, TypeError):
+             pass
+     r = _RATIONALE_RE.search(text)
+     rationale = r.group(1).strip() if r else text.strip()
+     return score, rationale
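+
+
+ # The two response shapes ``parse_judge_response`` accepts, illustrated:
+ #
+ #     parse_judge_response("score=2\nrationale=hedged twice, reversed once")
+ #     -> (2, "hedged twice, reversed once")
+ #
+ #     parse_judge_response('{"score": 0, "rationale": "clean run"}')
+ #     -> (0, '{"score": 0, "rationale": "clean run"}')
+ #
+ # The JSON fallback recovers only the score; ``_RATIONALE_RE`` does not
+ # match JSON, so the raw text survives as the rationale for later triage.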
+
+
+ # ---------------------------------------------------------------------------
+ # Bedrock Converse dispatch
+ # ---------------------------------------------------------------------------
+
+
+ def _bedrock_client(region: str = "us-east-1") -> Any:
+     """Return a tuned boto3 bedrock-runtime client."""
+     cfg = BotoConfig(
+         region_name=region,
+         retries={"max_attempts": 0, "mode": "standard"},
+         read_timeout=120,
+         connect_timeout=10,
+     )
+     return boto3.client("bedrock-runtime", config=cfg)
+
+
+ @retry(
+     stop=stop_after_attempt(10),
+     wait=wait_exponential(multiplier=2, min=2, max=60),
+     retry=retry_if_exception(_is_retryable),
+     before_sleep=loguru_before_sleep("WARNING"),
+     reraise=True,
+ )
+ def _converse_once(
+     client: Any, model_id: str, prompt: str, max_tokens: int = 4096, temperature: float = 0.0
+ ) -> str:
+     """One Bedrock Converse call; returns the model's plain text response.
+
+     Retries throttling / ServiceUnavailable / connection errors with
+     exponential backoff (10 attempts, 2–60s). Botocore's own retry
+     policy is disabled in ``_bedrock_client`` so tenacity owns backoff.
+     """
+     resp = client.converse(
+         modelId=model_id,
+         messages=[{"role": "user", "content": [{"text": prompt}]}],
+         inferenceConfig={
+             "maxTokens": max_tokens,
+             "temperature": temperature,
+         },
+     )
+     msg = resp.get("output", {}).get("message", {})
+     parts = msg.get("content", []) or []
+     for p in parts:
+         if "text" in p:
+             return p["text"]
+     return ""
+
+
+ # ---------------------------------------------------------------------------
+ # Planning (dry-run)  # noqa: ERA001 — section header, not commented-out code
+ # ---------------------------------------------------------------------------
+
+
+ def estimate_tokens(text: str) -> int:
+     """Approximate tokens as ``chars / 4`` — the usual ~4-chars-per-token heuristic."""
+     return max(1, len(text) // 4)
+
+
+ def plan(
+     sessions: list[tuple[str, str]],
+     panel: list[Judge],
+     axes: list[Axis],
+     out_tokens: int = 256,
+ ) -> GradePlan:
+     """Estimate the call count plus token and dollar cost of a grade run."""
+     n_calls = len(sessions) * len(panel) * len(axes)
+     total_in = 0
+     total_out = 0
+     total_usd = 0.0
+     for _, text in sessions:
+         for axis in axes:
+             prompt = render_prompt(text, axis)
+             tin = estimate_tokens(prompt)
+             tout = out_tokens
+             for j in panel:
+                 total_in += tin
+                 total_out += tout
+                 pin, pout = _JUDGE_PRICING.get(j.model_id, _FALLBACK_PRICE)
+                 total_usd += (tin / 1_000_000) * pin + (tout / 1_000_000) * pout
+     return GradePlan(
+         n_sessions=len(sessions),
+         n_judges=len(panel),
+         n_axes=len(axes),
+         n_calls=n_calls,
+         est_input_tokens=total_in,
+         est_output_tokens=total_out,
+         est_usd=round(total_usd, 4),
+     )
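+
+
+ # Worked example of the estimate, with hypothetical numbers: one session
+ # whose rendered prompt is ~8,000 chars (about 2,000 input tokens), one
+ # axis, one judge priced like kimi-k2.5 at (0.60, 2.50) USD per 1M tokens:
+ #
+ #     in:  2_000 / 1_000_000 * 0.60 = $0.00120
+ #     out:   256 / 1_000_000 * 2.50 = $0.00064
+ #     total ~= $0.0018 per (session, judge, axis) call
+ #
+ # Calls scale multiplicatively: 50 sessions × 8 judges × 3 axes = 1,200 calls.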
+
+
+ # ---------------------------------------------------------------------------
+ # Execution
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass(frozen=True)
+ class JudgeScore:
+     """One row of the output parquet."""
+
+     session_id: str
+     axis: str
+     judge_shortname: str
+     judge_model_id: str
+     score: int
+     rationale: str
+     freeze_sha: str
+
+
+ async def _grade_one(
+     loop: asyncio.AbstractEventLoop,
+     executor: ThreadPoolExecutor,
+     client: Any,
+     session_id: str,
+     session_text: str,
+     judge: Judge,
+     axis: Axis,
+     freeze_sha: str,
+ ) -> JudgeScore | None:
+     prompt = render_prompt(session_text, axis)
+     try:
+         text = await loop.run_in_executor(
+             executor, lambda: _converse_once(client, judge.model_id, prompt)
+         )
+     except Exception as exc:  # noqa: BLE001 — log + skip; the study has 10+ judges
+         logger.warning("judge {} failed on {}/{}: {}", judge.shortname, session_id, axis.name, exc)
+         return None
+     score, rationale = parse_judge_response(text)
+     if score is None:
+         logger.warning(
+             "judge {} returned unparseable response on {}/{}: {!r}",
+             judge.shortname,
+             session_id,
+             axis.name,
+             text[:2000],
+         )
+         # Persist the full text as the rationale so post-hoc triage can
+         # see what the judge actually returned. score=-1 is the sentinel
+         # (``JudgeScore.score`` is a plain ``int``, so ``None`` cannot be
+         # stored).
+         return JudgeScore(
+             session_id=session_id,
+             axis=axis.name,
+             judge_shortname=judge.shortname,
+             judge_model_id=judge.model_id,
+             score=-1,
+             rationale=f"[unparseable] {rationale}",
+             freeze_sha=freeze_sha,
+         )
+     return JudgeScore(
+         session_id=session_id,
+         axis=axis.name,
+         judge_shortname=judge.shortname,
+         judge_model_id=judge.model_id,
+         score=score,
+         rationale=rationale,
+         freeze_sha=freeze_sha,
+     )
+
+
+ async def run_async(
+     sessions: list[tuple[str, str]],
+     panel: list[Judge],
+     axes: list[Axis],
+     freeze_sha: str,
+     concurrency: int = 4,
+     region: str = "us-east-1",
+ ) -> list[JudgeScore]:
+     """Grade every (session, judge, axis) triple concurrently."""
+     client = _bedrock_client(region=region)
+     loop = asyncio.get_running_loop()
+     out: list[JudgeScore] = []
+     with ThreadPoolExecutor(max_workers=concurrency) as executor:
+         sem = asyncio.Semaphore(concurrency)
+
+         async def bounded(coro):
+             async with sem:
+                 return await coro
+
+         tasks = [
+             bounded(_grade_one(loop, executor, client, sid, text, judge, axis, freeze_sha))
+             for sid, text in sessions
+             for judge in panel
+             for axis in axes
+         ]
+         results = await asyncio.gather(*tasks)
+     out.extend(r for r in results if r is not None)
+     return out
+
+
+ def to_parquet(scores: list[JudgeScore], path: Path) -> None:
+     """Write scores to parquet with a stable schema."""
+     if not scores:
+         logger.warning("no scores to write; emitting empty parquet for schema stability")
+     df = pl.DataFrame(
+         {
+             "session_id": [s.session_id for s in scores],
+             "axis": [s.axis for s in scores],
+             "judge_shortname": [s.judge_shortname for s in scores],
+             "judge_model_id": [s.judge_model_id for s in scores],
+             "score": [s.score for s in scores],
+             "rationale": [s.rationale for s in scores],
+             "freeze_sha": [s.freeze_sha for s in scores],
+         },
+         schema={
+             "session_id": pl.String,
+             "axis": pl.String,
+             "judge_shortname": pl.String,
+             "judge_model_id": pl.String,
+             "score": pl.Int64,
+             "rationale": pl.String,
+             "freeze_sha": pl.String,
+         },
+     )
+     df.write_parquet(path)
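+
+
+ # Triage sketch (illustrative): the score=-1 sentinel written by
+ # ``_grade_one`` makes unparseable judge responses easy to pull back out:
+ #
+ #     pl.read_parquet(path).filter(pl.col("score") < 0)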
+
+
+ def run(
+     sessions: list[tuple[str, str]],
+     panel_shortnames: list[str],
+     rubric_yaml_path: Path,
+     freeze_sha: str,
+     out_parquet: Path,
+     *,
+     dry_run: bool = True,
+     concurrency: int = 4,
+     region: str = "us-east-1",
+ ) -> GradePlan | list[JudgeScore]:
+     """Synchronous entry point used by the CLI."""
+     panel = judge_catalog.panel(panel_shortnames)
+     axes = parse_rubric(rubric_yaml_path.read_text(encoding="utf-8"))
+     p = plan(sessions, panel, axes)
+     logger.info(
+         "judge plan: {} calls across {} sessions × {} judges × {} axes; "
+         "~{} in-tok, ~{} out-tok, ~${:.4f}",
+         p.n_calls,
+         p.n_sessions,
+         p.n_judges,
+         p.n_axes,
+         p.est_input_tokens,
+         p.est_output_tokens,
+         p.est_usd,
+     )
+     if dry_run:
+         return p
+     scores = asyncio.run(
+         run_async(sessions, panel, axes, freeze_sha, concurrency=concurrency, region=region)
+     )
+     to_parquet(scores, out_parquet)
+     return scores
claude_sql/judges.py ADDED
@@ -0,0 +1,239 @@
+ """Cross-provider Bedrock judge panel catalog.
+
+ Used by the ``judge`` subcommand to dispatch agent-output grading to a
+ panel of foundation models with diverse training lineages. Goal:
+ ensemble *disagreement* surfaces bias; within-family agreement is the
+ bias we are measuring, not the signal.
+
+ All model IDs below are validated against ``aws bedrock list-foundation-models``
+ in ``us-east-1`` as of 2026-04-21 (lalsaado-handson profile).
+
+ Shortnames are stable CLI aliases so ``--panel kimi-k2.5,deepseek-v3.2``
+ does not rot every time a provider changes its ID suffix.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Literal
+
+ JudgeFamily = Literal["anthropic", "amazon", "non-anthropic-non-amazon"]
+ JudgeRole = Literal["judge", "bulk", "embed", "within-family-holdout"]
+
+
+ @dataclass(frozen=True)
+ class Judge:
+     """One Bedrock foundation model wired into the judge panel."""
+
+     shortname: str
+     model_id: str
+     provider: str
+     family: JudgeFamily
+     role: JudgeRole
+     notes: str
+
+
+ #: Primary ensemble: non-Anthropic, non-Amazon judges for cross-lineage
+ #: variance. Eight judges from seven providers across Chinese and North
+ #: American labs.
+ #:
+ #: Mistral Large 3 and Magistral-Small were dropped after the first
+ #: gym run (study ``ab0bf2eeb481fdd2``, 2026-04-21) because they
+ #: produced 4/50 unparseable responses — the worst rubric discipline of
+ #: the panel. They remain in ``EXCLUDED_JUDGES`` below so the history
+ #: is self-documenting and they can be re-opted-in via ``--panel``.
+ PRIMARY_PANEL: tuple[Judge, ...] = (
+     Judge(
+         shortname="kimi-k2.5",
+         model_id="moonshotai.kimi-k2.5",
+         provider="Moonshot AI",
+         family="non-anthropic-non-amazon",
+         role="judge",
+         notes="Chinese lineage, strong general reasoning. Primary tie-breaker judge.",
+     ),
+     Judge(
+         shortname="kimi-k2-thinking",
+         model_id="moonshot.kimi-k2-thinking",
+         provider="Moonshot AI",
+         family="non-anthropic-non-amazon",
+         role="judge",
+         notes="Thinking variant; use on fabrication_present where CoT matters.",
+     ),
+     Judge(
+         shortname="deepseek-v3.2",
+         model_id="deepseek.v3.2",
+         provider="DeepSeek",
+         family="non-anthropic-non-amazon",
+         role="judge",
+         notes="Different training corpus from Kimi; good disagreement signal.",
+     ),
+     Judge(
+         shortname="minimax-m2.5",
+         model_id="minimax.minimax-m2.5",
+         provider="MiniMax",
+         family="non-anthropic-non-amazon",
+         role="judge",
+         notes="Third Chinese-lab vote for ensemble diversity.",
+     ),
+     Judge(
+         shortname="glm-5",
+         model_id="zai.glm-5",
+         provider="Z.AI",
+         family="non-anthropic-non-amazon",
+         role="judge",
+         notes="GLM-5; GLM-4.7 available as lightweight fallback.",
+     ),
+     Judge(
+         shortname="qwen3-next-80b",
+         model_id="qwen.qwen3-next-80b-a3b",
+         provider="Qwen (Alibaba)",
+         family="non-anthropic-non-amazon",
+         role="judge",
+         notes="Qwen3-Next 80B MoE; structured-output strong.",
+     ),
+     Judge(
+         shortname="palmyra-x5",
+         model_id="writer.palmyra-x5-v1:0",
+         provider="Writer",
+         family="non-anthropic-non-amazon",
+         role="judge",
+         notes="Enterprise-text purpose-trained; different bias axis.",
+     ),
+     Judge(
+         shortname="nemotron-super-3",
+         model_id="nvidia.nemotron-super-3-120b",
+         provider="NVIDIA",
+         family="non-anthropic-non-amazon",
+         role="judge",
+         notes="NVIDIA's largest open reasoning model.",
+     ),
+ )
+
+ #: Judges evaluated and dropped. Reachable via ``resolve()`` /
+ #: ``--panel`` for anyone who wants to re-test them, but not part of
+ #: the default ensemble.
+ EXCLUDED_JUDGES: tuple[Judge, ...] = (
+     Judge(
+         shortname="mistral-large-3",
+         model_id="mistral.mistral-large-3-675b-instruct",
+         provider="Mistral AI",
+         family="non-anthropic-non-amazon",
+         role="judge",
+         notes=(
+             "Dropped 2026-04-21 after study ab0bf2eeb481fdd2: produced 4/50 "
+             "unparseable responses (worst rubric discipline in panel)."
+         ),
+     ),
+     Judge(
+         shortname="magistral-small",
+         model_id="mistral.magistral-small-2509",
+         provider="Mistral AI",
+         family="non-anthropic-non-amazon",
+         role="judge",
+         notes=(
+             "Dropped 2026-04-21 alongside mistral-large-3 (Mistral family "
+             "entirely excluded pending rubric-discipline fix)."
+         ),
+     ),
+ )
+
+ #: Within-family holdout judges. Kept explicitly so the study can
+ #: *measure* the within-family bias rather than silently avoid it.
+ WITHIN_FAMILY_HOLDOUT: tuple[Judge, ...] = (
+     Judge(
+         shortname="opus-4-7",
+         model_id="global.anthropic.claude-opus-4-7",
+         provider="Anthropic",
+         family="anthropic",
+         role="within-family-holdout",
+         notes=(
+             "Delta vs non-Anthropic ensemble = the bias we are measuring. "
+             "Uses global CRIS profile; direct model ID rejects on-demand."
+         ),
+     ),
+     Judge(
+         shortname="sonnet-4-6",
+         model_id="global.anthropic.claude-sonnet-4-6",
+         provider="Anthropic",
+         family="anthropic",
+         role="within-family-holdout",
+         notes="Intra-family agreement is its own data point. Uses global CRIS.",
+     ),
+ )
+
+ #: Bulk Amazon lane: cheap, fast, current-gen only. Nova Pro v1 is
+ #: explicitly excluded as stale.
+ BULK_PANEL: tuple[Judge, ...] = (
+     Judge(
+         shortname="nova-2-lite",
+         model_id="amazon.nova-2-lite-v1:0:256k",
+         provider="Amazon",
+         family="amazon",
+         role="bulk",
+         notes="Bulk classifier: hedge counting, entity spotting, reversal markers.",
+     ),
+     Judge(
+         shortname="nova-2-mm-embed",
+         model_id="amazon.nova-2-multimodal-embeddings-v1:0",
+         provider="Amazon",
+         family="amazon",
+         role="embed",
+         notes="Embedding path for dedup-neighbors cosine-contamination filter.",
+     ),
+ )
+
+ #: Flat lookup keyed by shortname (CLI-facing) and by model_id (internal).
+ #: Includes ``EXCLUDED_JUDGES`` so ``--panel mistral-large-3`` still
+ #: resolves for anyone re-testing a dropped judge.
+ _ALL: tuple[Judge, ...] = (
+     *PRIMARY_PANEL,
+     *WITHIN_FAMILY_HOLDOUT,
+     *BULK_PANEL,
+     *EXCLUDED_JUDGES,
+ )
+ _BY_SHORTNAME: dict[str, Judge] = {j.shortname: j for j in _ALL}
+ _BY_MODEL_ID: dict[str, Judge] = {j.model_id: j for j in _ALL}
+
+
+ def resolve(name: str) -> Judge:
+     """Resolve a shortname or model ID to a ``Judge``.
+
+     Raises ``KeyError`` listing every catalog shortname when the name is
+     unknown — agents parsing stderr get a concrete hint on what is
+     available.
+     """
+     if name in _BY_SHORTNAME:
+         return _BY_SHORTNAME[name]
+     if name in _BY_MODEL_ID:
+         return _BY_MODEL_ID[name]
+     available = ", ".join(sorted(_BY_SHORTNAME))
+     raise KeyError(f"unknown judge {name!r}; available: {available}")
+
+
+ def panel(names: list[str] | tuple[str, ...]) -> list[Judge]:
+     """Resolve a list of shortnames/model IDs into Judge records, preserving order."""
+     return [resolve(n) for n in names]
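+
+
+ # Illustrative resolution (shortnames from the catalog above; the failing
+ # name is deliberately one that is *not* in the catalog):
+ #
+ #     panel(["kimi-k2.5", "deepseek-v3.2"])        # -> [Judge(...), Judge(...)]
+ #     resolve("global.anthropic.claude-opus-4-7")  # model IDs resolve too
+ #     resolve("nova-pro")  # KeyError: unknown judge 'nova-pro'; available: ...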
+
+
+ def all_primary() -> tuple[Judge, ...]:
+     """Return the full primary (non-within-family) panel."""
+     return PRIMARY_PANEL
+
+
+ def all_within_family() -> tuple[Judge, ...]:
+     """Return the within-family holdout panel."""
+     return WITHIN_FAMILY_HOLDOUT
+
+
+ def all_bulk() -> tuple[Judge, ...]:
+     """Return the Amazon bulk/embed lane."""
+     return BULK_PANEL
+
+
+ def all_excluded() -> tuple[Judge, ...]:
+     """Return judges that were evaluated and dropped from the primary panel."""
+     return EXCLUDED_JUDGES
+
+
+ def catalog() -> list[Judge]:
+     """Every judge in the catalog, primary first, then holdout, bulk, excluded."""
+     return [*PRIMARY_PANEL, *WITHIN_FAMILY_HOLDOUT, *BULK_PANEL, *EXCLUDED_JUDGES]