claude-sql 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_sql/__init__.py +5 -0
- claude_sql/binding.py +740 -0
- claude_sql/blind_handover.py +155 -0
- claude_sql/checkpointer.py +202 -0
- claude_sql/cli.py +2344 -0
- claude_sql/cluster_worker.py +208 -0
- claude_sql/community_worker.py +306 -0
- claude_sql/config.py +380 -0
- claude_sql/embed_worker.py +482 -0
- claude_sql/freeze.py +189 -0
- claude_sql/friction_worker.py +561 -0
- claude_sql/install_source.py +77 -0
- claude_sql/judge_worker.py +459 -0
- claude_sql/judges.py +239 -0
- claude_sql/kappa_worker.py +257 -0
- claude_sql/llm_worker.py +1760 -0
- claude_sql/logging_setup.py +95 -0
- claude_sql/output.py +248 -0
- claude_sql/parquet_shards.py +172 -0
- claude_sql/retry_queue.py +180 -0
- claude_sql/review_sheet_render.py +167 -0
- claude_sql/review_sheet_worker.py +463 -0
- claude_sql/schemas.py +454 -0
- claude_sql/session_text.py +387 -0
- claude_sql/skills_catalog.py +354 -0
- claude_sql/sql_views.py +1751 -0
- claude_sql/terms_worker.py +145 -0
- claude_sql/ungrounded_worker.py +190 -0
- claude_sql-0.4.0.dist-info/METADATA +530 -0
- claude_sql-0.4.0.dist-info/RECORD +32 -0
- claude_sql-0.4.0.dist-info/WHEEL +4 -0
- claude_sql-0.4.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,459 @@
|
|
|
1
|
+
"""Multi-provider judge worker: runs a panel of Bedrock models over sessions.
|
|
2
|
+
|
|
3
|
+
Uses the Bedrock **Converse API** (``bedrock-runtime.converse``) which is
|
|
4
|
+
the only path that works uniformly across Anthropic, Moonshot, DeepSeek,
|
|
5
|
+
MiniMax, Mistral, Z.AI, Qwen, Writer, and NVIDIA Nemotron on Bedrock.
|
|
6
|
+
|
|
7
|
+
For each (session, axis) the worker dispatches one call per judge and
|
|
8
|
+
writes a parquet row with the score and free-text rationale. No tool_use
|
|
9
|
+
machinery — the prompt instructs each judge to output `score=<int>\\n
|
|
10
|
+
rationale=<text>` and we parse it. Stable across model families; the
|
|
11
|
+
tradeoff is we rely on rubric-prompted discipline rather than schema
|
|
12
|
+
enforcement. That is deliberate: a structured-output API tied to one
|
|
13
|
+
provider would leak within-family bias back into the ensemble.
|
|
14
|
+
|
|
15
|
+
Cost guard: defaults to ``dry_run=True`` per the project convention —
|
|
16
|
+
emits a plan dict with ``(n_calls, est_tokens, est_dollars)`` rather
|
|
17
|
+
than spending.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import asyncio
|
|
23
|
+
import json
|
|
24
|
+
import re
|
|
25
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any
|
|
29
|
+
|
|
30
|
+
import boto3
|
|
31
|
+
import polars as pl
|
|
32
|
+
from botocore.config import Config as BotoConfig
|
|
33
|
+
from botocore.exceptions import (
|
|
34
|
+
ClientError,
|
|
35
|
+
ConnectionError as BotoConnectionError,
|
|
36
|
+
EndpointConnectionError,
|
|
37
|
+
ReadTimeoutError,
|
|
38
|
+
SSLError,
|
|
39
|
+
)
|
|
40
|
+
from loguru import logger
|
|
41
|
+
from tenacity import (
|
|
42
|
+
retry,
|
|
43
|
+
retry_if_exception,
|
|
44
|
+
stop_after_attempt,
|
|
45
|
+
wait_exponential,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
from claude_sql import judges as judge_catalog
|
|
49
|
+
from claude_sql.judges import Judge
|
|
50
|
+
from claude_sql.logging_setup import loguru_before_sleep
|
|
51
|
+
|
|
52
|
+
# Bedrock error codes that `_is_retryable` treats as transient and safe
# to retry with backoff. Anything else (validation, auth, model-not-found)
# fails fast instead of burning retry attempts.
_RETRY_CODES: set[str] = {
    "ThrottlingException",
    "ServiceUnavailableException",
    "ModelTimeoutException",
    "ModelErrorException",
    "InternalServerException",
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _is_retryable(exc: BaseException) -> bool:
    """Decide whether a failed Bedrock Converse call should be retried.

    Mirrors the ``embed_worker`` / ``llm_worker`` retry policy so
    throttling behaves the same across the whole CLI: transport-level
    failures always retry; ``ClientError`` retries only for the
    transient codes in ``_RETRY_CODES``.
    """
    transport_errors = (SSLError, BotoConnectionError, EndpointConnectionError, ReadTimeoutError)
    if isinstance(exc, transport_errors):
        return True
    if not isinstance(exc, ClientError):
        return False
    error_block = exc.response.get("Error", {})
    return error_block.get("Code", "") in _RETRY_CODES
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Rough per-1M-token cost estimates (USD). Conservative; updated as
|
|
76
|
+
# Bedrock publishes list prices. Missing entries fall through to a
|
|
77
|
+
# 2.00/10.00 fallback.
|
|
78
|
+
_JUDGE_PRICING: dict[str, tuple[float, float]] = {
|
|
79
|
+
"moonshotai.kimi-k2.5": (0.60, 2.50),
|
|
80
|
+
"moonshot.kimi-k2-thinking": (1.00, 4.00),
|
|
81
|
+
"deepseek.v3.2": (0.30, 1.20),
|
|
82
|
+
"minimax.minimax-m2.5": (0.50, 2.00),
|
|
83
|
+
"zai.glm-5": (0.60, 2.40),
|
|
84
|
+
"qwen.qwen3-next-80b-a3b": (0.80, 3.20),
|
|
85
|
+
"mistral.mistral-large-3-675b-instruct": (3.00, 12.00),
|
|
86
|
+
"mistral.magistral-small-2509": (0.40, 1.60),
|
|
87
|
+
"writer.palmyra-x5-v1:0": (2.50, 10.00),
|
|
88
|
+
"nvidia.nemotron-super-3-120b": (1.20, 4.80),
|
|
89
|
+
"anthropic.claude-opus-4-7": (15.00, 75.00),
|
|
90
|
+
"anthropic.claude-sonnet-4-6": (3.00, 15.00),
|
|
91
|
+
"amazon.nova-2-lite-v1:0:256k": (0.06, 0.24),
|
|
92
|
+
}
|
|
93
|
+
_FALLBACK_PRICE: tuple[float, float] = (2.00, 10.00)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass(frozen=True)
class GradePlan:
    """Dry-run plan: what ``run`` would do if ``dry_run=False``."""

    n_sessions: int  # sessions that would be graded
    n_judges: int  # judges in the resolved panel
    n_axes: int  # rubric axes
    n_calls: int  # n_sessions * n_judges * n_axes Converse calls
    est_input_tokens: int  # chars/4 heuristic summed over all rendered prompts
    est_output_tokens: int  # flat per-call output budget, summed
    est_usd: float  # dollar estimate, rounded to 4 decimal places
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass(frozen=True)
class Axis:
    """One scoring axis from the rubric."""

    name: str  # stable axis identifier, e.g. "correction_required"
    description: str  # free-text definition rendered into the judge prompt
    levels: dict[int, str]  # score -> level description
    detector_vs_grader: str  # "detector" or "grader"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def parse_rubric(rubric_yaml: str) -> list[Axis]:
    """Parse a rubric YAML document into ``Axis`` records.

    Rubrics are agent-authored and shape-constrained, so only a small
    YAML subset is expected: key:value pairs, ``-`` list items, 2-space
    indent. Expected shape::

        axes:
          - name: correction_required
            description: "..."
            detector_vs_grader: detector
            levels:
              0: "no correction needed"
              1: "correction needed"

    Raises ``TypeError`` on a malformed document shape and ``ValueError``
    when an axis has no name or the rubric defines no axes at all.
    """
    import yaml  # lazy import; add pyyaml to deps if unused elsewhere

    document = yaml.safe_load(rubric_yaml) or {}
    entries = document.get("axes") or []
    if not isinstance(entries, list):
        raise TypeError("rubric.axes must be a list")

    axes: list[Axis] = []
    for idx, item in enumerate(entries):
        if not isinstance(item, dict):
            raise TypeError(f"rubric.axes[{idx}] must be a mapping")
        axis_name = item.get("name")
        if not axis_name:
            raise ValueError(f"rubric.axes[{idx}] is missing 'name'")
        raw_levels = item.get("levels") or {}
        axes.append(
            Axis(
                name=str(axis_name),
                description=str(item.get("description", "")),
                levels={int(level): str(desc) for level, desc in raw_levels.items()},
                detector_vs_grader=str(item.get("detector_vs_grader", "grader")),
            )
        )

    if not axes:
        raise ValueError("rubric has zero axes")
    return axes
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def render_prompt(session_text: str, axis: Axis) -> str:
    """Build the single-turn grading prompt for one (transcript, axis) pair.

    We rely on prompt discipline rather than schema enforcement so the
    same prompt works across non-Anthropic judges; keep it minimal and
    explicit.
    """
    levels_block = "\n".join(f"  {level}: {desc}" for level, desc in sorted(axis.levels.items()))
    prompt_lines = [
        "You are scoring an agent transcript on ONE axis.",
        "",
        f"Axis name: {axis.name}",
        f"Axis description: {axis.description}",
        f"Scoring levels:\n{levels_block}",
        "",
        "Respond in EXACTLY this format, nothing else:",
        "score=<int>",
        "rationale=<one-paragraph explanation>",
        "",
        "=== TRANSCRIPT BEGIN ===",
        session_text,
        "=== TRANSCRIPT END ===",
    ]
    return "\n".join(prompt_lines) + "\n"
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# score=<int> on its own line; MULTILINE so prose before it is tolerated.
_SCORE_RE = re.compile(r"^\s*score\s*=\s*(-?\d+)\s*$", re.MULTILINE)
# DOTALL so a multi-line rationale is captured to end-of-text.
_RATIONALE_RE = re.compile(r"^\s*rationale\s*=\s*(.*)$", re.MULTILINE | re.DOTALL)


def parse_judge_response(text: str) -> tuple[int | None, str]:
    """Extract ``(score, rationale)`` from a free-form judge response.

    Tolerant: if the format is slightly off (JSON object, extra prose
    before the fields, etc.), we still try to recover a score. When the
    response is a JSON object we now recover the ``rationale`` field as
    well, instead of returning the raw JSON text as the rationale. If we
    cannot parse an integer score, return ``(None, <raw text>)`` so
    callers can log the refusal and continue.
    """
    m = _SCORE_RE.search(text)
    score: int | None = int(m.group(1)) if m else None
    if score is None:
        # JSON-shaped fallback: some judges answer
        # {"score": ..., "rationale": ...} despite the prompt.
        try:
            obj = json.loads(text)
        except ValueError:  # includes json.JSONDecodeError
            obj = None
        if isinstance(obj, dict) and "score" in obj:
            try:
                score = int(obj["score"])
            except (ValueError, TypeError):
                score = None
            else:
                json_rationale = obj.get("rationale")
                if isinstance(json_rationale, str) and json_rationale.strip():
                    return score, json_rationale.strip()
    r = _RATIONALE_RE.search(text)
    rationale = r.group(1).strip() if r else text.strip()
    return score, rationale
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
# ---------------------------------------------------------------------------
|
|
214
|
+
# Bedrock Converse dispatch
|
|
215
|
+
# ---------------------------------------------------------------------------
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _bedrock_client(region: str = "us-east-1") -> Any:
    """Build a bedrock-runtime client with tuned timeouts.

    Botocore's built-in retries are disabled here so tenacity (around
    ``_converse_once``) is the single owner of backoff policy.
    """
    settings = {
        "region_name": region,
        "retries": {"max_attempts": 0, "mode": "standard"},
        "read_timeout": 120,
        "connect_timeout": 10,
    }
    return boto3.client("bedrock-runtime", config=BotoConfig(**settings))
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
@retry(
    stop=stop_after_attempt(10),
    wait=wait_exponential(multiplier=2, min=2, max=60),
    retry=retry_if_exception(_is_retryable),
    before_sleep=loguru_before_sleep("WARNING"),
    reraise=True,
)
def _converse_once(
    client: Any, model_id: str, prompt: str, max_tokens: int = 4096, temperature: float = 0.0
) -> str:
    """Issue one Bedrock Converse call and return the plain-text reply.

    Throttling / ServiceUnavailable / connection errors are retried with
    exponential backoff (10 attempts, 2–60s). Botocore's own retry
    policy is disabled in ``_bedrock_client`` so tenacity owns backoff.
    Returns ``""`` when the response contains no text content block.
    """
    user_message = {"role": "user", "content": [{"text": prompt}]}
    inference = {"maxTokens": max_tokens, "temperature": temperature}
    resp = client.converse(
        modelId=model_id,
        messages=[user_message],
        inferenceConfig=inference,
    )
    content = resp.get("output", {}).get("message", {}).get("content", []) or []
    return next((part["text"] for part in content if "text" in part), "")
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
# ---------------------------------------------------------------------------
|
|
262
|
+
# Planning (dry-run) # noqa: ERA001 — section header, not commented-out code
|
|
263
|
+
# ---------------------------------------------------------------------------
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def estimate_tokens(text: str) -> int:
    """Approximate the token count of ``text`` as ``chars / 4``.

    Standard Bedrock heuristic; never returns less than 1, so even an
    empty prompt counts as one token in the plan.
    """
    approx = len(text) // 4
    return approx if approx >= 1 else 1
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def plan(
    sessions: list[tuple[str, str]],
    panel: list[Judge],
    axes: list[Axis],
    out_tokens: int = 256,
) -> GradePlan:
    """Estimate call count plus token and dollar cost of a grade run.

    Input tokens come from rendering each (session, axis) prompt once;
    pricing is looked up per judge ``model_id`` in ``_JUDGE_PRICING``
    with ``_FALLBACK_PRICE`` for unknown models. ``out_tokens`` is the
    flat per-call output budget assumed for every judge.
    """
    total_in = 0
    total_out = 0
    total_usd = 0.0
    for _sid, transcript in sessions:
        for axis in axes:
            # The prompt is identical for every judge; render it once.
            tin = estimate_tokens(render_prompt(transcript, axis))
            for judge in panel:
                total_in += tin
                total_out += out_tokens
                in_price, out_price = _JUDGE_PRICING.get(judge.model_id, _FALLBACK_PRICE)
                total_usd += (tin / 1_000_000) * in_price + (out_tokens / 1_000_000) * out_price
    return GradePlan(
        n_sessions=len(sessions),
        n_judges=len(panel),
        n_axes=len(axes),
        n_calls=len(sessions) * len(panel) * len(axes),
        est_input_tokens=total_in,
        est_output_tokens=total_out,
        est_usd=round(total_usd, 4),
    )
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
# ---------------------------------------------------------------------------
|
|
304
|
+
# Execution
|
|
305
|
+
# ---------------------------------------------------------------------------
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
@dataclass(frozen=True)
class JudgeScore:
    """One row of the output parquet."""

    session_id: str  # transcript that was graded
    axis: str  # Axis.name this score applies to
    judge_shortname: str  # stable CLI alias of the judge
    judge_model_id: str  # Bedrock model ID that was invoked
    score: int  # rubric level; -1 marks an unparseable judge response
    rationale: str  # judge free text; prefixed "[unparseable] " on parse failure
    freeze_sha: str  # provenance marker passed through from the caller
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
async def _grade_one(
    loop: asyncio.AbstractEventLoop,
    executor: ThreadPoolExecutor,
    client: Any,
    session_id: str,
    session_text: str,
    judge: Judge,
    axis: Axis,
    freeze_sha: str,
) -> JudgeScore | None:
    """Grade one (session, judge, axis) triple.

    Returns ``None`` when the Converse call itself fails (after the
    retry policy gives up); an unparseable-but-successful response is
    recorded as a row with ``score=-1`` so triage can inspect it.
    """
    prompt = render_prompt(session_text, axis)

    def _row(score: int, rationale: str) -> JudgeScore:
        # Identity columns are identical for every outcome; only
        # score/rationale vary.
        return JudgeScore(
            session_id=session_id,
            axis=axis.name,
            judge_shortname=judge.shortname,
            judge_model_id=judge.model_id,
            score=score,
            rationale=rationale,
            freeze_sha=freeze_sha,
        )

    try:
        reply = await loop.run_in_executor(
            executor, lambda: _converse_once(client, judge.model_id, prompt)
        )
    except Exception as exc:  # noqa: BLE001 — log + skip; the study has 10+ judges
        logger.warning("judge {} failed on {}/{}: {}", judge.shortname, session_id, axis.name, exc)
        return None

    score, rationale = parse_judge_response(reply)
    if score is not None:
        return _row(score, rationale)

    logger.warning(
        "judge {} returned unparseable response on {}/{}: {!r}",
        judge.shortname,
        session_id,
        axis.name,
        reply[:2000],
    )
    # Persist the full text as the rationale so post-hoc triage can see
    # what the judge actually returned. score=-1 is the sentinel.
    return _row(-1, f"[unparseable] {rationale}")
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
async def run_async(
    sessions: list[tuple[str, str]],
    panel: list[Judge],
    axes: list[Axis],
    freeze_sha: str,
    concurrency: int = 4,
    region: str = "us-east-1",
) -> list[JudgeScore]:
    """Grade every (session, judge, axis) triple concurrently.

    ``concurrency`` bounds both the thread pool running the blocking
    boto3 calls and the semaphore gating coroutine dispatch. Failed
    calls (``_grade_one`` returning ``None``) are dropped from the
    result.
    """
    client = _bedrock_client(region=region)
    loop = asyncio.get_running_loop()
    scores: list[JudgeScore] = []
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        gate = asyncio.Semaphore(concurrency)

        async def throttled(coro):
            async with gate:
                return await coro

        pending = []
        for sid, transcript in sessions:
            for judge in panel:
                for axis in axes:
                    pending.append(
                        throttled(
                            _grade_one(loop, pool, client, sid, transcript, judge, axis, freeze_sha)
                        )
                    )
        results = await asyncio.gather(*pending)
        scores.extend(row for row in results if row is not None)
    return scores
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def to_parquet(scores: list[JudgeScore], path: Path) -> None:
    """Write scores to parquet with a stable schema.

    An empty score list still produces a zero-row parquet file so
    downstream readers always find the expected columns.
    """
    if not scores:
        logger.warning("no scores to write; emitting empty parquet for schema stability")
    schema = {
        "session_id": pl.String,
        "axis": pl.String,
        "judge_shortname": pl.String,
        "judge_model_id": pl.String,
        "score": pl.Int64,
        "rationale": pl.String,
        "freeze_sha": pl.String,
    }
    # Column names match JudgeScore field names one-to-one.
    columns = {name: [getattr(s, name) for s in scores] for name in schema}
    pl.DataFrame(columns, schema=schema).write_parquet(path)
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def run(
    sessions: list[tuple[str, str]],
    panel_shortnames: list[str],
    rubric_yaml_path: Path,
    freeze_sha: str,
    out_parquet: Path,
    *,
    dry_run: bool = True,
    concurrency: int = 4,
    region: str = "us-east-1",
) -> GradePlan | list[JudgeScore]:
    """Synchronous entry point used by the CLI.

    Always logs the cost plan first. With ``dry_run=True`` (the default,
    per project convention) returns the ``GradePlan`` without spending;
    otherwise grades everything, writes the parquet, and returns the
    scores.
    """
    resolved_panel = judge_catalog.panel(panel_shortnames)
    axes = parse_rubric(rubric_yaml_path.read_text(encoding="utf-8"))
    cost = plan(sessions, resolved_panel, axes)
    logger.info(
        "judge plan: {} calls across {} sessions × {} judges × {} axes; "
        "~{} in-tok, ~{} out-tok, ~${:.4f}",
        cost.n_calls,
        cost.n_sessions,
        cost.n_judges,
        cost.n_axes,
        cost.est_input_tokens,
        cost.est_output_tokens,
        cost.est_usd,
    )
    if dry_run:
        return cost
    graded = asyncio.run(
        run_async(sessions, resolved_panel, axes, freeze_sha, concurrency=concurrency, region=region)
    )
    to_parquet(graded, out_parquet)
    return graded
|
claude_sql/judges.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""Cross-provider Bedrock judge panel catalog.
|
|
2
|
+
|
|
3
|
+
Used by the ``judge`` subcommand to dispatch agent-output grading to a
|
|
4
|
+
panel of foundation models with diverse training lineages. Goal:
|
|
5
|
+
ensemble *disagreement* surfaces bias; within-family agreement is the
|
|
6
|
+
bias we are measuring, not the signal.
|
|
7
|
+
|
|
8
|
+
All model IDs below are validated against ``aws bedrock list-foundation-models``
|
|
9
|
+
in ``us-east-1`` as of 2026-04-21 (lalsaado-handson profile).
|
|
10
|
+
|
|
11
|
+
Shortnames are stable CLI aliases so ``--panel kimi-k2.5,deepseek-v3.2``
|
|
12
|
+
does not rot every time a provider changes its ID suffix.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import Literal
|
|
19
|
+
|
|
20
|
+
# "non-anthropic-non-amazon" marks the primary-ensemble lineage bucket:
# judges independent of both the Anthropic holdout and the Amazon bulk lane.
JudgeFamily = Literal["anthropic", "amazon", "non-anthropic-non-amazon"]
# Role decides which panel tuple a judge lives in (primary, bulk, holdout).
JudgeRole = Literal["judge", "bulk", "embed", "within-family-holdout"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True)
class Judge:
    """One Bedrock foundation model wired into the judge panel."""

    shortname: str  # stable CLI alias (e.g. ``--panel kimi-k2.5``)
    model_id: str  # exact Bedrock model ID passed to Converse
    provider: str  # human-readable lab name
    family: JudgeFamily  # lineage bucket used in the bias analysis
    role: JudgeRole  # which panel lane this judge belongs to
    notes: str  # selection/exclusion rationale, kept for history
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
#: Primary ensemble: non-Anthropic, non-Amazon judges for cross-lineage
|
|
37
|
+
#: variance. Eight distinct training corpora across Chinese, North
|
|
38
|
+
#: American, European labs.
|
|
39
|
+
#:
|
|
40
|
+
#: Mistral Large 3 and Magistral-Small were dropped after the first
|
|
41
|
+
#: gym run (study ``ab0bf2eeb481fdd2``, 2026-04-21) because they
|
|
42
|
+
#: produced 4/50 unparseable responses \u2014 worst rubric discipline of
|
|
43
|
+
#: the panel. They remain in ``EXCLUDED_JUDGES`` below so the history
|
|
44
|
+
#: is self-documenting and they can be re-opted-in via ``--panel``.
|
|
45
|
+
# Tuple (not list) so the default ensemble is immutable at runtime.
PRIMARY_PANEL: tuple[Judge, ...] = (
    Judge(
        shortname="kimi-k2.5",
        model_id="moonshotai.kimi-k2.5",
        provider="Moonshot AI",
        family="non-anthropic-non-amazon",
        role="judge",
        notes="Chinese lineage, strong general reasoning. Primary tie-breaker judge.",
    ),
    Judge(
        shortname="kimi-k2-thinking",
        model_id="moonshot.kimi-k2-thinking",
        provider="Moonshot AI",
        family="non-anthropic-non-amazon",
        role="judge",
        notes="Thinking variant; use on fabrication_present where CoT matters.",
    ),
    Judge(
        shortname="deepseek-v3.2",
        model_id="deepseek.v3.2",
        provider="DeepSeek",
        family="non-anthropic-non-amazon",
        role="judge",
        notes="Different training corpus from Kimi; good disagreement signal.",
    ),
    Judge(
        shortname="minimax-m2.5",
        model_id="minimax.minimax-m2.5",
        provider="MiniMax",
        family="non-anthropic-non-amazon",
        role="judge",
        notes="Third Chinese-lab vote for ensemble diversity.",
    ),
    Judge(
        shortname="glm-5",
        model_id="zai.glm-5",
        provider="Z.AI",
        family="non-anthropic-non-amazon",
        role="judge",
        notes="GLM-5; GLM-4.7 available as lightweight fallback.",
    ),
    Judge(
        shortname="qwen3-next-80b",
        model_id="qwen.qwen3-next-80b-a3b",
        provider="Qwen (Alibaba)",
        family="non-anthropic-non-amazon",
        role="judge",
        notes="Qwen3-Next 80B MoE; structured-output strong.",
    ),
    Judge(
        shortname="palmyra-x5",
        model_id="writer.palmyra-x5-v1:0",
        provider="Writer",
        family="non-anthropic-non-amazon",
        role="judge",
        notes="Enterprise-text purpose-trained; different bias axis.",
    ),
    Judge(
        shortname="nemotron-super-3",
        model_id="nvidia.nemotron-super-3-120b",
        provider="NVIDIA",
        family="non-anthropic-non-amazon",
        role="judge",
        notes="NVIDIA's largest open reasoning model.",
    ),
)
|
|
111
|
+
|
|
112
|
+
#: Judges evaluated and dropped. Reachable via ``resolve()`` /
|
|
113
|
+
#: ``--panel`` for anyone who wants to re-test them, but not part of
|
|
114
|
+
#: the default ensemble.
|
|
115
|
+
# Not in any default panel, but still included in the flat lookups so a
# dropped judge can be re-tested explicitly via ``--panel``.
EXCLUDED_JUDGES: tuple[Judge, ...] = (
    Judge(
        shortname="mistral-large-3",
        model_id="mistral.mistral-large-3-675b-instruct",
        provider="Mistral AI",
        family="non-anthropic-non-amazon",
        role="judge",
        notes=(
            "Dropped 2026-04-21 after study ab0bf2eeb481fdd2: produced 4/50 "
            "unparseable responses (worst rubric discipline in panel)."
        ),
    ),
    Judge(
        shortname="magistral-small",
        model_id="mistral.magistral-small-2509",
        provider="Mistral AI",
        family="non-anthropic-non-amazon",
        role="judge",
        notes=(
            "Dropped 2026-04-21 alongside mistral-large-3 (Mistral family "
            "entirely excluded pending rubric-discipline fix)."
        ),
    ),
)
|
|
139
|
+
|
|
140
|
+
#: Within-family holdout judges. Kept explicitly so the study can
|
|
141
|
+
#: *measure* the within-family bias rather than silently avoid it.
|
|
142
|
+
# NOTE: these use "global."-prefixed CRIS-profile model IDs — per the
# notes below, the bare Anthropic IDs reject on-demand invocation.
WITHIN_FAMILY_HOLDOUT: tuple[Judge, ...] = (
    Judge(
        shortname="opus-4-7",
        model_id="global.anthropic.claude-opus-4-7",
        provider="Anthropic",
        family="anthropic",
        role="within-family-holdout",
        notes=(
            "Delta vs non-Anthropic ensemble = the bias we are measuring. "
            "Uses global CRIS profile; direct model ID rejects on-demand."
        ),
    ),
    Judge(
        shortname="sonnet-4-6",
        model_id="global.anthropic.claude-sonnet-4-6",
        provider="Anthropic",
        family="anthropic",
        role="within-family-holdout",
        notes="Intra-family agreement is its own data point. Uses global CRIS.",
    ),
)
|
|
163
|
+
|
|
164
|
+
#: Bulk Amazon lane: cheap, fast, current-gen only. Nova Pro v1 is
|
|
165
|
+
#: explicitly excluded as stale.
|
|
166
|
+
# nova-2-mm-embed carries role="embed" — it is not a grader, but lives in
# the same catalog so the Amazon lane resolves through the same lookups.
BULK_PANEL: tuple[Judge, ...] = (
    Judge(
        shortname="nova-2-lite",
        model_id="amazon.nova-2-lite-v1:0:256k",
        provider="Amazon",
        family="amazon",
        role="bulk",
        notes="Bulk classifier: hedge counting, entity spotting, reversal markers.",
    ),
    Judge(
        shortname="nova-2-mm-embed",
        model_id="amazon.nova-2-multimodal-embeddings-v1:0",
        provider="Amazon",
        family="amazon",
        role="embed",
        notes="Embedding path for dedup-neighbors cosine-contamination filter.",
    ),
)
|
|
184
|
+
|
|
185
|
+
#: Flat lookup keyed by shortname (CLI-facing) and by model_id (internal).
|
|
186
|
+
#: Includes ``EXCLUDED_JUDGES`` so ``--panel mistral-large-3`` still
|
|
187
|
+
#: resolves for anyone re-testing a dropped judge.
|
|
188
|
+
_ALL: tuple[Judge, ...] = PRIMARY_PANEL + WITHIN_FAMILY_HOLDOUT + BULK_PANEL + EXCLUDED_JUDGES
# Two views over the same records: shortname is the CLI-facing key,
# model_id the internal one.
_BY_SHORTNAME: dict[str, Judge] = {entry.shortname: entry for entry in _ALL}
_BY_MODEL_ID: dict[str, Judge] = {entry.model_id: entry for entry in _ALL}
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def resolve(name: str) -> Judge:
    """Resolve a shortname or model ID to a ``Judge``.

    Shortnames win over model IDs when both could match. Raises
    ``KeyError`` listing the full shortname catalog when the name is
    unknown — agents parsing stderr get a concrete hint on what is
    available.
    """
    found = _BY_SHORTNAME.get(name)
    if found is None:
        found = _BY_MODEL_ID.get(name)
    if found is not None:
        return found
    available = ", ".join(sorted(_BY_SHORTNAME))
    raise KeyError(f"unknown judge {name!r}; available: {available}")


def panel(names: list[str] | tuple[str, ...]) -> list[Judge]:
    """Resolve shortnames/model IDs into ``Judge`` records, preserving input order."""
    return [resolve(alias) for alias in names]
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def all_primary() -> tuple[Judge, ...]:
    """The full primary (non-within-family) panel."""
    return PRIMARY_PANEL


def all_within_family() -> tuple[Judge, ...]:
    """The within-family holdout panel."""
    return WITHIN_FAMILY_HOLDOUT


def all_bulk() -> tuple[Judge, ...]:
    """The Amazon bulk/embed lane."""
    return BULK_PANEL


def all_excluded() -> tuple[Judge, ...]:
    """Judges that were evaluated and dropped from the primary panel."""
    return EXCLUDED_JUDGES


def catalog() -> list[Judge]:
    """Every judge in the catalog: primary first, then holdout, bulk, excluded."""
    return list(PRIMARY_PANEL + WITHIN_FAMILY_HOLDOUT + BULK_PANEL + EXCLUDED_JUDGES)
|