claude-sql 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_sql/__init__.py +5 -0
- claude_sql/binding.py +740 -0
- claude_sql/blind_handover.py +155 -0
- claude_sql/checkpointer.py +202 -0
- claude_sql/cli.py +2344 -0
- claude_sql/cluster_worker.py +208 -0
- claude_sql/community_worker.py +306 -0
- claude_sql/config.py +380 -0
- claude_sql/embed_worker.py +482 -0
- claude_sql/freeze.py +189 -0
- claude_sql/friction_worker.py +561 -0
- claude_sql/install_source.py +77 -0
- claude_sql/judge_worker.py +459 -0
- claude_sql/judges.py +239 -0
- claude_sql/kappa_worker.py +257 -0
- claude_sql/llm_worker.py +1760 -0
- claude_sql/logging_setup.py +95 -0
- claude_sql/output.py +248 -0
- claude_sql/parquet_shards.py +172 -0
- claude_sql/retry_queue.py +180 -0
- claude_sql/review_sheet_render.py +167 -0
- claude_sql/review_sheet_worker.py +463 -0
- claude_sql/schemas.py +454 -0
- claude_sql/session_text.py +387 -0
- claude_sql/skills_catalog.py +354 -0
- claude_sql/sql_views.py +1751 -0
- claude_sql/terms_worker.py +145 -0
- claude_sql/ungrounded_worker.py +190 -0
- claude_sql-0.4.0.dist-info/METADATA +530 -0
- claude_sql-0.4.0.dist-info/RECORD +32 -0
- claude_sql-0.4.0.dist-info/WHEEL +4 -0
- claude_sql-0.4.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""Inter-rater reliability: Cohen's & Fleiss' kappa + bootstrapped CIs.
|
|
2
|
+
|
|
3
|
+
Consumes one or more judge-score parquets written by ``judge_worker``
|
|
4
|
+
(schema: ``session_id, axis, judge_shortname, score``) and computes:
|
|
5
|
+
|
|
6
|
+
1. **Cohen's kappa** for every pair of judges on every axis.
|
|
7
|
+
2. **Fleiss' kappa** across all judges on every axis (when ≥3 judges).
|
|
8
|
+
3. **Bootstrapped 95% CI** on both statistics via 1000 resamples.
|
|
9
|
+
4. **Stopping-rule gate**: with ``--floor 0.6 --delta-gate <prior.parquet>``
|
|
10
|
+
returns non-zero exit if the delta-kappa CI excludes zero, matching
|
|
11
|
+
the pre-registered rebaseline policy from the Bonk↔Clod session.
|
|
12
|
+
|
|
13
|
+
No Bedrock calls. Pure stats. Safe to run unlimited times.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
import polars as pl
|
|
23
|
+
|
|
24
|
+
# Default seed handed to ``np.random.default_rng`` by the bootstrap and gate
# functions in this module, fixed so repeated runs produce the same intervals.
RNG_SEED = 42
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class PairKappa:
    """Immutable result row: Cohen's kappa for one judge pair on one axis.

    Fields, in constructor order:

    * ``axis`` -- the axis the two judges were compared on.
    * ``judge_a`` / ``judge_b`` -- short names of the two judges.
    * ``n_items`` -- number of items the statistic was computed over.
    * ``kappa`` -- the point estimate.
    * ``ci_low`` / ``ci_high`` -- bounds of the bootstrap interval.
    """

    axis: str
    judge_a: str
    judge_b: str
    n_items: int
    kappa: float
    ci_low: float
    ci_high: float
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass(frozen=True)
class FleissKappa:
    """Immutable result row: Fleiss' kappa over every judge on one axis.

    Fields, in constructor order:

    * ``axis`` -- the axis the judges were compared on.
    * ``n_judges`` -- how many judges contributed ratings.
    * ``n_items`` -- number of items the statistic was computed over.
    * ``kappa`` -- the point estimate.
    * ``ci_low`` / ``ci_high`` -- bounds of the bootstrap interval.
    """

    axis: str
    n_judges: int
    n_items: int
    kappa: float
    ci_low: float
    ci_high: float
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# Core kappa math
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def cohens_kappa(a: np.ndarray, b: np.ndarray) -> float:
    """Return Cohen's kappa for two equally shaped arrays of ratings.

    ``a[i]`` and ``b[i]`` are the labels the two raters gave item ``i``.

    Degenerate inputs yield ``0.0`` instead of NaN so downstream statistics
    stay finite: empty input, and the case where chance agreement reaches
    1.0 (both raters used one and the same label for every item), which
    would otherwise divide by zero.
    """
    assert a.shape == b.shape, f"shape mismatch: {a.shape} vs {b.shape}"  # noqa: S101 — input invariant
    if len(a) == 0:
        return 0.0

    # Observed agreement: share of items on which the raters match.
    observed = float(np.mean(a == b))

    # Chance agreement: for each label seen by either rater, the product of
    # the two raters' marginal rates for that label, summed over labels.
    labels = sorted(set(a.tolist()).union(b.tolist()))
    expected = 0.0
    for label in labels:
        rate_a = float(np.mean(a == label))
        rate_b = float(np.mean(b == label))
        expected += rate_a * rate_b

    if expected >= 1.0:
        return 0.0
    return (observed - expected) / (1.0 - expected)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def fleiss_kappa(ratings: np.ndarray) -> float:
    """Fleiss' kappa for an (n_items, n_categories) count matrix.

    Each row is one item; each column is the count of judges who
    assigned that category. Row sums must be equal (``n_judges``); the
    judge count is read from the first row and the rest are not checked.

    Returns ``0.0`` (rather than raising or producing NaN) when there are
    no items, when fewer than two judges rated each item, or when chance
    agreement reaches 1.0 (every rating fell in a single category).
    """
    n_items, _ = ratings.shape
    # Guard the empty matrix before touching ratings[0]; indexing row 0 of a
    # (0, k) array would raise IndexError.
    if n_items == 0:
        return 0.0
    n_judges = int(ratings[0].sum())
    if n_judges < 2:
        return 0.0

    # p_j = column proportion = share of all (item, judge) ratings in category j
    p_j = ratings.sum(axis=0) / (n_items * n_judges)

    # P_i = within-item agreement for item i
    p_i = (np.sum(ratings**2, axis=1) - n_judges) / (n_judges * (n_judges - 1))
    p_bar = float(np.mean(p_i))
    pe_bar = float(np.sum(p_j**2))
    if pe_bar >= 1.0:
        return 0.0
    return (p_bar - pe_bar) / (1.0 - pe_bar)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def bootstrap_kappa_ci(
    a: np.ndarray,
    b: np.ndarray,
    n_bootstrap: int = 1000,
    confidence: float = 0.95,
    seed: int = RNG_SEED,
) -> tuple[float, float]:
    """Percentile-bootstrap interval for Cohen's kappa.

    Items are resampled with replacement ``n_bootstrap`` times, keeping
    each item's two ratings together; kappa is recomputed on every
    resample and the interval is read off the empirical quantiles.

    Returns ``(low, high)`` at the requested ``confidence`` level (95% by
    default), or ``(0.0, 0.0)`` when there are no items. ``seed`` makes
    the draws reproducible.
    """
    rng = np.random.default_rng(seed)
    n_obs = len(a)
    if n_obs == 0:
        return (0.0, 0.0)

    replicates = []
    for _ in range(n_bootstrap):
        # One shared index vector keeps each item's pair of ratings aligned.
        picks = rng.integers(0, n_obs, size=n_obs)
        replicates.append(cohens_kappa(a[picks], b[picks]))
    draws = np.asarray(replicates, dtype=np.float64)

    lower = float(np.quantile(draws, (1 - confidence) / 2))
    upper = float(np.quantile(draws, 1 - (1 - confidence) / 2))
    return (lower, upper)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def bootstrap_fleiss_ci(
    ratings: np.ndarray,
    n_bootstrap: int = 1000,
    confidence: float = 0.95,
    seed: int = RNG_SEED,
) -> tuple[float, float]:
    """Percentile-bootstrap interval for Fleiss' kappa.

    Rows (items) of the count matrix are resampled with replacement
    ``n_bootstrap`` times; Fleiss' kappa is recomputed on every resample
    and the interval is read off the empirical quantiles.

    Returns ``(low, high)`` at the requested ``confidence`` level (95% by
    default), or ``(0.0, 0.0)`` when the matrix has no rows. ``seed``
    makes the draws reproducible.
    """
    rng = np.random.default_rng(seed)
    n_rows = ratings.shape[0]
    if n_rows == 0:
        return (0.0, 0.0)

    replicates = []
    for _ in range(n_bootstrap):
        picks = rng.integers(0, n_rows, size=n_rows)
        replicates.append(fleiss_kappa(ratings[picks]))
    draws = np.asarray(replicates, dtype=np.float64)

    lower = float(np.quantile(draws, (1 - confidence) / 2))
    upper = float(np.quantile(draws, 1 - (1 - confidence) / 2))
    return (lower, upper)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
# Pipeline: parquet -> pairwise + Fleiss tables
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def compute_pairwise(df: pl.DataFrame, n_bootstrap: int = 1000) -> list[PairKappa]:
    """Cohen's kappa, with a bootstrap interval, for each judge pair per axis.

    ``df`` is a long-format score table with columns ``session_id``,
    ``axis``, ``judge_shortname`` and ``score``. Axes are visited in sorted
    order, and within an axis the pairs follow the sorted judge names with
    ``judge_a`` sorting before ``judge_b``.

    Raises:
        ValueError: if any of the four required columns is absent.
    """
    absent = {"session_id", "axis", "judge_shortname", "score"}.difference(df.columns)
    if absent:
        raise ValueError(f"parquet is missing columns: {sorted(absent)}")

    results: list[PairKappa] = []
    for axis in df["axis"].unique().sort():
        axis_rows = df.filter(pl.col("axis") == axis)
        judges = sorted(axis_rows["judge_shortname"].unique().to_list())

        # One row per session, one column per judge. When a (session, judge)
        # cell has several scores, the first one is kept.
        # drop_nulls() then discards every session that lacks a score from
        # ANY judge, so all pairs on this axis share one item set.
        # NOTE(review): a judge with sparse coverage therefore shrinks
        # n_items for every other pair too; confirm that is intended.
        table = axis_rows.pivot(
            values="score",
            index="session_id",
            on="judge_shortname",
            aggregate_function="first",
        ).drop_nulls()
        if table.height == 0:
            continue

        for pos, first in enumerate(judges):
            for second in judges[pos + 1 :]:
                if not (first in table.columns and second in table.columns):
                    continue
                scores_a = table[first].to_numpy()
                scores_b = table[second].to_numpy()
                point = cohens_kappa(scores_a, scores_b)
                low, high = bootstrap_kappa_ci(scores_a, scores_b, n_bootstrap=n_bootstrap)
                results.append(
                    PairKappa(
                        axis=str(axis),
                        judge_a=first,
                        judge_b=second,
                        n_items=len(scores_a),
                        kappa=point,
                        ci_low=low,
                        ci_high=high,
                    )
                )
    return results
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def compute_fleiss(df: pl.DataFrame, n_bootstrap: int = 1000) -> list[FleissKappa]:
    """Compute Fleiss' kappa per axis across all judges.

    ``df`` is a long-format score table with columns ``session_id``,
    ``axis``, ``judge_shortname`` and ``score``. Axes with fewer than
    three judges are skipped, as are axes where no session was scored by
    every judge. Results follow sorted axis order.
    """
    out: list[FleissKappa] = []
    for axis in df["axis"].unique().sort():
        sub = df.filter(pl.col("axis") == axis)
        judges = sorted(sub["judge_shortname"].unique().to_list())
        if len(judges) < 3:
            continue

        # One row per session, one column per judge; keep only sessions that
        # every judge scored so each row of the count matrix sums to the
        # same number of judges.
        wide = sub.pivot(
            values="score", index="session_id", on="judge_shortname", aggregate_function="first"
        ).drop_nulls()
        if wide.height == 0:
            continue

        # Build the category index from non-null scores only. A null score
        # can never appear in ``wide`` (drop_nulls removed those rows), and
        # leaving None in the list makes sorted() raise TypeError when it is
        # compared against real score values.
        categories = sorted(sub["score"].drop_nulls().unique().to_list())
        cat_idx = {c: i for i, c in enumerate(categories)}

        # counts[r, c] = number of judges who gave session r category c.
        counts = np.zeros((wide.height, len(categories)), dtype=np.int64)
        for r, row in enumerate(wide.iter_rows(named=True)):
            for j in judges:
                counts[r, cat_idx[row[j]]] += 1

        k = fleiss_kappa(counts)
        lo, hi = bootstrap_fleiss_ci(counts, n_bootstrap=n_bootstrap)
        out.append(
            FleissKappa(
                axis=str(axis),
                n_judges=len(judges),
                n_items=wide.height,
                kappa=k,
                ci_low=lo,
                ci_high=hi,
            )
        )
    return out
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# ---------------------------------------------------------------------------
|
|
221
|
+
# Stopping-rule gate
|
|
222
|
+
# ---------------------------------------------------------------------------
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def delta_gate_excludes_zero(
    current: FleissKappa,
    prior: FleissKappa,
    n_bootstrap: int = 1000,
    seed: int = RNG_SEED,
) -> bool:
    """Report whether a 95% interval on ``current.kappa - prior.kappa`` excludes zero.

    This is a normal approximation, not a re-bootstrap of the raw ratings:
    each kappa is modelled as a normal centred on its point estimate with
    standard deviation ``(ci_high - ci_low) / 3.92``. ``n_bootstrap``
    values are drawn independently from each (current first, then prior,
    from one seeded generator) and subtracted element-wise; the gate fires
    when the 2.5%-97.5% range of those differences lies entirely above or
    entirely below zero.

    Per the module's stopping rule, a ``True`` result is the signal to
    pause for rebaseline.
    """
    rng = np.random.default_rng(seed)

    # A 95% normal interval spans 2 * 1.96 = 3.92 standard deviations, so
    # dividing the interval width by 3.92 recovers an approximate SD.
    current_sd = (current.ci_high - current.ci_low) / 3.92
    prior_sd = (prior.ci_high - prior.ci_low) / 3.92

    current_draws = rng.normal(loc=current.kappa, scale=current_sd, size=n_bootstrap)
    prior_draws = rng.normal(loc=prior.kappa, scale=prior_sd, size=n_bootstrap)
    delta = current_draws - prior_draws

    lower = float(np.quantile(delta, 0.025))
    upper = float(np.quantile(delta, 0.975))
    if lower > 0:
        return True
    return upper < 0
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def load_scores(path: Path) -> pl.DataFrame:
    """Load a judge-scores parquet and check it has the expected columns.

    Returns the frame unchanged when ``session_id``, ``axis``,
    ``judge_shortname`` and ``score`` are all present.

    Raises:
        ValueError: naming the missing columns (sorted) otherwise.
    """
    frame = pl.read_parquet(path)
    absent = {"session_id", "axis", "judge_shortname", "score"}.difference(frame.columns)
    if not absent:
        return frame
    raise ValueError(f"{path} is missing required columns: {sorted(absent)}")
|