claude-sql 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,257 @@
1
+ """Inter-rater reliability: Cohen's & Fleiss' kappa + bootstrapped CIs.
2
+
3
+ Consumes one or more judge-score parquets written by ``judge_worker``
4
+ (schema: ``session_id, axis, judge_shortname, score``) and computes:
5
+
6
+ 1. **Cohen's kappa** for every pair of judges on every axis.
7
+ 2. **Fleiss' kappa** across all judges on every axis (when ≥3 judges).
8
+ 3. **Bootstrapped 95% CI** on both statistics via 1000 resamples.
9
+ 4. **Stopping-rule gate**: with ``--floor 0.6 --delta-gate <prior.parquet>``
10
+ returns non-zero exit if the delta-kappa CI excludes zero, matching
11
+ the pre-registered rebaseline policy from the Bonk↔Clod session.
12
+
13
+ No Bedrock calls. Pure stats. Safe to run unlimited times.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass
19
+ from pathlib import Path
20
+
21
+ import numpy as np
22
+ import polars as pl
23
+
24
+ RNG_SEED = 42  # default `seed` for the bootstrap CI helpers and the delta gate in this module
25
+
26
+
27
@dataclass(frozen=True)
class PairKappa:
    """Immutable result record: Cohen's kappa for one judge pair on one axis.

    Carries the point estimate together with the lower and upper bounds of
    its confidence interval and the number of items the estimate is based on.
    """

    axis: str  # axis the two judges were compared on
    judge_a: str  # shortname of the first judge in the pair
    judge_b: str  # shortname of the second judge in the pair
    n_items: int  # number of items that entered the comparison
    kappa: float  # point estimate of Cohen's kappa
    ci_low: float  # lower bound of the confidence interval
    ci_high: float  # upper bound of the confidence interval
38
+
39
+
40
@dataclass(frozen=True)
class FleissKappa:
    """Immutable result record: Fleiss' kappa over all judges on one axis.

    Carries the point estimate together with the lower and upper bounds of
    its confidence interval, plus the judge and item counts behind it.
    """

    axis: str  # axis the judges were compared on
    n_judges: int  # number of judges contributing ratings
    n_items: int  # number of items that entered the computation
    kappa: float  # point estimate of Fleiss' kappa
    ci_low: float  # lower bound of the confidence interval
    ci_high: float  # upper bound of the confidence interval
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # Core kappa math
54
+ # ---------------------------------------------------------------------------
55
+
56
+
57
def cohens_kappa(a: np.ndarray, b: np.ndarray) -> float:
    """Return Cohen's kappa for two equally shaped arrays of rater labels.

    ``kappa = (po - pe) / (1 - pe)`` where ``po`` is the observed share of
    items on which the raters agree and ``pe`` is the agreement expected
    from the raters' marginal label frequencies.

    Two degenerate cases return ``0.0`` instead of NaN or a division error,
    so downstream statistics stay finite:

    * empty input;
    * ``pe >= 1.0``, i.e. both raters assigned one and the same label to
      every item, which leaves the ratio undefined.
    """
    assert a.shape == b.shape, f"shape mismatch: {a.shape} vs {b.shape}"  # noqa: S101 — input invariant
    if len(a) == 0:
        return 0.0

    # Every label used by either rater, in a deterministic order.
    labels = sorted(set(a.tolist()) | set(b.tolist()))
    observed = float(np.mean(a == b))

    # Chance agreement: sum over labels of the product of marginal shares.
    expected = 0.0
    for label in labels:
        share_a = float(np.mean(a == label))
        share_b = float(np.mean(b == label))
        expected += share_a * share_b

    if expected >= 1.0:
        return 0.0
    return (observed - expected) / (1.0 - expected)
76
+
77
+
78
def fleiss_kappa(ratings: np.ndarray) -> float:
    """Return Fleiss' kappa for an ``(n_items, n_categories)`` count matrix.

    Each row is one item; each column holds the number of judges who
    assigned that category to the item. Row sums must all equal the number
    of judges; that number is read from the first row only and the other
    rows are not checked.

    Returns ``0.0`` (never NaN) in the degenerate cases:

    * the matrix has no rows;
    * fewer than two judges rated each item;
    * expected agreement is ``>= 1.0`` (all ratings fall in one category).
    """
    n_items, _ = ratings.shape
    # Guard the empty case BEFORE touching ratings[0]; indexing row 0 of an
    # empty matrix raises IndexError.
    if n_items == 0:
        return 0.0
    n_judges = int(ratings[0].sum())
    if n_judges < 2:
        return 0.0

    # p_j = column proportion = share of all (item, judge) ratings in category j
    p_j = ratings.sum(axis=0) / (n_items * n_judges)

    # P_i = within-item agreement for item i
    p_i = (np.sum(ratings**2, axis=1) - n_judges) / (n_judges * (n_judges - 1))
    p_bar = float(np.mean(p_i))
    pe_bar = float(np.sum(p_j**2))
    if pe_bar >= 1.0:
        return 0.0
    return (p_bar - pe_bar) / (1.0 - pe_bar)
99
+
100
+
101
def bootstrap_kappa_ci(
    a: np.ndarray,
    b: np.ndarray,
    n_bootstrap: int = 1000,
    confidence: float = 0.95,
    seed: int = RNG_SEED,
) -> tuple[float, float]:
    """Percentile-bootstrap confidence interval for Cohen's kappa.

    Items are resampled with replacement (the same indices are applied to
    both raters), kappa is recomputed for each of ``n_bootstrap`` resamples,
    and the interval is read off the resulting distribution at the
    ``(1 - confidence) / 2`` and ``1 - (1 - confidence) / 2`` quantiles.
    With the default ``confidence`` this is a 95% interval.

    Returns ``(low, high)``; ``(0.0, 0.0)`` when there are no items.
    """
    n_obs = len(a)
    if n_obs == 0:
        return (0.0, 0.0)

    generator = np.random.default_rng(seed)

    def _one_resample() -> float:
        # Draw item indices with replacement and score the paired subsample.
        picks = generator.integers(0, n_obs, size=n_obs)
        return cohens_kappa(a[picks], b[picks])

    draws = np.fromiter(
        (_one_resample() for _ in range(n_bootstrap)),
        dtype=np.float64,
        count=n_bootstrap,
    )
    tail = (1 - confidence) / 2
    lower = float(np.quantile(draws, tail))
    upper = float(np.quantile(draws, 1 - tail))
    return (lower, upper)
120
+
121
+
122
def bootstrap_fleiss_ci(
    ratings: np.ndarray,
    n_bootstrap: int = 1000,
    confidence: float = 0.95,
    seed: int = RNG_SEED,
) -> tuple[float, float]:
    """Percentile-bootstrap confidence interval for Fleiss' kappa.

    Rows (items) of the count matrix are resampled with replacement, kappa
    is recomputed for each of ``n_bootstrap`` resamples, and the interval is
    read off the resulting distribution at the ``(1 - confidence) / 2`` and
    ``1 - (1 - confidence) / 2`` quantiles. With the default ``confidence``
    this is a 95% interval.

    Returns ``(low, high)``; ``(0.0, 0.0)`` when the matrix has no rows.
    """
    n_rows = ratings.shape[0]
    if n_rows == 0:
        return (0.0, 0.0)

    generator = np.random.default_rng(seed)

    def _one_resample() -> float:
        # Draw row indices with replacement and score the resampled matrix.
        picks = generator.integers(0, n_rows, size=n_rows)
        return fleiss_kappa(ratings[picks])

    draws = np.fromiter(
        (_one_resample() for _ in range(n_bootstrap)),
        dtype=np.float64,
        count=n_bootstrap,
    )
    tail = (1 - confidence) / 2
    lower = float(np.quantile(draws, tail))
    upper = float(np.quantile(draws, 1 - tail))
    return (lower, upper)
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Pipeline: parquet -> pairwise + Fleiss tables
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
def compute_pairwise(df: pl.DataFrame, n_bootstrap: int = 1000) -> list[PairKappa]:
    """Compute Cohen's kappa with a bootstrap CI for each judge pair per axis.

    ``df`` must contain the columns ``session_id``, ``axis``,
    ``judge_shortname`` and ``score``; a ``ValueError`` is raised otherwise.
    Axes are processed in sorted order and, within an axis, judge pairs in
    sorted-name order, so the output order is deterministic.

    Only sessions with a non-null score in every pivoted column are kept;
    axes left with no such session are skipped.
    """
    absent = {"session_id", "axis", "judge_shortname", "score"}.difference(df.columns)
    if absent:
        raise ValueError(f"parquet is missing columns: {sorted(absent)}")

    results: list[PairKappa] = []
    for axis in df["axis"].unique().sort():
        axis_rows = df.filter(pl.col("axis") == axis)
        judges = sorted(axis_rows["judge_shortname"].unique().to_list())
        # One row per session, one column per judge; drop incomplete sessions.
        wide = axis_rows.pivot(
            values="score", index="session_id", on="judge_shortname", aggregate_function="first"
        ).drop_nulls()
        if wide.height == 0:
            continue

        # Every unordered pair (earlier judge, later judge) in sorted order.
        pairs = [
            (first, second)
            for pos, first in enumerate(judges)
            for second in judges[pos + 1 :]
        ]
        for ja, jb in pairs:
            if ja in wide.columns and jb in wide.columns:
                scores_a = wide[ja].to_numpy()
                scores_b = wide[jb].to_numpy()
                estimate = cohens_kappa(scores_a, scores_b)
                low, high = bootstrap_kappa_ci(scores_a, scores_b, n_bootstrap=n_bootstrap)
                results.append(
                    PairKappa(
                        axis=str(axis),
                        judge_a=ja,
                        judge_b=jb,
                        n_items=len(scores_a),
                        kappa=estimate,
                        ci_low=low,
                        ci_high=high,
                    )
                )
    return results
184
+
185
+
186
def compute_fleiss(df: pl.DataFrame, n_bootstrap: int = 1000) -> list[FleissKappa]:
    """Compute Fleiss' kappa with a bootstrap CI per axis across all judges.

    Axes with fewer than three distinct judges are skipped, as are axes
    where no session has a non-null score in every pivoted column. Axes are
    processed in sorted order.

    Null scores are excluded when the category list is built, so a null in
    the raw data no longer makes ``sorted()`` fail on a ``None``-vs-value
    comparison; rows containing nulls are dropped from the pivoted table
    anyway.
    """
    out: list[FleissKappa] = []
    for axis in df["axis"].unique().sort():
        sub = df.filter(pl.col("axis") == axis)
        judges = sorted(sub["judge_shortname"].unique().to_list())
        if len(judges) < 3:
            continue
        # One row per session, one column per judge; drop incomplete sessions.
        wide = sub.pivot(
            values="score", index="session_id", on="judge_shortname", aggregate_function="first"
        ).drop_nulls()
        if wide.height == 0:
            continue

        # Category universe for this axis. Nulls are removed first: unique()
        # keeps null as a value and None cannot be ordered against scores.
        categories = sorted(set(sub["score"].drop_nulls().unique().to_list()))
        cat_idx = {c: i for i, c in enumerate(categories)}

        # counts[r, c] = number of judges who gave item r the category c.
        counts = np.zeros((wide.height, len(categories)), dtype=np.int64)
        for r, row in enumerate(wide.iter_rows(named=True)):
            for j in judges:
                counts[r, cat_idx[row[j]]] += 1

        k = fleiss_kappa(counts)
        lo, hi = bootstrap_fleiss_ci(counts, n_bootstrap=n_bootstrap)
        out.append(
            FleissKappa(
                axis=str(axis),
                n_judges=len(judges),
                n_items=wide.height,
                kappa=k,
                ci_low=lo,
                ci_high=hi,
            )
        )
    return out
218
+
219
+
220
+ # ---------------------------------------------------------------------------
221
+ # Stopping-rule gate
222
+ # ---------------------------------------------------------------------------
223
+
224
+
225
def delta_gate_excludes_zero(
    current: FleissKappa,
    prior: FleissKappa,
    n_bootstrap: int = 1000,
    seed: int = RNG_SEED,
) -> bool:
    """Report whether the 95% interval on ``current.kappa - prior.kappa`` excludes zero.

    This is a normal approximation, not a resample of the original items:
    each kappa is modelled as a normal variable centred on its point
    estimate with standard deviation ``(ci_high - ci_low) / 3.92`` (the CI
    width divided by ``2 * 1.96``, which assumes the stored CI is a 95%
    interval). ``n_bootstrap`` draws are taken for ``current`` and then for
    ``prior`` from one seeded generator, and the 2.5% / 97.5% quantiles of
    the element-wise differences form the interval.

    Returns ``True`` when that interval lies entirely above or entirely
    below zero. Per the module's stated policy, that outcome pauses the
    study for rebaseline.
    """
    generator = np.random.default_rng(seed)

    def _draw(estimate: FleissKappa) -> np.ndarray:
        # Standard deviation recovered from the width of the stored CI.
        spread = (estimate.ci_high - estimate.ci_low) / 3.92
        return generator.normal(loc=estimate.kappa, scale=spread, size=n_bootstrap)

    # Left operand is evaluated first, so `current` is drawn before `prior`.
    delta = _draw(current) - _draw(prior)
    bounds = np.quantile(delta, [0.025, 0.975])
    lower = float(bounds[0])
    upper = float(bounds[1])
    return lower > 0 or upper < 0
248
+
249
+
250
def load_scores(path: Path) -> pl.DataFrame:
    """Load a judge-scores parquet file and check its columns.

    Returns the frame as read. Raises ``ValueError`` naming the missing
    columns when any of ``session_id``, ``axis``, ``judge_shortname`` or
    ``score`` is absent.
    """
    frame = pl.read_parquet(path)
    absent = {"session_id", "axis", "judge_shortname", "score"}.difference(frame.columns)
    if not absent:
        return frame
    raise ValueError(f"{path} is missing required columns: {sorted(absent)}")