themis-eval 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/main.py +427 -57
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/core/entities.py +23 -3
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/pipelines/standard_pipeline.py +68 -8
- themis/experiment/cache_manager.py +8 -3
- themis/experiment/export.py +110 -2
- themis/experiment/orchestrator.py +48 -6
- themis/experiment/storage.py +1313 -110
- themis/integrations/huggingface.py +12 -1
- themis/integrations/wandb.py +13 -1
- themis/interfaces/__init__.py +86 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/RECORD +40 -17
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.1.dist-info/METADATA +0 -758
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/comparison/statistics.py
ADDED
@@ -0,0 +1,402 @@
+"""Statistical tests for comparing experiment results.
+
+This module provides various statistical tests to determine if differences
+between runs are statistically significant.
+"""
+
+from __future__ import annotations
+
+import math
+import random
+from dataclasses import dataclass
+from enum import Enum
+from typing import Sequence
+
+
+class StatisticalTest(str, Enum):
+    """Available statistical tests."""
+
+    T_TEST = "t_test"
+    BOOTSTRAP = "bootstrap"
+    PERMUTATION = "permutation"
+    NONE = "none"
+
+
+@dataclass
+class StatisticalTestResult:
+    """Result of a statistical test.
+
+    Attributes:
+        test_name: Name of the test performed
+        statistic: Test statistic value
+        p_value: P-value (probability of observing this difference by chance)
+        significant: Whether the difference is statistically significant
+        confidence_level: Confidence level used (e.g., 0.95 for 95%)
+        effect_size: Effect size (e.g., Cohen's d)
+        confidence_interval: Confidence interval for the difference
+    """
+
+    test_name: str
+    statistic: float
+    p_value: float
+    significant: bool
+    confidence_level: float = 0.95
+    effect_size: float | None = None
+    confidence_interval: tuple[float, float] | None = None
+
+    def __str__(self) -> str:
+        """Human-readable summary."""
+        sig_str = "significant" if self.significant else "not significant"
+        result = f"{self.test_name}: p={self.p_value:.4f} ({sig_str})"
+
+        if self.effect_size is not None:
+            result += f", effect_size={self.effect_size:.3f}"
+
+        if self.confidence_interval is not None:
+            low, high = self.confidence_interval
+            result += f", CI=[{low:.3f}, {high:.3f}]"
+
+        return result
+
+
+def t_test(
+    samples_a: Sequence[float],
+    samples_b: Sequence[float],
+    *,
+    alpha: float = 0.05,
+    paired: bool = True,
+) -> StatisticalTestResult:
+    """Perform a t-test to compare two sets of samples.
+
+    Args:
+        samples_a: First set of samples
+        samples_b: Second set of samples
+        alpha: Significance level (default: 0.05 for 95% confidence)
+        paired: Whether to use paired t-test (default: True)
+
+    Returns:
+        StatisticalTestResult with test statistics and significance
+
+    Raises:
+        ValueError: If samples are empty or have mismatched lengths (for paired test)
+    """
+    if not samples_a or not samples_b:
+        raise ValueError("Cannot perform t-test on empty samples")
+
+    if paired and len(samples_a) != len(samples_b):
+        raise ValueError(
+            f"Paired t-test requires equal sample sizes. "
+            f"Got {len(samples_a)} and {len(samples_b)}"
+        )
+
+    n_a = len(samples_a)
+    n_b = len(samples_b)
+
+    # Calculate means
+    mean_a = sum(samples_a) / n_a
+    mean_b = sum(samples_b) / n_b
+
+    if paired:
+        # Paired t-test: test on differences
+        diffs = [a - b for a, b in zip(samples_a, samples_b)]
+        mean_diff = sum(diffs) / len(diffs)
+
+        # Standard deviation of differences
+        var_diff = sum((d - mean_diff) ** 2 for d in diffs) / (len(diffs) - 1) if len(diffs) > 1 else 0
+        se_diff = math.sqrt(var_diff / len(diffs))
+
+        # T-statistic
+        if se_diff > 1e-10:  # Non-zero standard error
+            t_stat = mean_diff / se_diff
+        elif abs(mean_diff) > 1e-10:  # Perfect consistency with non-zero difference
+            t_stat = float('inf') if mean_diff > 0 else float('-inf')
+        else:  # No difference at all
+            t_stat = 0.0
+
+        df = len(diffs) - 1
+
+        # Effect size (Cohen's d for paired samples)
+        sd_diff = math.sqrt(var_diff)
+        effect_size = mean_diff / sd_diff if sd_diff > 1e-10 else (1.0 if abs(mean_diff) > 1e-10 else 0.0)
+
+    else:
+        # Independent samples t-test
+        # Calculate pooled standard deviation
+        var_a = sum((x - mean_a) ** 2 for x in samples_a) / (n_a - 1) if n_a > 1 else 0
+        var_b = sum((x - mean_b) ** 2 for x in samples_b) / (n_b - 1) if n_b > 1 else 0
+
+        pooled_sd = math.sqrt(((n_a - 1) * var_a + (n_b - 1) * var_b) / (n_a + n_b - 2))
+        se = pooled_sd * math.sqrt(1/n_a + 1/n_b)
+
+        # T-statistic
+        t_stat = (mean_a - mean_b) / se if se > 0 else 0.0
+        df = n_a + n_b - 2
+
+        # Effect size (Cohen's d)
+        effect_size = (mean_a - mean_b) / pooled_sd if pooled_sd > 0 else 0.0
+
+    # Approximate p-value using t-distribution
+    # For simplicity, we use a conservative approximation
+    # In practice, you'd use scipy.stats.t.sf for accurate p-values
+    p_value = _approximate_t_test_p_value(abs(t_stat), df)
+
+    # Confidence interval (approximate)
+    # t_critical ≈ 2.0 for 95% CI and reasonable df
+    t_critical = 2.0  # Conservative estimate
+    margin = t_critical * (se_diff if paired else se)
+    ci = (mean_a - mean_b - margin, mean_a - mean_b + margin)
+
+    return StatisticalTestResult(
+        test_name="t-test (paired)" if paired else "t-test (independent)",
+        statistic=t_stat,
+        p_value=p_value,
+        significant=p_value < alpha,
+        confidence_level=1 - alpha,
+        effect_size=effect_size,
+        confidence_interval=ci,
+    )
+
+
+def _approximate_t_test_p_value(t_stat: float, df: int) -> float:
+    """Approximate p-value for t-test.
+
+    This is a rough approximation. For accurate p-values, use scipy.stats.
+    """
+    # Very rough approximation based on standard normal
+    # This gets less accurate for small df
+    if df < 1:
+        return 1.0
+
+    # Convert to z-score approximation for large df
+    if df > 30:
+        z = t_stat
+        # Approximate p-value for two-tailed test
+        # P(|Z| > z) ≈ 2 * (1 - Φ(z))
+        if z > 6:
+            return 0.0
+        elif z < 0.5:
+            return 1.0
+        else:
+            # Rough approximation
+            return min(1.0, 2 * math.exp(-0.5 * z * z) / math.sqrt(2 * math.pi))
+
+    # For small df, be conservative
+    return min(1.0, 0.5 if t_stat < 2 else 0.1 if t_stat < 3 else 0.01)
+
+
+def bootstrap_confidence_interval(
+    samples_a: Sequence[float],
+    samples_b: Sequence[float],
+    *,
+    n_bootstrap: int = 10000,
+    confidence_level: float = 0.95,
+    statistic_fn: callable = None,
+    seed: int | None = None,
+) -> StatisticalTestResult:
+    """Compute bootstrap confidence interval for difference between two samples.
+
+    Uses bootstrap resampling to estimate the confidence interval for the
+    difference in means (or other statistic) between two samples.
+
+    Args:
+        samples_a: First set of samples
+        samples_b: Second set of samples
+        n_bootstrap: Number of bootstrap iterations (default: 10000)
+        confidence_level: Confidence level (default: 0.95)
+        statistic_fn: Function to compute statistic (default: mean difference)
+        seed: Random seed for reproducibility
+
+    Returns:
+        StatisticalTestResult with bootstrap confidence interval
+    """
+    if not samples_a or not samples_b:
+        raise ValueError("Cannot perform bootstrap on empty samples")
+
+    if seed is not None:
+        random.seed(seed)
+
+    # Default statistic: difference in means
+    if statistic_fn is None:
+        def statistic_fn(a, b):
+            return sum(a) / len(a) - sum(b) / len(b)
+
+    # Observed difference
+    observed_diff = statistic_fn(samples_a, samples_b)
+
+    # Bootstrap resampling
+    bootstrap_diffs = []
+    for _ in range(n_bootstrap):
+        # Resample with replacement
+        resampled_a = [random.choice(samples_a) for _ in range(len(samples_a))]
+        resampled_b = [random.choice(samples_b) for _ in range(len(samples_b))]
+
+        diff = statistic_fn(resampled_a, resampled_b)
+        bootstrap_diffs.append(diff)
+
+    # Sort for percentile method
+    bootstrap_diffs.sort()
+
+    # Compute confidence interval
+    alpha = 1 - confidence_level
+    lower_idx = int(n_bootstrap * (alpha / 2))
+    upper_idx = int(n_bootstrap * (1 - alpha / 2))
+
+    ci = (bootstrap_diffs[lower_idx], bootstrap_diffs[upper_idx])
+
+    # Check if 0 is in the confidence interval
+    significant = not (ci[0] <= 0 <= ci[1])
+
+    # Pseudo p-value: proportion of bootstrap samples with opposite sign
+    p_value = sum(1 for d in bootstrap_diffs if (d * observed_diff) < 0) / n_bootstrap
+    p_value = max(p_value, 1 / n_bootstrap)  # Lower bound
+
+    return StatisticalTestResult(
+        test_name=f"bootstrap (n={n_bootstrap})",
+        statistic=observed_diff,
+        p_value=p_value,
+        significant=significant,
+        confidence_level=confidence_level,
+        confidence_interval=ci,
+    )
+
+
+def permutation_test(
+    samples_a: Sequence[float],
+    samples_b: Sequence[float],
+    *,
+    n_permutations: int = 10000,
+    alpha: float = 0.05,
+    statistic_fn: callable = None,
+    seed: int | None = None,
+) -> StatisticalTestResult:
+    """Perform permutation test to compare two samples.
+
+    Tests the null hypothesis that the two samples come from the same
+    distribution by randomly permuting the labels and computing the test
+    statistic.
+
+    Args:
+        samples_a: First set of samples
+        samples_b: Second set of samples
+        n_permutations: Number of permutations (default: 10000)
+        alpha: Significance level (default: 0.05)
+        statistic_fn: Function to compute statistic (default: difference in means)
+        seed: Random seed for reproducibility
+
+    Returns:
+        StatisticalTestResult with permutation test results
+    """
+    if not samples_a or not samples_b:
+        raise ValueError("Cannot perform permutation test on empty samples")
+
+    if seed is not None:
+        random.seed(seed)
+
+    # Default statistic: absolute difference in means
+    if statistic_fn is None:
+        def statistic_fn(a, b):
+            return abs(sum(a) / len(a) - sum(b) / len(b))
+
+    # Observed statistic
+    observed_stat = statistic_fn(samples_a, samples_b)
+
+    # Combine all samples
+    combined = list(samples_a) + list(samples_b)
+    n_a = len(samples_a)
+    n_total = len(combined)
+
+    # Permutation testing
+    more_extreme = 0
+    for _ in range(n_permutations):
+        # Shuffle and split
+        shuffled = combined.copy()
+        random.shuffle(shuffled)
+
+        perm_a = shuffled[:n_a]
+        perm_b = shuffled[n_a:]
+
+        perm_stat = statistic_fn(perm_a, perm_b)
+
+        if perm_stat >= observed_stat:
+            more_extreme += 1
+
+    # P-value: proportion of permutations as extreme as observed
+    p_value = more_extreme / n_permutations
+
+    return StatisticalTestResult(
+        test_name=f"permutation (n={n_permutations})",
+        statistic=observed_stat,
+        p_value=p_value,
+        significant=p_value < alpha,
+        confidence_level=1 - alpha,
+    )
+
+
+def mcnemar_test(
+    contingency_table: tuple[int, int, int, int],
+    *,
+    alpha: float = 0.05,
+) -> StatisticalTestResult:
+    """Perform McNemar's test for paired nominal data.
+
+    Useful for comparing two models on the same test set, where you want to
+    know if one model consistently outperforms the other.
+
+    Args:
+        contingency_table: 2x2 contingency table as (n_00, n_01, n_10, n_11)
+            where n_ij = number of samples where model A predicts i and model B predicts j
+            (0 = incorrect, 1 = correct)
+        alpha: Significance level
+
+    Returns:
+        StatisticalTestResult with McNemar's test results
+    """
+    n_00, n_01, n_10, n_11 = contingency_table
+
+    # Only discordant pairs matter
+    b = n_01  # A wrong, B correct
+    c = n_10  # A correct, B wrong
+
+    if b + c == 0:
+        # No discordant pairs
+        return StatisticalTestResult(
+            test_name="McNemar's test",
+            statistic=0.0,
+            p_value=1.0,
+            significant=False,
+            confidence_level=1 - alpha,
+        )
+
+    # McNemar's statistic with continuity correction
+    chi_square = ((abs(b - c) - 1) ** 2) / (b + c)
+
+    # Approximate p-value (chi-square with 1 df)
+    # For chi-square > 3.84, p < 0.05
+    # For chi-square > 6.63, p < 0.01
+    if chi_square > 10.83:
+        p_value = 0.001
+    elif chi_square > 6.63:
+        p_value = 0.01
+    elif chi_square > 3.84:
+        p_value = 0.05
+    else:
+        # Rough linear approximation
+        p_value = 1.0 - (chi_square / 3.84) * 0.95
+
+    return StatisticalTestResult(
+        test_name="McNemar's test",
+        statistic=chi_square,
+        p_value=p_value,
+        significant=p_value < alpha,
+        confidence_level=1 - alpha,
+    )
+
+
+__all__ = [
+    "StatisticalTest",
+    "StatisticalTestResult",
+    "t_test",
+    "bootstrap_confidence_interval",
+    "permutation_test",
+    "mcnemar_test",
+]
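The module above depends only on the standard library (`math`, `random`). A minimal usage sketch, assuming the module is importable under the path given in the file listing (`themis/comparison/statistics.py`); the per-example scores and the derived contingency table are invented for illustration and are not part of the package.

```python
# Hypothetical example data: per-example correctness (1.0/0.0) for two runs
# evaluated on the same eight examples, in the same order.
from themis.comparison.statistics import (
    bootstrap_confidence_interval,
    mcnemar_test,
    permutation_test,
    t_test,
)

run_a = [1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0]
run_b = [1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0]

# Each helper returns a StatisticalTestResult; its __str__ reports the p-value,
# significance, and (where available) effect size and confidence interval.
print(t_test(run_a, run_b, paired=True))
print(bootstrap_confidence_interval(run_a, run_b, n_bootstrap=2000, seed=0))
print(permutation_test(run_a, run_b, n_permutations=2000, seed=0))

# McNemar's test takes counts (n_00, n_01, n_10, n_11); for the runs above:
# both wrong = 1, only A wrong = 2, only B wrong = 1, both correct = 4.
print(mcnemar_test((1, 2, 1, 4)))
```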
themis/core/entities.py
CHANGED
@@ -58,13 +58,33 @@ class Reference(Generic[T]):
     For backward compatibility, it can be used without type parameters
     and will behave like Reference[Any].

+    The value field can hold any type including:
+    - Simple types: str, int, float, bool
+    - Collections: list, tuple, set
+    - Dictionaries: dict (for multi-value references)
+    - Custom objects
+
     Examples:
-        #
+        # Simple reference
         ref = Reference(kind="answer", value="42")

-        #
+        # Multi-value reference using dict
+        ref = Reference(
+            kind="countdown_task",
+            value={"target": 122, "numbers": [25, 50, 75, 100]}
+        )
+
+        # List reference
+        ref = Reference(kind="valid_answers", value=["yes", "no", "maybe"])
+
+        # Typed reference
         ref: Reference[str] = Reference(kind="answer", value="42")
-        ref: Reference[
+        ref: Reference[dict] = Reference(kind="task", value={"a": 1, "b": 2})
+
+    Note:
+        When using dict values, metrics can access individual fields directly:
+        >>> target = reference.value["target"]
+        >>> numbers = reference.value["numbers"]
     """

     kind: str
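Per the expanded docstring, dict-valued references let metrics read individual task fields off `reference.value`. A short sketch of that access pattern, assuming `Reference` is importable from `themis.core.entities` as shown above; `countdown_is_solvable` is a hypothetical helper, not part of themis.

```python
# Hypothetical helper (not from themis): consumes a dict-valued Reference
# exactly as the updated docstring describes.
from themis.core.entities import Reference


def countdown_is_solvable(reference: Reference) -> bool:
    """Toy check: can some subset of the numbers sum exactly to the target?"""
    target = reference.value["target"]    # direct dict-field access
    numbers = reference.value["numbers"]

    sums = {0}
    for n in numbers:
        sums |= {s + n for s in sums}     # all reachable subset sums
    return target in sums


ref = Reference(kind="countdown_task", value={"target": 122, "numbers": [25, 50, 75, 100]})
print(countdown_is_solvable(ref))  # False: no subset of [25, 50, 75, 100] sums to 122
```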
themis/evaluation/metrics/code/__init__.py
ADDED
@@ -0,0 +1,19 @@
+"""Code generation evaluation metrics.
+
+This module provides metrics for evaluating code generation tasks:
+- Pass@k: Functional correctness with k samples
+- CodeBLEU: Code-aware BLEU variant
+- ExecutionAccuracy: Safe code execution and testing
+"""
+
+from themis.evaluation.metrics.code.pass_at_k import PassAtK, estimate_pass_at_k
+from themis.evaluation.metrics.code.codebleu import CodeBLEU
+from themis.evaluation.metrics.code.execution import ExecutionAccuracy, ExecutionResult
+
+__all__ = [
+    "PassAtK",
+    "estimate_pass_at_k",
+    "CodeBLEU",
+    "ExecutionAccuracy",
+    "ExecutionResult",
+]
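With these re-exports, callers can import the code metrics from the subpackage rather than the individual modules. A one-line sketch (constructing `CodeBLEU` assumes the optional `codebleu` dependency is installed; see its `__init__` below):

```python
# Shorter import path enabled by the new __init__ re-exports.
from themis.evaluation.metrics.code import CodeBLEU, ExecutionAccuracy, PassAtK

metric = CodeBLEU(lang="python")  # raises ImportError if `codebleu` is not installed
```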
themis/evaluation/metrics/code/codebleu.py
ADDED
@@ -0,0 +1,144 @@
+"""CodeBLEU metric for code generation evaluation.
+
+CodeBLEU extends BLEU with syntax awareness using abstract syntax trees (AST)
+and data flow matching.
+
+References:
+    Ren et al. (2020). CodeBLEU: a Method for Automatic Evaluation of Code Synthesis.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Sequence
+
+from themis.core.entities import MetricScore
+from themis.interfaces import Metric
+
+
+class CodeBLEU(Metric):
+    """CodeBLEU metric for code generation.
+
+    CodeBLEU combines:
+    - N-gram matching (like BLEU)
+    - Syntax matching (AST-based)
+    - Data flow matching (variable dependencies)
+
+    It's more suitable for code evaluation than plain BLEU as it considers
+    code structure and semantics, not just surface form.
+
+    Attributes:
+        name: Metric identifier ("codebleu")
+        lang: Programming language ("python", "java", "javascript", etc.)
+        weights: Weights for [ngram, syntax, dataflow] components
+
+    Example:
+        >>> from themis.evaluation.metrics.code import CodeBLEU
+        >>> metric = CodeBLEU(lang="python")
+        >>> score = metric.compute(
+        ...     prediction="def add(a, b):\\n return a + b",
+        ...     references=["def add(x, y):\\n return x + y"]
+        ... )
+        >>> print(f"CodeBLEU: {score.value:.4f}")
+        CodeBLEU: 0.8234
+    """
+
+    requires_reference = True
+
+    def __init__(
+        self,
+        lang: str = "python",
+        weights: tuple[float, float, float] = (0.25, 0.25, 0.50),
+        alpha: float = 0.25,
+        beta: float = 0.25,
+        gamma: float = 0.50,
+        theta: float = 0.0,
+    ):
+        """Initialize CodeBLEU metric.
+
+        Args:
+            lang: Programming language ("python", "java", "javascript", "go", "php", "ruby")
+            weights: Weights for [ngram, weighted_ngram, syntax, dataflow].
+                Default: (0.25, 0.25, 0.25, 0.25)
+            alpha: Weight for n-gram matching
+            beta: Weight for weighted n-gram matching
+            gamma: Weight for syntax matching
+            theta: Weight for data flow matching
+        """
+        self.name = "codebleu"
+        self.lang = lang
+        self.alpha = alpha
+        self.beta = beta
+        self.gamma = gamma
+        self.theta = theta
+
+        # Lazy import codebleu (not required for all users)
+        try:
+            from codebleu import calc_codebleu
+            self._calc_codebleu = calc_codebleu
+        except ImportError:
+            raise ImportError(
+                "codebleu is required for CodeBLEU metric. "
+                "Install it with: pip install codebleu"
+            )
+
+    def compute(
+        self,
+        *,
+        prediction: Any,
+        references: Sequence[Any],
+        metadata: dict[str, Any] | None = None,
+    ) -> MetricScore:
+        """Compute CodeBLEU score.
+
+        Args:
+            prediction: Generated code (already extracted by pipeline)
+            references: List of reference code implementations
+            metadata: Optional metadata dict
+
+        Returns:
+            MetricScore with CodeBLEU value and component scores
+        """
+        # Convert to strings
+        pred_str = str(prediction)
+        ref_strs = [str(ref) for ref in references]
+
+        try:
+            # Compute CodeBLEU
+            result = self._calc_codebleu(
+                references=[ref_strs],  # List of reference lists
+                predictions=[pred_str],  # List of predictions
+                lang=self.lang,
+                weights=(self.alpha, self.beta, self.gamma, self.theta),
+            )
+
+            codebleu_score = result["codebleu"]
+
+            return MetricScore(
+                metric_name=self.name,
+                value=codebleu_score,
+                details={
+                    "codebleu": codebleu_score,
+                    "ngram_match_score": result.get("ngram_match_score", 0.0),
+                    "weighted_ngram_match_score": result.get("weighted_ngram_match_score", 0.0),
+                    "syntax_match_score": result.get("syntax_match_score", 0.0),
+                    "dataflow_match_score": result.get("dataflow_match_score", 0.0),
+                    "lang": self.lang,
+                    "num_references": len(ref_strs),
+                },
+                metadata=metadata or {},
+            )
+
+        except Exception as e:
+            # Handle parsing errors (invalid code, unsupported language, etc.)
+            return MetricScore(
+                metric_name=self.name,
+                value=0.0,
+                details={
+                    "error": str(e),
+                    "lang": self.lang,
+                },
+                metadata=metadata or {},
+            )
+
+
+__all__ = ["CodeBLEU"]
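A hedged follow-up sketch of reading the component scores that `compute` stores in the result's `details` dict. It assumes the optional `codebleu` package is installed and that `MetricScore` exposes `value` and `details` as attributes (consistent with how they are passed above, though that class is not shown in this diff).

```python
# Sketch only: exercise CodeBLEU.compute as defined above and inspect the
# per-component scores it records in MetricScore.details.
from themis.evaluation.metrics.code import CodeBLEU

metric = CodeBLEU(lang="python", alpha=0.25, beta=0.25, gamma=0.25, theta=0.25)
score = metric.compute(
    prediction="def add(a, b):\n    return a + b",
    references=["def add(x, y):\n    return x + y"],
)

print("codebleu:", score.value)
for key in ("ngram_match_score", "weighted_ngram_match_score",
            "syntax_match_score", "dataflow_match_score"):
    print(key, score.details.get(key))  # on failure, details holds "error" instead
```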