themis-eval 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +48 -6
  27. themis/experiment/storage.py +1313 -110
  28. themis/integrations/huggingface.py +12 -1
  29. themis/integrations/wandb.py +13 -1
  30. themis/interfaces/__init__.py +86 -0
  31. themis/presets/__init__.py +10 -0
  32. themis/presets/benchmarks.py +354 -0
  33. themis/presets/models.py +190 -0
  34. themis/server/__init__.py +28 -0
  35. themis/server/app.py +337 -0
  36. themis_eval-0.2.0.dist-info/METADATA +596 -0
  37. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/RECORD +40 -17
  38. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  39. themis_eval-0.1.1.dist-info/METADATA +0 -758
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  41. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/comparison/statistics.py ADDED
@@ -0,0 +1,402 @@
+ """Statistical tests for comparing experiment results.
+
+ This module provides various statistical tests to determine if differences
+ between runs are statistically significant.
+ """
+
+ from __future__ import annotations
+
+ import math
+ import random
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Sequence
+
+
+ class StatisticalTest(str, Enum):
+     """Available statistical tests."""
+
+     T_TEST = "t_test"
+     BOOTSTRAP = "bootstrap"
+     PERMUTATION = "permutation"
+     NONE = "none"
+
+
+ @dataclass
+ class StatisticalTestResult:
+     """Result of a statistical test.
+
+     Attributes:
+         test_name: Name of the test performed
+         statistic: Test statistic value
+         p_value: P-value (probability of observing this difference by chance)
+         significant: Whether the difference is statistically significant
+         confidence_level: Confidence level used (e.g., 0.95 for 95%)
+         effect_size: Effect size (e.g., Cohen's d)
+         confidence_interval: Confidence interval for the difference
+     """
+
+     test_name: str
+     statistic: float
+     p_value: float
+     significant: bool
+     confidence_level: float = 0.95
+     effect_size: float | None = None
+     confidence_interval: tuple[float, float] | None = None
+
+     def __str__(self) -> str:
+         """Human-readable summary."""
+         sig_str = "significant" if self.significant else "not significant"
+         result = f"{self.test_name}: p={self.p_value:.4f} ({sig_str})"
+
+         if self.effect_size is not None:
+             result += f", effect_size={self.effect_size:.3f}"
+
+         if self.confidence_interval is not None:
+             low, high = self.confidence_interval
+             result += f", CI=[{low:.3f}, {high:.3f}]"
+
+         return result
+
+
+ def t_test(
+     samples_a: Sequence[float],
+     samples_b: Sequence[float],
+     *,
+     alpha: float = 0.05,
+     paired: bool = True,
+ ) -> StatisticalTestResult:
+     """Perform a t-test to compare two sets of samples.
+
+     Args:
+         samples_a: First set of samples
+         samples_b: Second set of samples
+         alpha: Significance level (default: 0.05 for 95% confidence)
+         paired: Whether to use paired t-test (default: True)
+
+     Returns:
+         StatisticalTestResult with test statistics and significance
+
+     Raises:
+         ValueError: If samples are empty or have mismatched lengths (for paired test)
+     """
+     if not samples_a or not samples_b:
+         raise ValueError("Cannot perform t-test on empty samples")
+
+     if paired and len(samples_a) != len(samples_b):
+         raise ValueError(
+             f"Paired t-test requires equal sample sizes. "
+             f"Got {len(samples_a)} and {len(samples_b)}"
+         )
+
+     n_a = len(samples_a)
+     n_b = len(samples_b)
+
+     # Calculate means
+     mean_a = sum(samples_a) / n_a
+     mean_b = sum(samples_b) / n_b
+
+     if paired:
+         # Paired t-test: test on differences
+         diffs = [a - b for a, b in zip(samples_a, samples_b)]
+         mean_diff = sum(diffs) / len(diffs)
+
+         # Standard deviation of differences
+         var_diff = sum((d - mean_diff) ** 2 for d in diffs) / (len(diffs) - 1) if len(diffs) > 1 else 0
+         se_diff = math.sqrt(var_diff / len(diffs))
+
+         # T-statistic
+         if se_diff > 1e-10:  # Non-zero standard error
+             t_stat = mean_diff / se_diff
+         elif abs(mean_diff) > 1e-10:  # Perfect consistency with non-zero difference
+             t_stat = float('inf') if mean_diff > 0 else float('-inf')
+         else:  # No difference at all
+             t_stat = 0.0
+
+         df = len(diffs) - 1
+
+         # Effect size (Cohen's d for paired samples)
+         sd_diff = math.sqrt(var_diff)
+         effect_size = mean_diff / sd_diff if sd_diff > 1e-10 else (1.0 if abs(mean_diff) > 1e-10 else 0.0)
+
+     else:
+         # Independent samples t-test
+         # Calculate pooled standard deviation
+         var_a = sum((x - mean_a) ** 2 for x in samples_a) / (n_a - 1) if n_a > 1 else 0
+         var_b = sum((x - mean_b) ** 2 for x in samples_b) / (n_b - 1) if n_b > 1 else 0
+
+         pooled_sd = math.sqrt(((n_a - 1) * var_a + (n_b - 1) * var_b) / (n_a + n_b - 2))
+         se = pooled_sd * math.sqrt(1/n_a + 1/n_b)
+
+         # T-statistic
+         t_stat = (mean_a - mean_b) / se if se > 0 else 0.0
+         df = n_a + n_b - 2
+
+         # Effect size (Cohen's d)
+         effect_size = (mean_a - mean_b) / pooled_sd if pooled_sd > 0 else 0.0
+
+     # Approximate p-value using t-distribution
+     # For simplicity, we use a conservative approximation
+     # In practice, you'd use scipy.stats.t.sf for accurate p-values
+     p_value = _approximate_t_test_p_value(abs(t_stat), df)
+
+     # Confidence interval (approximate)
+     # t_critical ≈ 2.0 for 95% CI and reasonable df
+     t_critical = 2.0  # Conservative estimate
+     margin = t_critical * (se_diff if paired else se)
+     ci = (mean_a - mean_b - margin, mean_a - mean_b + margin)
+
+     return StatisticalTestResult(
+         test_name="t-test (paired)" if paired else "t-test (independent)",
+         statistic=t_stat,
+         p_value=p_value,
+         significant=p_value < alpha,
+         confidence_level=1 - alpha,
+         effect_size=effect_size,
+         confidence_interval=ci,
+     )
+
+
+ def _approximate_t_test_p_value(t_stat: float, df: int) -> float:
+     """Approximate p-value for t-test.
+
+     This is a rough approximation. For accurate p-values, use scipy.stats.
+     """
+     # Very rough approximation based on standard normal
+     # This gets less accurate for small df
+     if df < 1:
+         return 1.0
+
+     # Convert to z-score approximation for large df
+     if df > 30:
+         z = t_stat
+         # Approximate p-value for two-tailed test
+         # P(|Z| > z) ≈ 2 * (1 - Φ(z))
+         if z > 6:
+             return 0.0
+         elif z < 0.5:
+             return 1.0
+         else:
+             # Rough approximation
+             return min(1.0, 2 * math.exp(-0.5 * z * z) / math.sqrt(2 * math.pi))
+
+     # For small df, be conservative
+     return min(1.0, 0.5 if t_stat < 2 else 0.1 if t_stat < 3 else 0.01)
+
+
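The comments in the hunk above point at scipy.stats for accurate p-values. A minimal sketch of that refinement (not part of the package; the helper name is illustrative, and scipy is assumed to be installed, with a fallback to the module's rough large-df normal approximation):

import math

def exact_two_tailed_p_value(t_stat: float, df: int) -> float:
    # Exact two-tailed p-value from the t-distribution when scipy is available;
    # otherwise reuse the rough normal approximation used by the module above.
    try:
        from scipy import stats
        return float(2 * stats.t.sf(abs(t_stat), df))
    except ImportError:
        z = abs(t_stat)
        return min(1.0, 2 * math.exp(-0.5 * z * z) / math.sqrt(2 * math.pi))
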
+ def bootstrap_confidence_interval(
+     samples_a: Sequence[float],
+     samples_b: Sequence[float],
+     *,
+     n_bootstrap: int = 10000,
+     confidence_level: float = 0.95,
+     statistic_fn: callable = None,
+     seed: int | None = None,
+ ) -> StatisticalTestResult:
+     """Compute bootstrap confidence interval for difference between two samples.
+
+     Uses bootstrap resampling to estimate the confidence interval for the
+     difference in means (or other statistic) between two samples.
+
+     Args:
+         samples_a: First set of samples
+         samples_b: Second set of samples
+         n_bootstrap: Number of bootstrap iterations (default: 10000)
+         confidence_level: Confidence level (default: 0.95)
+         statistic_fn: Function to compute statistic (default: mean difference)
+         seed: Random seed for reproducibility
+
+     Returns:
+         StatisticalTestResult with bootstrap confidence interval
+     """
+     if not samples_a or not samples_b:
+         raise ValueError("Cannot perform bootstrap on empty samples")
+
+     if seed is not None:
+         random.seed(seed)
+
+     # Default statistic: difference in means
+     if statistic_fn is None:
+         def statistic_fn(a, b):
+             return sum(a) / len(a) - sum(b) / len(b)
+
+     # Observed difference
+     observed_diff = statistic_fn(samples_a, samples_b)
+
+     # Bootstrap resampling
+     bootstrap_diffs = []
+     for _ in range(n_bootstrap):
+         # Resample with replacement
+         resampled_a = [random.choice(samples_a) for _ in range(len(samples_a))]
+         resampled_b = [random.choice(samples_b) for _ in range(len(samples_b))]
+
+         diff = statistic_fn(resampled_a, resampled_b)
+         bootstrap_diffs.append(diff)
+
+     # Sort for percentile method
+     bootstrap_diffs.sort()
+
+     # Compute confidence interval
+     alpha = 1 - confidence_level
+     lower_idx = int(n_bootstrap * (alpha / 2))
+     upper_idx = int(n_bootstrap * (1 - alpha / 2))
+
+     ci = (bootstrap_diffs[lower_idx], bootstrap_diffs[upper_idx])
+
+     # Check if 0 is in the confidence interval
+     significant = not (ci[0] <= 0 <= ci[1])
+
+     # Pseudo p-value: proportion of bootstrap samples with opposite sign
+     p_value = sum(1 for d in bootstrap_diffs if (d * observed_diff) < 0) / n_bootstrap
+     p_value = max(p_value, 1 / n_bootstrap)  # Lower bound
+
+     return StatisticalTestResult(
+         test_name=f"bootstrap (n={n_bootstrap})",
+         statistic=observed_diff,
+         p_value=p_value,
+         significant=significant,
+         confidence_level=confidence_level,
+         confidence_interval=ci,
+     )
+
+
+ def permutation_test(
+     samples_a: Sequence[float],
+     samples_b: Sequence[float],
+     *,
+     n_permutations: int = 10000,
+     alpha: float = 0.05,
+     statistic_fn: callable = None,
+     seed: int | None = None,
+ ) -> StatisticalTestResult:
+     """Perform permutation test to compare two samples.
+
+     Tests the null hypothesis that the two samples come from the same
+     distribution by randomly permuting the labels and computing the test
+     statistic.
+
+     Args:
+         samples_a: First set of samples
+         samples_b: Second set of samples
+         n_permutations: Number of permutations (default: 10000)
+         alpha: Significance level (default: 0.05)
+         statistic_fn: Function to compute statistic (default: difference in means)
+         seed: Random seed for reproducibility
+
+     Returns:
+         StatisticalTestResult with permutation test results
+     """
+     if not samples_a or not samples_b:
+         raise ValueError("Cannot perform permutation test on empty samples")
+
+     if seed is not None:
+         random.seed(seed)
+
+     # Default statistic: absolute difference in means
+     if statistic_fn is None:
+         def statistic_fn(a, b):
+             return abs(sum(a) / len(a) - sum(b) / len(b))
+
+     # Observed statistic
+     observed_stat = statistic_fn(samples_a, samples_b)
+
+     # Combine all samples
+     combined = list(samples_a) + list(samples_b)
+     n_a = len(samples_a)
+     n_total = len(combined)
+
+     # Permutation testing
+     more_extreme = 0
+     for _ in range(n_permutations):
+         # Shuffle and split
+         shuffled = combined.copy()
+         random.shuffle(shuffled)
+
+         perm_a = shuffled[:n_a]
+         perm_b = shuffled[n_a:]
+
+         perm_stat = statistic_fn(perm_a, perm_b)
+
+         if perm_stat >= observed_stat:
+             more_extreme += 1
+
+     # P-value: proportion of permutations as extreme as observed
+     p_value = more_extreme / n_permutations
+
+     return StatisticalTestResult(
+         test_name=f"permutation (n={n_permutations})",
+         statistic=observed_stat,
+         p_value=p_value,
+         significant=p_value < alpha,
+         confidence_level=1 - alpha,
+     )
+
+
+ def mcnemar_test(
+     contingency_table: tuple[int, int, int, int],
+     *,
+     alpha: float = 0.05,
+ ) -> StatisticalTestResult:
+     """Perform McNemar's test for paired nominal data.
+
+     Useful for comparing two models on the same test set, where you want to
+     know if one model consistently outperforms the other.
+
+     Args:
+         contingency_table: 2x2 contingency table as (n_00, n_01, n_10, n_11)
+             where n_ij = number of samples where model A predicts i and model B predicts j
+             (0 = incorrect, 1 = correct)
+         alpha: Significance level
+
+     Returns:
+         StatisticalTestResult with McNemar's test results
+     """
+     n_00, n_01, n_10, n_11 = contingency_table
+
+     # Only discordant pairs matter
+     b = n_01  # A wrong, B correct
+     c = n_10  # A correct, B wrong
+
+     if b + c == 0:
+         # No discordant pairs
+         return StatisticalTestResult(
+             test_name="McNemar's test",
+             statistic=0.0,
+             p_value=1.0,
+             significant=False,
+             confidence_level=1 - alpha,
+         )
+
+     # McNemar's statistic with continuity correction
+     chi_square = ((abs(b - c) - 1) ** 2) / (b + c)
+
+     # Approximate p-value (chi-square with 1 df)
+     # For chi-square > 3.84, p < 0.05
+     # For chi-square > 6.63, p < 0.01
+     if chi_square > 10.83:
+         p_value = 0.001
+     elif chi_square > 6.63:
+         p_value = 0.01
+     elif chi_square > 3.84:
+         p_value = 0.05
+     else:
+         # Rough linear approximation
+         p_value = 1.0 - (chi_square / 3.84) * 0.95
+
+     return StatisticalTestResult(
+         test_name="McNemar's test",
+         statistic=chi_square,
+         p_value=p_value,
+         significant=p_value < alpha,
+         confidence_level=1 - alpha,
+     )
+
+
+ __all__ = [
+     "StatisticalTest",
+     "StatisticalTestResult",
+     "t_test",
+     "bootstrap_confidence_interval",
+     "permutation_test",
+     "mcnemar_test",
+ ]
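The module above gives four entry points (t_test, bootstrap_confidence_interval, permutation_test, mcnemar_test), all returning StatisticalTestResult. A brief usage sketch; the per-example scores and contingency counts are invented for illustration, and the import path simply mirrors the file location themis/comparison/statistics.py:

from themis.comparison.statistics import (
    bootstrap_confidence_interval,
    mcnemar_test,
    permutation_test,
    t_test,
)

# Hypothetical per-example accuracy for the same items under two runs.
run_a = [0.82, 0.91, 0.78, 0.88, 0.85, 0.90, 0.79, 0.86]
run_b = [0.80, 0.89, 0.75, 0.84, 0.83, 0.88, 0.77, 0.82]

print(t_test(run_a, run_b, paired=True))                    # formatted by StatisticalTestResult.__str__
print(bootstrap_confidence_interval(run_a, run_b, seed=42))
print(permutation_test(run_a, run_b, seed=42))

# McNemar's test takes a 2x2 table (n_00, n_01, n_10, n_11) of per-item
# correctness for model A vs. model B (counts invented here).
print(mcnemar_test((40, 12, 5, 143)))
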
themis/core/entities.py CHANGED
@@ -58,13 +58,33 @@ class Reference(Generic[T]):
  For backward compatibility, it can be used without type parameters
  and will behave like Reference[Any].
 
+ The value field can hold any type including:
+ - Simple types: str, int, float, bool
+ - Collections: list, tuple, set
+ - Dictionaries: dict (for multi-value references)
+ - Custom objects
+
  Examples:
- # Untyped (backward compatible)
+ # Simple reference
  ref = Reference(kind="answer", value="42")
 
- # Typed
+ # Multi-value reference using dict
+ ref = Reference(
+     kind="countdown_task",
+     value={"target": 122, "numbers": [25, 50, 75, 100]}
+ )
+
+ # List reference
+ ref = Reference(kind="valid_answers", value=["yes", "no", "maybe"])
+
+ # Typed reference
  ref: Reference[str] = Reference(kind="answer", value="42")
- ref: Reference[int] = Reference(kind="answer", value=42)
+ ref: Reference[dict] = Reference(kind="task", value={"a": 1, "b": 2})
+
+ Note:
+     When using dict values, metrics can access individual fields directly:
+     >>> target = reference.value["target"]
+     >>> numbers = reference.value["numbers"]
  """
 
  kind: str
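A small sketch of the dict-valued usage the new docstring describes (field names follow the docstring's countdown example; the import path mirrors themis/core/entities.py):

from themis.core.entities import Reference

ref = Reference(
    kind="countdown_task",
    value={"target": 122, "numbers": [25, 50, 75, 100]},
)

# Metrics can read individual fields straight off the dict value.
target = ref.value["target"]      # 122
numbers = ref.value["numbers"]    # [25, 50, 75, 100]
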
themis/evaluation/metrics/code/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """Code generation evaluation metrics.
+
+ This module provides metrics for evaluating code generation tasks:
+ - Pass@k: Functional correctness with k samples
+ - CodeBLEU: Code-aware BLEU variant
+ - ExecutionAccuracy: Safe code execution and testing
+ """
+
+ from themis.evaluation.metrics.code.pass_at_k import PassAtK, estimate_pass_at_k
+ from themis.evaluation.metrics.code.codebleu import CodeBLEU
+ from themis.evaluation.metrics.code.execution import ExecutionAccuracy, ExecutionResult
+
+ __all__ = [
+     "PassAtK",
+     "estimate_pass_at_k",
+     "CodeBLEU",
+     "ExecutionAccuracy",
+     "ExecutionResult",
+ ]
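Since the package root re-exports everything in __all__, callers can import the code metrics from one place; a minimal sketch (constructor arguments for PassAtK and ExecutionAccuracy are not visible in this diff, so none are shown):

from themis.evaluation.metrics.code import (
    CodeBLEU,
    ExecutionAccuracy,
    ExecutionResult,
    PassAtK,
    estimate_pass_at_k,
)

codebleu = CodeBLEU(lang="python")  # needs the optional codebleu dependency (see below)
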
themis/evaluation/metrics/code/codebleu.py ADDED
@@ -0,0 +1,144 @@
+ """CodeBLEU metric for code generation evaluation.
+
+ CodeBLEU extends BLEU with syntax awareness using abstract syntax trees (AST)
+ and data flow matching.
+
+ References:
+     Ren et al. (2020). CodeBLEU: a Method for Automatic Evaluation of Code Synthesis.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Sequence
+
+ from themis.core.entities import MetricScore
+ from themis.interfaces import Metric
+
+
+ class CodeBLEU(Metric):
+     """CodeBLEU metric for code generation.
+
+     CodeBLEU combines:
+     - N-gram matching (like BLEU)
+     - Syntax matching (AST-based)
+     - Data flow matching (variable dependencies)
+
+     It's more suitable for code evaluation than plain BLEU as it considers
+     code structure and semantics, not just surface form.
+
+     Attributes:
+         name: Metric identifier ("codebleu")
+         lang: Programming language ("python", "java", "javascript", etc.)
+         weights: Weights for [ngram, syntax, dataflow] components
+
+     Example:
+         >>> from themis.evaluation.metrics.code import CodeBLEU
+         >>> metric = CodeBLEU(lang="python")
+         >>> score = metric.compute(
+         ...     prediction="def add(a, b):\\n return a + b",
+         ...     references=["def add(x, y):\\n return x + y"]
+         ... )
+         >>> print(f"CodeBLEU: {score.value:.4f}")
+         CodeBLEU: 0.8234
+     """
+
+     requires_reference = True
+
+     def __init__(
+         self,
+         lang: str = "python",
+         weights: tuple[float, float, float] = (0.25, 0.25, 0.50),
+         alpha: float = 0.25,
+         beta: float = 0.25,
+         gamma: float = 0.50,
+         theta: float = 0.0,
+     ):
+         """Initialize CodeBLEU metric.
+
+         Args:
+             lang: Programming language ("python", "java", "javascript", "go", "php", "ruby")
+             weights: Weights for [ngram, weighted_ngram, syntax, dataflow].
+                 Default: (0.25, 0.25, 0.25, 0.25)
+             alpha: Weight for n-gram matching
+             beta: Weight for weighted n-gram matching
+             gamma: Weight for syntax matching
+             theta: Weight for data flow matching
+         """
+         self.name = "codebleu"
+         self.lang = lang
+         self.alpha = alpha
+         self.beta = beta
+         self.gamma = gamma
+         self.theta = theta
+
+         # Lazy import codebleu (not required for all users)
+         try:
+             from codebleu import calc_codebleu
+             self._calc_codebleu = calc_codebleu
+         except ImportError:
+             raise ImportError(
+                 "codebleu is required for CodeBLEU metric. "
+                 "Install it with: pip install codebleu"
+             )
+
+     def compute(
+         self,
+         *,
+         prediction: Any,
+         references: Sequence[Any],
+         metadata: dict[str, Any] | None = None,
+     ) -> MetricScore:
+         """Compute CodeBLEU score.
+
+         Args:
+             prediction: Generated code (already extracted by pipeline)
+             references: List of reference code implementations
+             metadata: Optional metadata dict
+
+         Returns:
+             MetricScore with CodeBLEU value and component scores
+         """
+         # Convert to strings
+         pred_str = str(prediction)
+         ref_strs = [str(ref) for ref in references]
+
+         try:
+             # Compute CodeBLEU
+             result = self._calc_codebleu(
+                 references=[ref_strs],  # List of reference lists
+                 predictions=[pred_str],  # List of predictions
+                 lang=self.lang,
+                 weights=(self.alpha, self.beta, self.gamma, self.theta),
+             )
+
+             codebleu_score = result["codebleu"]
+
+             return MetricScore(
+                 metric_name=self.name,
+                 value=codebleu_score,
+                 details={
+                     "codebleu": codebleu_score,
+                     "ngram_match_score": result.get("ngram_match_score", 0.0),
+                     "weighted_ngram_match_score": result.get("weighted_ngram_match_score", 0.0),
+                     "syntax_match_score": result.get("syntax_match_score", 0.0),
+                     "dataflow_match_score": result.get("dataflow_match_score", 0.0),
+                     "lang": self.lang,
+                     "num_references": len(ref_strs),
+                 },
+                 metadata=metadata or {},
+             )
+
+         except Exception as e:
+             # Handle parsing errors (invalid code, unsupported language, etc.)
+             return MetricScore(
+                 metric_name=self.name,
+                 value=0.0,
+                 details={
+                     "error": str(e),
+                     "lang": self.lang,
+                 },
+                 metadata=metadata or {},
+             )
+
+
+ __all__ = ["CodeBLEU"]
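A short end-to-end sketch for the metric above (the commented values are illustrative, not real outputs; pip install codebleu is needed, as the lazy import enforces):

from themis.evaluation.metrics.code import CodeBLEU

metric = CodeBLEU(lang="python")
score = metric.compute(
    prediction="def add(a, b):\n    return a + b",
    references=["def add(x, y):\n    return x + y"],
)

print(score.value)                            # overall CodeBLEU in [0, 1]
print(score.details["syntax_match_score"])    # AST component
if "error" in score.details:                  # parse/runtime failures fall back to 0.0
    print("CodeBLEU failed:", score.details["error"])

Note that compute forwards (alpha, beta, gamma, theta) to calc_codebleu; the three-element weights parameter in the constructor signature is accepted but neither stored nor used, so component weighting is controlled entirely by the four scalar arguments.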