themis-eval 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +48 -6
  27. themis/experiment/storage.py +1313 -110
  28. themis/integrations/huggingface.py +12 -1
  29. themis/integrations/wandb.py +13 -1
  30. themis/interfaces/__init__.py +86 -0
  31. themis/presets/__init__.py +10 -0
  32. themis/presets/benchmarks.py +354 -0
  33. themis/presets/models.py +190 -0
  34. themis/server/__init__.py +28 -0
  35. themis/server/app.py +337 -0
  36. themis_eval-0.2.0.dist-info/METADATA +596 -0
  37. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/RECORD +40 -17
  38. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  39. themis_eval-0.1.1.dist-info/METADATA +0 -758
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  41. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/integrations/huggingface.py
@@ -3,8 +3,15 @@ from __future__ import annotations
  import json
  from dataclasses import asdict, is_dataclass
  from pathlib import Path
+ from typing import TYPE_CHECKING
 
- from huggingface_hub import HfApi
+ if TYPE_CHECKING:
+     from huggingface_hub import HfApi
+ else:
+     try:
+         from huggingface_hub import HfApi
+     except ImportError:
+         HfApi = None  # type: ignore
 
  from themis.config.schema import HuggingFaceHubConfig
  from themis.core.entities import ExperimentReport
@@ -24,6 +31,10 @@ def to_dict(obj):
 
  class HuggingFaceHubUploader:
      def __init__(self, config: HuggingFaceHubConfig):
+         if HfApi is None:
+             raise ImportError(
+                 "huggingface_hub is not installed. Install with: pip install huggingface_hub"
+             )
          self.config = config
          self.api = HfApi()
 
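A minimal usage sketch (not part of the diff) of how the new guard behaves: importing themis.integrations.huggingface no longer requires huggingface_hub, and a missing dependency only surfaces when the uploader is constructed. The default-constructed HuggingFaceHubConfig below is an assumption made for illustration.

from themis.config.schema import HuggingFaceHubConfig
from themis.integrations.huggingface import HuggingFaceHubUploader  # import succeeds even without huggingface_hub

config = HuggingFaceHubConfig()  # assumed to be default-constructible; adjust to your config schema

try:
    uploader = HuggingFaceHubUploader(config)
except ImportError as exc:
    # Raised by the new __init__ guard only when huggingface_hub is absent
    print(f"Optional dependency not available: {exc}")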
themis/integrations/wandb.py
@@ -1,6 +1,14 @@
  from __future__ import annotations
 
- import wandb
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     import wandb
+ else:
+     try:
+         import wandb
+     except ImportError:
+         wandb = None  # type: ignore
 
  from themis.config.schema import WandbConfig
  from themis.core.entities import ExperimentReport
@@ -8,6 +16,10 @@ from themis.core.entities import ExperimentReport
 
  class WandbTracker:
      def __init__(self, config: WandbConfig):
+         if wandb is None:
+             raise ImportError(
+                 "wandb is not installed. Install with: pip install wandb"
+             )
          self.config = config
 
      def init(self, experiment_config: dict) -> None:
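Both integrations now follow the same lazy optional-import pattern: the real import runs only for type checkers or inside a try/except, and the constructor raises a clear ImportError when the extra is missing. A sketch of how a hypothetical additional integration could adopt the same convention (mlflow is used purely as an illustrative optional dependency; this tracker class is not part of the package):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import mlflow  # seen only by type checkers
else:
    try:
        import mlflow  # optional runtime dependency
    except ImportError:
        mlflow = None  # type: ignore


class MlflowTracker:
    """Illustrative tracker following the same optional-dependency convention."""

    def __init__(self) -> None:
        if mlflow is None:
            raise ImportError(
                "mlflow is not installed. Install with: pip install mlflow"
            )
        self._client = mlflow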
themis/interfaces/__init__.py
@@ -56,11 +56,67 @@ class DatasetAdapter(Protocol):
 
 
  class Extractor(Protocol):
+     """Protocol for extractors that parse model output.
+
+     Extractors are responsible for parsing raw model output text and
+     extracting the relevant answer or prediction. The evaluation pipeline
+     calls the extractor before passing the result to metrics.
+
+     Example:
+         >>> class JsonExtractor:
+         ...     def extract(self, raw_output: str) -> Any:
+         ...         import json
+         ...         return json.loads(raw_output)["answer"]
+     """
+
      def extract(self, raw_output: str) -> Any:  # pragma: no cover - protocol
+         """Extract prediction from raw model output.
+
+         Args:
+             raw_output: Raw text output from the model
+
+         Returns:
+             Extracted prediction (type depends on extractor implementation)
+
+         Raises:
+             FieldExtractionError: If extraction fails
+         """
          ...
 
 
  class Metric(ABC):
+     """Abstract base class for evaluation metrics.
+
+     Metrics compute scores by comparing model predictions against reference values.
+     The evaluation pipeline handles extraction before passing data to metrics.
+
+     IMPORTANT - Extractor Contract:
+         The 'prediction' parameter receives EXTRACTED output from the extractor,
+         NOT raw model output. Metrics should NOT attempt to re-extract or parse
+         the prediction - it has already been processed by the pipeline's extractor.
+
+     Example flow:
+         1. Model generates: "<think>reasoning</think><answer>42</answer>"
+         2. Extractor extracts: "42"
+         3. Metric receives: prediction="42" (already extracted)
+
+     Attributes:
+         name: Unique metric identifier
+         requires_reference: Whether metric needs reference values (default: True)
+
+     Example:
+         >>> class ExactMatch(Metric):
+         ...     name = "exact_match"
+         ...
+         ...     def compute(self, *, prediction, references, metadata=None):
+         ...         # prediction is already extracted - no parsing needed
+         ...         is_correct = any(prediction == ref for ref in references)
+         ...         return MetricScore(
+         ...             metric_name=self.name,
+         ...             value=1.0 if is_correct else 0.0
+         ...         )
+     """
+
      name: str
      requires_reference: bool = True
 
@@ -72,6 +128,36 @@ class Metric(ABC):
          references: Sequence[Any],
          metadata: dict[str, Any] | None = None,
      ) -> entities.MetricScore:  # pragma: no cover - abstract
+         """Compute metric score.
+
+         Args:
+             prediction: Extracted prediction from model output (already processed
+                 by extractor - do NOT re-extract or parse). Type depends on the
+                 extractor used in the pipeline.
+             references: List of reference values in normalized format. Each element
+                 can be:
+                 - A scalar value (str, int, float, bool)
+                 - A dict (for multi-value references like {"target": 122, "numbers": [...]})
+                 - Any other type from the original reference
+             metadata: Optional metadata dict containing:
+                 - "sample_id": Sample identifier (if available)
+                 - Additional task-specific metadata
+
+         Returns:
+             MetricScore with computed value and optional details
+
+         Note:
+             The prediction parameter is already extracted by the pipeline's extractor.
+             Metrics should work with the extracted value directly, not attempt to
+             parse or extract again from raw output.
+
+         Example:
+             >>> def compute(self, *, prediction, references, metadata=None):
+             ...     # prediction is already extracted (e.g., "42", not "<answer>42</answer>")
+             ...     # references is a list (e.g., ["42"] or [{"target": 42, "numbers": [...]}])
+             ...     score_value = self._compare(prediction, references)
+             ...     return MetricScore(metric_name=self.name, value=score_value)
+         """
          raise NotImplementedError
 
 
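The new docstrings pin down the extractor/metric contract: the extractor parses raw model output once, and metrics receive the already-extracted prediction. A hedged sketch of a custom pair following that contract; the MetricScore import path is assumed from the entities.MetricScore reference above, and both class names are hypothetical:

import re
from typing import Any, Sequence

from themis.core.entities import MetricScore  # path assumed from entities.MetricScore above
from themis.interfaces import Metric


class TagAnswerExtractor:
    """Hypothetical extractor: pulls the text inside <answer>...</answer> tags."""

    def extract(self, raw_output: str) -> Any:
        match = re.search(r"<answer>(.*?)</answer>", raw_output, re.DOTALL)
        # Fall back to the raw text when no tag is present
        return match.group(1).strip() if match else raw_output.strip()


class NormalizedExactMatch(Metric):
    """Hypothetical metric: compares the pre-extracted prediction to references."""

    name = "normalized_exact_match"

    def compute(
        self,
        *,
        prediction: Any,
        references: Sequence[Any],
        metadata: dict[str, Any] | None = None,
    ) -> MetricScore:
        # prediction arrives already extracted; no re-parsing of raw output here
        pred = str(prediction).strip().lower()
        is_correct = any(pred == str(ref).strip().lower() for ref in references)
        return MetricScore(metric_name=self.name, value=1.0 if is_correct else 0.0)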
themis/presets/__init__.py
@@ -0,0 +1,10 @@
+ """Preset configurations for common benchmarks and models.
+
+ This module provides automatic configuration for popular benchmarks,
+ eliminating the need for manual setup of prompts, metrics, and extractors.
+ """
+
+ from themis.presets.benchmarks import get_benchmark_preset, list_benchmarks
+ from themis.presets.models import parse_model_name
+
+ __all__ = ["get_benchmark_preset", "list_benchmarks", "parse_model_name"]
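A minimal usage sketch of the new presets API, based on the BenchmarkPreset fields defined in themis/presets/benchmarks.py below; the printed benchmark list reflects the built-in registrations and may differ in a given install:

from themis.presets import get_benchmark_preset, list_benchmarks

print(list_benchmarks())  # e.g. ['aime24', 'demo', 'gsm8k', 'math500', 'mmlu-pro', 'supergpqa']

preset = get_benchmark_preset("gsm8k")       # lookup is case-insensitive
samples = preset.load_dataset(limit=8)        # loader may download from the Hugging Face Hub
prompt = preset.prompt_template               # paired with preset.metrics and preset.extractor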
themis/presets/benchmarks.py
@@ -0,0 +1,354 @@
+ """Benchmark preset configurations.
+
+ This module provides pre-configured settings for popular benchmarks,
+ including prompts, metrics, extractors, and data loaders.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any, Callable, Sequence
+
+ from themis.generation.templates import PromptTemplate
+ from themis.interfaces import Extractor, Metric
+
+
+ @dataclass
+ class BenchmarkPreset:
+     """Configuration preset for a benchmark.
+
+     Attributes:
+         name: Benchmark name
+         prompt_template: Default prompt template
+         metrics: List of metric instances
+         extractor: Output extractor
+         dataset_loader: Function to load the dataset
+         metadata_fields: Fields to include in task metadata
+         reference_field: Field containing the reference answer
+         dataset_id_field: Field containing the sample ID
+         description: Human-readable description
+     """
+
+     name: str
+     prompt_template: PromptTemplate
+     metrics: list[Metric]
+     extractor: Extractor
+     dataset_loader: Callable[[int | None], Sequence[dict[str, Any]]]
+     metadata_fields: tuple[str, ...] = field(default_factory=tuple)
+     reference_field: str = "answer"
+     dataset_id_field: str = "id"
+     description: str = ""
+
+     def load_dataset(self, limit: int | None = None) -> Sequence[dict[str, Any]]:
+         """Load the benchmark dataset.
+
+         Args:
+             limit: Maximum number of samples to load
+
+         Returns:
+             List of dataset samples
+         """
+         return self.dataset_loader(limit)
+
+
+ # Registry of benchmark presets
+ _BENCHMARK_REGISTRY: dict[str, BenchmarkPreset] = {}
+ _REGISTRY_INITIALIZED = False
+
+
+ def _ensure_registry_initialized() -> None:
+     """Initialize benchmark registry on first use (lazy loading)."""
+     global _REGISTRY_INITIALIZED
+     if not _REGISTRY_INITIALIZED:
+         _register_all_benchmarks()
+         _REGISTRY_INITIALIZED = True
+
+
+ def register_benchmark(preset: BenchmarkPreset) -> None:
+     """Register a benchmark preset.
+
+     Args:
+         preset: Benchmark preset configuration
+     """
+     _BENCHMARK_REGISTRY[preset.name.lower()] = preset
+
+
+ def get_benchmark_preset(name: str) -> BenchmarkPreset:
+     """Get a benchmark preset by name.
+
+     Args:
+         name: Benchmark name (case-insensitive)
+
+     Returns:
+         Benchmark preset
+
+     Raises:
+         ValueError: If benchmark is not found
+     """
+     _ensure_registry_initialized()
+
+     name_lower = name.lower()
+     if name_lower not in _BENCHMARK_REGISTRY:
+         available = ", ".join(sorted(_BENCHMARK_REGISTRY.keys()))
+         raise ValueError(
+             f"Unknown benchmark: {name}. "
+             f"Available benchmarks: {available}"
+         )
+     return _BENCHMARK_REGISTRY[name_lower]
+
+
+ def list_benchmarks() -> list[str]:
+     """List all registered benchmark names.
+
+     Returns:
+         Sorted list of benchmark names
+     """
+     _ensure_registry_initialized()
+     return sorted(_BENCHMARK_REGISTRY.keys())
+
+
+ # ============================================================================
+ # Math Benchmarks
+ # ============================================================================
+
+ def _create_math500_preset() -> BenchmarkPreset:
+     """Create MATH-500 benchmark preset."""
+     from themis.datasets.math500 import load_math500 as load_math500_dataset
+     from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+     from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+     def load_math500(limit: int | None = None) -> Sequence[dict[str, Any]]:
+         samples = load_math500_dataset(source="huggingface", limit=limit)
+         # Convert MathSample objects to dicts
+         return [s.to_generation_example() if hasattr(s, 'to_generation_example') else dict(s) for s in samples]
+
+     prompt_template = PromptTemplate(
+         name="math500-zero-shot",
+         template=(
+             "Solve the following math problem step by step. "
+             "Put your final answer in \\boxed{{}}.\n\n"
+             "Problem: {problem}\n\n"
+             "Solution:"
+         ),
+     )
+
+     return BenchmarkPreset(
+         name="math500",
+         prompt_template=prompt_template,
+         metrics=[MathVerifyAccuracy()],
+         extractor=MathVerifyExtractor(),
+         dataset_loader=load_math500,
+         metadata_fields=("subject", "level"),
+         reference_field="solution",
+         dataset_id_field="unique_id",
+         description="MATH-500 dataset with 500 competition math problems",
+     )
+
+
+ def _create_gsm8k_preset() -> BenchmarkPreset:
+     """Create GSM8K benchmark preset."""
+     from themis.datasets.gsm8k import load_gsm8k as load_gsm8k_dataset
+     from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+     from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+     def load_gsm8k(limit: int | None = None) -> Sequence[dict[str, Any]]:
+         samples = load_gsm8k_dataset(source="huggingface", split="test", limit=limit)
+         # Convert sample objects to dicts if needed
+         return [dict(s) if not isinstance(s, dict) else s for s in samples]
+
+     prompt_template = PromptTemplate(
+         name="gsm8k-zero-shot",
+         template=(
+             "Solve this math problem step by step.\n\n"
+             "Q: {question}\n"
+             "A:"
+         ),
+     )
+
+     return BenchmarkPreset(
+         name="gsm8k",
+         prompt_template=prompt_template,
+         metrics=[MathVerifyAccuracy()],
+         extractor=MathVerifyExtractor(),
+         dataset_loader=load_gsm8k,
+         metadata_fields=(),
+         reference_field="answer",
+         dataset_id_field="id",
+         description="GSM8K dataset with grade school math word problems",
+     )
+
+
+ def _create_aime24_preset() -> BenchmarkPreset:
+     """Create AIME 2024 benchmark preset."""
+     from themis.datasets.competition_math import load_competition_math
+     from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+     from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+     def load_aime24(limit: int | None = None) -> Sequence[dict[str, Any]]:
+         samples = load_competition_math(
+             dataset_id="aime24",
+             source="huggingface",
+             split="test",
+             limit=limit,
+         )
+         return [dict(s) if not isinstance(s, dict) else s for s in samples]
+
+     prompt_template = PromptTemplate(
+         name="aime24-zero-shot",
+         template=(
+             "Solve the following AIME problem. "
+             "Your answer should be a number between 000 and 999.\n\n"
+             "Problem: {problem}\n\n"
+             "Solution:"
+         ),
+     )
+
+     return BenchmarkPreset(
+         name="aime24",
+         prompt_template=prompt_template,
+         metrics=[MathVerifyAccuracy()],
+         extractor=MathVerifyExtractor(),
+         dataset_loader=load_aime24,
+         metadata_fields=("subject",),
+         reference_field="answer",
+         dataset_id_field="id",
+         description="AIME 2024 competition math problems",
+     )
+
+
+ # ============================================================================
+ # MCQ Benchmarks
+ # ============================================================================
+
+ def _create_mmlu_pro_preset() -> BenchmarkPreset:
+     """Create MMLU-Pro benchmark preset."""
+     from themis.datasets.mmlu_pro import load_mmlu_pro as load_mmlu_pro_dataset
+     from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+     from themis.evaluation.metrics.exact_match import ExactMatch
+
+     def load_mmlu_pro(limit: int | None = None) -> Sequence[dict[str, Any]]:
+         samples = load_mmlu_pro_dataset(source="huggingface", split="test", limit=limit)
+         return [dict(s) if not isinstance(s, dict) else s for s in samples]
+
+     prompt_template = PromptTemplate(
+         name="mmlu-pro-zero-shot",
+         template=(
+             "Answer the following multiple choice question.\n\n"
+             "Question: {question}\n\n"
+             "Options:\n{options}\n\n"
+             "Answer:"
+         ),
+     )
+
+     return BenchmarkPreset(
+         name="mmlu-pro",
+         prompt_template=prompt_template,
+         metrics=[ExactMatch()],
+         extractor=IdentityExtractor(),
+         dataset_loader=load_mmlu_pro,
+         metadata_fields=("category",),
+         reference_field="answer",
+         dataset_id_field="id",
+         description="MMLU-Pro professional-level multiple choice questions",
+     )
+
+
+ def _create_supergpqa_preset() -> BenchmarkPreset:
+     """Create SuperGPQA benchmark preset."""
+     from themis.datasets.super_gpqa import load_super_gpqa as load_supergpqa_dataset
+     from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+     from themis.evaluation.metrics.exact_match import ExactMatch
+
+     def load_supergpqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+         samples = load_supergpqa_dataset(source="huggingface", split="test", limit=limit)
+         return [dict(s) if not isinstance(s, dict) else s for s in samples]
+
+     prompt_template = PromptTemplate(
+         name="supergpqa-zero-shot",
+         template=(
+             "Answer the following science question.\n\n"
+             "Question: {question}\n\n"
+             "Choices:\n{choices}\n\n"
+             "Answer:"
+         ),
+     )
+
+     return BenchmarkPreset(
+         name="supergpqa",
+         prompt_template=prompt_template,
+         metrics=[ExactMatch()],
+         extractor=IdentityExtractor(),
+         dataset_loader=load_supergpqa,
+         metadata_fields=("subject",),
+         reference_field="answer",
+         dataset_id_field="id",
+         description="SuperGPQA graduate-level science questions",
+     )
+
+
+ # ============================================================================
+ # Demo/Test Benchmarks
+ # ============================================================================
+
+ def _create_demo_preset() -> BenchmarkPreset:
+     """Create demo benchmark preset for testing."""
+     from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+     from themis.evaluation.metrics.exact_match import ExactMatch
+
+     def load_demo(limit: int | None = None) -> Sequence[dict[str, Any]]:
+         samples = [
+             {"id": "demo-1", "question": "What is 2 + 2?", "answer": "4"},
+             {"id": "demo-2", "question": "What is the capital of France?", "answer": "Paris"},
+             {"id": "demo-3", "question": "What is 10 * 5?", "answer": "50"},
+         ]
+         if limit is not None:
+             samples = samples[:limit]
+         return samples
+
+     prompt_template = PromptTemplate(
+         name="demo",
+         template="Q: {question}\nA:",
+     )
+
+     return BenchmarkPreset(
+         name="demo",
+         prompt_template=prompt_template,
+         metrics=[ExactMatch()],
+         extractor=IdentityExtractor(),
+         dataset_loader=load_demo,
+         metadata_fields=(),
+         reference_field="answer",
+         dataset_id_field="id",
+         description="Demo benchmark for testing",
+     )
+
+
+ # ============================================================================
+ # Register all benchmarks (lazy initialization)
+ # ============================================================================
+
+ def _register_all_benchmarks() -> None:
+     """Register all built-in benchmarks.
+
+     This is called lazily on first use to avoid importing heavy dependencies
+     (datasets, models, etc.) until actually needed.
+     """
+     # Math benchmarks
+     register_benchmark(_create_math500_preset())
+     register_benchmark(_create_gsm8k_preset())
+     register_benchmark(_create_aime24_preset())
+
+     # MCQ benchmarks
+     register_benchmark(_create_mmlu_pro_preset())
+     register_benchmark(_create_supergpqa_preset())
+
+     # Demo
+     register_benchmark(_create_demo_preset())
+
+
+ __all__ = [
+     "BenchmarkPreset",
+     "register_benchmark",
+     "get_benchmark_preset",
+     "list_benchmarks",
+ ]
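Because register_benchmark and BenchmarkPreset are exported, a project can add its own preset alongside the built-ins. A sketch under the assumption that the ExactMatch, IdentityExtractor, and PromptTemplate imports used by the factories above are available in the installed package; the benchmark name and samples are made up for illustration:

from themis.evaluation.extractors.identity_extractor import IdentityExtractor
from themis.evaluation.metrics.exact_match import ExactMatch
from themis.generation.templates import PromptTemplate
from themis.presets.benchmarks import BenchmarkPreset, get_benchmark_preset, register_benchmark


def _load_my_samples(limit=None):
    # Matches the dataset_loader contract: Callable[[int | None], Sequence[dict]]
    samples = [
        {"id": "q1", "question": "Largest planet in the solar system?", "answer": "Jupiter"},
        {"id": "q2", "question": "How many sides does a hexagon have?", "answer": "6"},
    ]
    return samples if limit is None else samples[:limit]


register_benchmark(
    BenchmarkPreset(
        name="my-benchmark",
        prompt_template=PromptTemplate(name="my-benchmark", template="Q: {question}\nA:"),
        metrics=[ExactMatch()],
        extractor=IdentityExtractor(),
        dataset_loader=_load_my_samples,
        description="Custom preset registered alongside the built-ins",
    )
)

preset = get_benchmark_preset("my-benchmark")  # lookup is case-insensitive
print(preset.load_dataset(limit=1))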