themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/interfaces/__init__.py (new file)
@@ -0,0 +1,169 @@

"""Interfaces (ports) that external adapters must implement."""

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any, Iterable, Protocol, Sequence, runtime_checkable

from themis.core import entities


class ModelProvider(ABC):
    """Abstract interface for anything capable of fulfilling generation tasks."""

    @abstractmethod
    def generate(
        self, task: entities.GenerationTask
    ) -> entities.GenerationRecord:  # pragma: no cover - abstract
        raise NotImplementedError


@runtime_checkable
class DatasetAdapter(Protocol):
    """Protocol for dataset adapters that produce raw samples for experiments.

    This is a structural protocol that can be satisfied by any class implementing
    the required methods, without explicit inheritance. The @runtime_checkable
    decorator allows isinstance() checks at runtime.

    Required Methods:
        iter_samples: Returns an iterable of sample dictionaries

    Example:
        >>> class MyDataset:
        ...     def iter_samples(self):
        ...         return iter([{"id": "1", "text": "sample"}])
        ...
        >>> isinstance(MyDataset(), DatasetAdapter)  # True at runtime

    Note:
        Classes do not need to explicitly inherit from this protocol.
        Duck typing is sufficient - any class with an iter_samples() method
        will be recognized as a DatasetAdapter at runtime.
    """

    def iter_samples(self) -> Iterable[dict[str, Any]]:  # pragma: no cover - protocol
        """Iterate over dataset samples.

        Returns:
            Iterable of dictionaries, each representing a dataset sample

        Example:
            >>> for sample in dataset.iter_samples():
            ...     print(sample["id"])
        """
        ...


class Extractor(Protocol):
    """Protocol for extractors that parse model output.

    Extractors are responsible for parsing raw model output text and
    extracting the relevant answer or prediction. The evaluation pipeline
    calls the extractor before passing the result to metrics.

    Example:
        >>> class JsonExtractor:
        ...     def extract(self, raw_output: str) -> Any:
        ...         import json
        ...         return json.loads(raw_output)["answer"]
    """

    def extract(self, raw_output: str) -> Any:  # pragma: no cover - protocol
        """Extract prediction from raw model output.

        Args:
            raw_output: Raw text output from the model

        Returns:
            Extracted prediction (type depends on extractor implementation)

        Raises:
            FieldExtractionError: If extraction fails
        """
        ...


class Metric(ABC):
    """Abstract base class for evaluation metrics.

    Metrics compute scores by comparing model predictions against reference values.
    The evaluation pipeline handles extraction before passing data to metrics.

    IMPORTANT - Extractor Contract:
        The 'prediction' parameter receives EXTRACTED output from the extractor,
        NOT raw model output. Metrics should NOT attempt to re-extract or parse
        the prediction - it has already been processed by the pipeline's extractor.

        Example flow:
            1. Model generates: "<think>reasoning</think><answer>42</answer>"
            2. Extractor extracts: "42"
            3. Metric receives: prediction="42" (already extracted)

    Attributes:
        name: Unique metric identifier
        requires_reference: Whether metric needs reference values (default: True)

    Example:
        >>> class ExactMatch(Metric):
        ...     name = "exact_match"
        ...
        ...     def compute(self, *, prediction, references, metadata=None):
        ...         # prediction is already extracted - no parsing needed
        ...         is_correct = any(prediction == ref for ref in references)
        ...         return MetricScore(
        ...             metric_name=self.name,
        ...             value=1.0 if is_correct else 0.0
        ...         )
    """

    name: str
    requires_reference: bool = True

    @abstractmethod
    def compute(
        self,
        *,
        prediction: Any,
        references: Sequence[Any],
        metadata: dict[str, Any] | None = None,
    ) -> entities.MetricScore:  # pragma: no cover - abstract
        """Compute metric score.

        Args:
            prediction: Extracted prediction from model output (already processed
                by extractor - do NOT re-extract or parse). Type depends on the
                extractor used in the pipeline.
            references: List of reference values in normalized format. Each element
                can be:
                - A scalar value (str, int, float, bool)
                - A dict (for multi-value references like {"target": 122, "numbers": [...]})
                - Any other type from the original reference
            metadata: Optional metadata dict containing:
                - "sample_id": Sample identifier (if available)
                - Additional task-specific metadata

        Returns:
            MetricScore with computed value and optional details

        Note:
            The prediction parameter is already extracted by the pipeline's extractor.
            Metrics should work with the extracted value directly, not attempt to
            parse or extract again from raw output.

        Example:
            >>> def compute(self, *, prediction, references, metadata=None):
            ...     # prediction is already extracted (e.g., "42", not "<answer>42</answer>")
            ...     # references is a list (e.g., ["42"] or [{"target": 42, "numbers": [...]}])
            ...     score_value = self._compare(prediction, references)
            ...     return MetricScore(metric_name=self.name, value=score_value)
        """
        raise NotImplementedError


__all__ = [
    "ModelProvider",
    "DatasetAdapter",
    "Extractor",
    "Metric",
]
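Note: the Metric ABC above is the extension point for custom scoring. A minimal sketch of a downstream metric built against this interface follows; it assumes MetricScore accepts metric_name= and value= keyword arguments, as shown in the docstring examples above, and is not taken from the package source.

# Minimal sketch of a custom metric against the Metric ABC shown above.
# Assumes entities.MetricScore(metric_name=..., value=...) as in the docstring
# examples; names here are illustrative, not part of the package.
from __future__ import annotations

from typing import Any, Sequence

from themis.core import entities
from themis.interfaces import Metric


class CaseInsensitiveMatch(Metric):
    """Scores 1.0 when the extracted prediction equals any reference, ignoring case."""

    name = "case_insensitive_match"
    requires_reference = True

    def compute(
        self,
        *,
        prediction: Any,
        references: Sequence[Any],
        metadata: dict[str, Any] | None = None,
    ) -> entities.MetricScore:
        # prediction arrives already extracted by the pipeline's extractor,
        # so only normalization and comparison happen here.
        pred = str(prediction).strip().lower()
        hit = any(pred == str(ref).strip().lower() for ref in references)
        return entities.MetricScore(metric_name=self.name, value=1.0 if hit else 0.0)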
themis/presets/__init__.py (new file)
@@ -0,0 +1,10 @@

"""Preset configurations for common benchmarks and models.

This module provides automatic configuration for popular benchmarks,
eliminating the need for manual setup of prompts, metrics, and extractors.
"""

from themis.presets.benchmarks import get_benchmark_preset, list_benchmarks
from themis.presets.models import parse_model_name

__all__ = ["get_benchmark_preset", "list_benchmarks", "parse_model_name"]
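These re-exports are the public entry point for preset lookup. A short usage sketch, assuming the lazily initialized registry defined in themis/presets/benchmarks.py below (printed values are illustrative):

# Usage sketch for the preset helpers re-exported above.
from themis.presets import get_benchmark_preset, list_benchmarks

print(list_benchmarks())                # e.g. ['aime24', 'demo', 'gsm8k', 'math500', 'mmlu-pro', 'supergpqa']
preset = get_benchmark_preset("GSM8K")  # lookup is case-insensitive
print(preset.description)               # "GSM8K dataset with grade school math word problems"
samples = preset.load_dataset(limit=5)  # delegates to the preset's dataset_loader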
themis/presets/benchmarks.py (new file)
@@ -0,0 +1,354 @@

"""Benchmark preset configurations.

This module provides pre-configured settings for popular benchmarks,
including prompts, metrics, extractors, and data loaders.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Callable, Sequence

from themis.generation.templates import PromptTemplate
from themis.interfaces import Extractor, Metric


@dataclass
class BenchmarkPreset:
    """Configuration preset for a benchmark.

    Attributes:
        name: Benchmark name
        prompt_template: Default prompt template
        metrics: List of metric instances
        extractor: Output extractor
        dataset_loader: Function to load the dataset
        metadata_fields: Fields to include in task metadata
        reference_field: Field containing the reference answer
        dataset_id_field: Field containing the sample ID
        description: Human-readable description
    """

    name: str
    prompt_template: PromptTemplate
    metrics: list[Metric]
    extractor: Extractor
    dataset_loader: Callable[[int | None], Sequence[dict[str, Any]]]
    metadata_fields: tuple[str, ...] = field(default_factory=tuple)
    reference_field: str = "answer"
    dataset_id_field: str = "id"
    description: str = ""

    def load_dataset(self, limit: int | None = None) -> Sequence[dict[str, Any]]:
        """Load the benchmark dataset.

        Args:
            limit: Maximum number of samples to load

        Returns:
            List of dataset samples
        """
        return self.dataset_loader(limit)


# Registry of benchmark presets
_BENCHMARK_REGISTRY: dict[str, BenchmarkPreset] = {}
_REGISTRY_INITIALIZED = False


def _ensure_registry_initialized() -> None:
    """Initialize benchmark registry on first use (lazy loading)."""
    global _REGISTRY_INITIALIZED
    if not _REGISTRY_INITIALIZED:
        _register_all_benchmarks()
        _REGISTRY_INITIALIZED = True


def register_benchmark(preset: BenchmarkPreset) -> None:
    """Register a benchmark preset.

    Args:
        preset: Benchmark preset configuration
    """
    _BENCHMARK_REGISTRY[preset.name.lower()] = preset


def get_benchmark_preset(name: str) -> BenchmarkPreset:
    """Get a benchmark preset by name.

    Args:
        name: Benchmark name (case-insensitive)

    Returns:
        Benchmark preset

    Raises:
        ValueError: If benchmark is not found
    """
    _ensure_registry_initialized()

    name_lower = name.lower()
    if name_lower not in _BENCHMARK_REGISTRY:
        available = ", ".join(sorted(_BENCHMARK_REGISTRY.keys()))
        raise ValueError(
            f"Unknown benchmark: {name}. "
            f"Available benchmarks: {available}"
        )
    return _BENCHMARK_REGISTRY[name_lower]


def list_benchmarks() -> list[str]:
    """List all registered benchmark names.

    Returns:
        Sorted list of benchmark names
    """
    _ensure_registry_initialized()
    return sorted(_BENCHMARK_REGISTRY.keys())


# ============================================================================
# Math Benchmarks
# ============================================================================

def _create_math500_preset() -> BenchmarkPreset:
    """Create MATH-500 benchmark preset."""
    from themis.datasets.math500 import load_math500 as load_math500_dataset
    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy

    def load_math500(limit: int | None = None) -> Sequence[dict[str, Any]]:
        samples = load_math500_dataset(source="huggingface", limit=limit)
        # Convert MathSample objects to dicts
        return [s.to_generation_example() if hasattr(s, 'to_generation_example') else dict(s) for s in samples]

    prompt_template = PromptTemplate(
        name="math500-zero-shot",
        template=(
            "Solve the following math problem step by step. "
            "Put your final answer in \\boxed{{}}.\n\n"
            "Problem: {problem}\n\n"
            "Solution:"
        ),
    )

    return BenchmarkPreset(
        name="math500",
        prompt_template=prompt_template,
        metrics=[MathVerifyAccuracy()],
        extractor=MathVerifyExtractor(),
        dataset_loader=load_math500,
        metadata_fields=("subject", "level"),
        reference_field="solution",
        dataset_id_field="unique_id",
        description="MATH-500 dataset with 500 competition math problems",
    )


def _create_gsm8k_preset() -> BenchmarkPreset:
    """Create GSM8K benchmark preset."""
    from themis.datasets.gsm8k import load_gsm8k as load_gsm8k_dataset
    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy

    def load_gsm8k(limit: int | None = None) -> Sequence[dict[str, Any]]:
        samples = load_gsm8k_dataset(source="huggingface", split="test", limit=limit)
        # Convert sample objects to dicts if needed
        return [dict(s) if not isinstance(s, dict) else s for s in samples]

    prompt_template = PromptTemplate(
        name="gsm8k-zero-shot",
        template=(
            "Solve this math problem step by step.\n\n"
            "Q: {question}\n"
            "A:"
        ),
    )

    return BenchmarkPreset(
        name="gsm8k",
        prompt_template=prompt_template,
        metrics=[MathVerifyAccuracy()],
        extractor=MathVerifyExtractor(),
        dataset_loader=load_gsm8k,
        metadata_fields=(),
        reference_field="answer",
        dataset_id_field="id",
        description="GSM8K dataset with grade school math word problems",
    )


def _create_aime24_preset() -> BenchmarkPreset:
    """Create AIME 2024 benchmark preset."""
    from themis.datasets.competition_math import load_competition_math
    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy

    def load_aime24(limit: int | None = None) -> Sequence[dict[str, Any]]:
        samples = load_competition_math(
            dataset_id="aime24",
            source="huggingface",
            split="test",
            limit=limit,
        )
        return [dict(s) if not isinstance(s, dict) else s for s in samples]

    prompt_template = PromptTemplate(
        name="aime24-zero-shot",
        template=(
            "Solve the following AIME problem. "
            "Your answer should be a number between 000 and 999.\n\n"
            "Problem: {problem}\n\n"
            "Solution:"
        ),
    )

    return BenchmarkPreset(
        name="aime24",
        prompt_template=prompt_template,
        metrics=[MathVerifyAccuracy()],
        extractor=MathVerifyExtractor(),
        dataset_loader=load_aime24,
        metadata_fields=("subject",),
        reference_field="answer",
        dataset_id_field="id",
        description="AIME 2024 competition math problems",
    )


# ============================================================================
# MCQ Benchmarks
# ============================================================================

def _create_mmlu_pro_preset() -> BenchmarkPreset:
    """Create MMLU-Pro benchmark preset."""
    from themis.datasets.mmlu_pro import load_mmlu_pro as load_mmlu_pro_dataset
    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
    from themis.evaluation.metrics.exact_match import ExactMatch

    def load_mmlu_pro(limit: int | None = None) -> Sequence[dict[str, Any]]:
        samples = load_mmlu_pro_dataset(source="huggingface", split="test", limit=limit)
        return [dict(s) if not isinstance(s, dict) else s for s in samples]

    prompt_template = PromptTemplate(
        name="mmlu-pro-zero-shot",
        template=(
            "Answer the following multiple choice question.\n\n"
            "Question: {question}\n\n"
            "Options:\n{options}\n\n"
            "Answer:"
        ),
    )

    return BenchmarkPreset(
        name="mmlu-pro",
        prompt_template=prompt_template,
        metrics=[ExactMatch()],
        extractor=IdentityExtractor(),
        dataset_loader=load_mmlu_pro,
        metadata_fields=("category",),
        reference_field="answer",
        dataset_id_field="id",
        description="MMLU-Pro professional-level multiple choice questions",
    )


def _create_supergpqa_preset() -> BenchmarkPreset:
    """Create SuperGPQA benchmark preset."""
    from themis.datasets.super_gpqa import load_super_gpqa as load_supergpqa_dataset
    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
    from themis.evaluation.metrics.exact_match import ExactMatch

    def load_supergpqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
        samples = load_supergpqa_dataset(source="huggingface", split="test", limit=limit)
        return [dict(s) if not isinstance(s, dict) else s for s in samples]

    prompt_template = PromptTemplate(
        name="supergpqa-zero-shot",
        template=(
            "Answer the following science question.\n\n"
            "Question: {question}\n\n"
            "Choices:\n{choices}\n\n"
            "Answer:"
        ),
    )

    return BenchmarkPreset(
        name="supergpqa",
        prompt_template=prompt_template,
        metrics=[ExactMatch()],
        extractor=IdentityExtractor(),
        dataset_loader=load_supergpqa,
        metadata_fields=("subject",),
        reference_field="answer",
        dataset_id_field="id",
        description="SuperGPQA graduate-level science questions",
    )


# ============================================================================
# Demo/Test Benchmarks
# ============================================================================

def _create_demo_preset() -> BenchmarkPreset:
    """Create demo benchmark preset for testing."""
    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
    from themis.evaluation.metrics.exact_match import ExactMatch

    def load_demo(limit: int | None = None) -> Sequence[dict[str, Any]]:
        samples = [
            {"id": "demo-1", "question": "What is 2 + 2?", "answer": "4"},
            {"id": "demo-2", "question": "What is the capital of France?", "answer": "Paris"},
            {"id": "demo-3", "question": "What is 10 * 5?", "answer": "50"},
        ]
        if limit is not None:
            samples = samples[:limit]
        return samples

    prompt_template = PromptTemplate(
        name="demo",
        template="Q: {question}\nA:",
    )

    return BenchmarkPreset(
        name="demo",
        prompt_template=prompt_template,
        metrics=[ExactMatch()],
        extractor=IdentityExtractor(),
        dataset_loader=load_demo,
        metadata_fields=(),
        reference_field="answer",
        dataset_id_field="id",
        description="Demo benchmark for testing",
    )


# ============================================================================
# Register all benchmarks (lazy initialization)
# ============================================================================

def _register_all_benchmarks() -> None:
    """Register all built-in benchmarks.

    This is called lazily on first use to avoid importing heavy dependencies
    (datasets, models, etc.) until actually needed.
    """
    # Math benchmarks
    register_benchmark(_create_math500_preset())
    register_benchmark(_create_gsm8k_preset())
    register_benchmark(_create_aime24_preset())

    # MCQ benchmarks
    register_benchmark(_create_mmlu_pro_preset())
    register_benchmark(_create_supergpqa_preset())

    # Demo
    register_benchmark(_create_demo_preset())


__all__ = [
    "BenchmarkPreset",
    "register_benchmark",
    "get_benchmark_preset",
    "list_benchmarks",
]
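Because register_benchmark and BenchmarkPreset are exported here, downstream code can add presets alongside the built-ins. A hedged sketch mirroring the demo preset above; the benchmark name, sample data, and loader are illustrative only:

# Sketch of registering a custom benchmark preset, following the pattern of
# the built-in demo preset; everything named "my-*" here is hypothetical.
from themis.evaluation.extractors.identity_extractor import IdentityExtractor
from themis.evaluation.metrics.exact_match import ExactMatch
from themis.generation.templates import PromptTemplate
from themis.presets.benchmarks import BenchmarkPreset, get_benchmark_preset, register_benchmark


def _load_my_samples(limit=None):
    samples = [
        {"id": "my-1", "question": "Largest planet in the solar system?", "answer": "Jupiter"},
        {"id": "my-2", "question": "What is 7 * 6?", "answer": "42"},
    ]
    return samples[:limit] if limit is not None else samples


register_benchmark(
    BenchmarkPreset(
        name="my-benchmark",
        prompt_template=PromptTemplate(name="my-benchmark", template="Q: {question}\nA:"),
        metrics=[ExactMatch()],
        extractor=IdentityExtractor(),
        dataset_loader=_load_my_samples,
        reference_field="answer",
        dataset_id_field="id",
        description="Custom preset registered from user code",
    )
)

assert get_benchmark_preset("MY-BENCHMARK").name == "my-benchmark"  # lookup is case-insensitive

Registering the preset before the first get_benchmark_preset or list_benchmarks call is safe: the lazy _register_all_benchmarks pass only adds the built-ins and does not clear entries added earlier.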