themis-eval 0.1.1-py3-none-any.whl → 0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +429 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/main.py +427 -57
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/core/entities.py +23 -3
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/pipelines/standard_pipeline.py +68 -8
- themis/experiment/cache_manager.py +8 -3
- themis/experiment/export.py +110 -2
- themis/experiment/orchestrator.py +109 -11
- themis/experiment/storage.py +1457 -110
- themis/generation/providers/litellm_provider.py +46 -0
- themis/generation/runner.py +22 -6
- themis/integrations/huggingface.py +12 -1
- themis/integrations/wandb.py +13 -1
- themis/interfaces/__init__.py +86 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis_eval-0.2.1.dist-info/METADATA +596 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
- themis_eval-0.1.1.dist-info/METADATA +0 -758
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
themis/__init__.py
CHANGED
@@ -1,14 +1,25 @@
-"""Themis experiment platform.
+"""Themis experiment platform - Dead simple LLM evaluation.
+
+The primary interface is the `evaluate()` function:
+
+    import themis
+    report = themis.evaluate("math500", model="gpt-4", limit=100)
+"""
 
 from themis import config, core, evaluation, experiment, generation, project
 from themis._version import __version__
+from themis.api import evaluate
 
 __all__ = [
+    # Main API
+    "evaluate",
+    # Submodules
     "config",
     "core",
     "evaluation",
     "experiment",
     "generation",
     "project",
+    # Version
     "__version__",
 ]
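
For context (not part of the diff), a minimal usage sketch of the new root-level export: `evaluate` is now importable directly from `themis`, and the report fields used below (`metadata`, `evaluation_report.metrics`) are the ones referenced later in this diff in themis/api.py; the "accuracy" key is taken from that module's docstring example and may vary by benchmark.

    import themis

    # `evaluate` is re-exported at the package root (see __all__ above).
    report = themis.evaluate("math500", model="gpt-4", limit=10)

    print(report.metadata.get("successful_generations"))
    print(report.evaluation_report.metrics)  # e.g. {"accuracy": ...} for math500
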
themis/_version.py
CHANGED
@@ -7,9 +7,9 @@ from importlib import metadata
 
 def _detect_version() -> str:
     try:
-        return metadata.version("themis")
+        return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return "0.
+        return "0.2.1"  # Fallback for development
 
 
 __version__ = _detect_version()
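
The rename matters because `importlib.metadata.version()` resolves the installed *distribution* name (here `themis-eval`, as published to the registry), not the import package `themis`. A quick check, assuming the 0.2.1 wheel is installed:

    from importlib import metadata

    print(metadata.version("themis-eval"))  # "0.2.1"
    # metadata.version("themis") raises PackageNotFoundError unless an
    # unrelated distribution named "themis" happens to be installed.
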
themis/api.py
ADDED
@@ -0,0 +1,429 @@
+"""Unified API for Themis - The primary interface for all evaluations.
+
+This module provides the main entry point for running evaluations:
+- Simple one-liner for benchmarks
+- Custom datasets with minimal configuration
+- Distributed execution and cloud storage support
+- Auto-configuration of prompts, metrics, and extractors
+
+Example:
+    ```python
+    import themis
+
+    # Simple benchmark evaluation
+    report = themis.evaluate("math500", model="gpt-4", limit=100)
+
+    # Custom dataset
+    report = themis.evaluate(
+        dataset=[{"id": "1", "question": "...", "answer": "..."}],
+        model="claude-3-opus",
+        prompt="Solve: {question}"
+    )
+
+    # Distributed with cloud storage
+    report = themis.evaluate(
+        "gsm8k",
+        model="gpt-4",
+        distributed=True,
+        workers=8,
+        storage="s3://my-bucket/experiments"
+    )
+    ```
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Callable, Sequence
+
+from themis.core.entities import (
+    ExperimentReport,
+    GenerationRecord,
+    ModelSpec,
+    PromptSpec,
+    SamplingConfig,
+)
+from themis.evaluation.pipeline import EvaluationPipeline
+from themis.experiment.orchestrator import ExperimentOrchestrator
+from themis.generation.plan import GenerationPlan
+from themis.generation.router import ProviderRouter
+from themis.generation.runner import GenerationRunner
+from themis.generation.templates import PromptTemplate
+from themis.providers import create_provider
+
+# Import provider modules to ensure they register themselves
+try:
+    from themis.generation import clients  # noqa: F401 - registers fake provider
+    from themis.generation.providers import (
+        litellm_provider,  # noqa: F401
+        vllm_provider,  # noqa: F401
+    )
+except ImportError:
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+def evaluate(
+    benchmark_or_dataset: str | Sequence[dict[str, Any]],
+    *,
+    model: str,
+    limit: int | None = None,
+    prompt: str | None = None,
+    metrics: list[str] | None = None,
+    temperature: float = 0.0,
+    max_tokens: int = 512,
+    num_samples: int = 1,
+    distributed: bool = False,
+    workers: int = 4,
+    storage: str | Path | None = None,
+    run_id: str | None = None,
+    resume: bool = True,
+    on_result: Callable[[GenerationRecord], None] | None = None,
+    **kwargs: Any,
+) -> ExperimentReport:
+    """Run an LLM evaluation with automatic configuration.
+
+    This is the primary API for Themis. It auto-configures prompts, metrics,
+    and extractors based on the benchmark name, or allows full customization
+    for custom datasets.
+
+    Args:
+        benchmark_or_dataset: Either a benchmark name (e.g., "math500", "gsm8k")
+            or a list of dataset samples as dictionaries. For custom datasets,
+            each dict should have: prompt/question (input), answer/reference (output),
+            and optionally id (unique identifier).
+        model: Model identifier for LiteLLM (e.g., "gpt-4", "claude-3-opus-20240229",
+            "azure/gpt-4", "ollama/llama3"). Provider is auto-detected from the name.
+        limit: Maximum number of samples to evaluate. Use for testing or when you
+            want to evaluate a subset. None means evaluate all samples.
+        prompt: Custom prompt template using Python format strings. Variables like
+            {prompt}, {question}, {context} will be replaced with dataset fields.
+            If None, uses the benchmark's default prompt template.
+        metrics: List of metric names to compute. Available: "ExactMatch", "MathVerify",
+            "BLEU", "ROUGE", "BERTScore", "METEOR", "PassAtK", "CodeBLEU",
+            "ExecutionAccuracy". If None, uses benchmark defaults.
+        temperature: Sampling temperature (0.0 = deterministic/greedy, 1.0 = standard,
+            2.0 = very random). Recommended: 0.0 for evaluation reproducibility.
+        max_tokens: Maximum tokens in model response. Typical values: 256 for short
+            answers, 512 for medium, 2048 for long explanations or code.
+        num_samples: Number of responses to generate per prompt. Use >1 for Pass@K
+            metrics, ensembling, or measuring response variance.
+        distributed: Whether to use distributed execution. Currently a placeholder
+            for future Ray integration.
+        workers: Number of parallel workers for generation. Higher = faster but may
+            hit rate limits. Recommended: 4-16 for APIs, 32+ for local models.
+        storage: Storage location for results and cache. Defaults to ".cache/experiments".
+            Can be a local path or (future) cloud storage URI.
+        run_id: Unique identifier for this run. If None, auto-generated from timestamp
+            (e.g., "run-2024-01-15-123456"). Use meaningful IDs for tracking experiments.
+        resume: Whether to resume from cached results.
+        on_result: Optional callback function called for each result.
+        **kwargs: Additional provider-specific options.
+
+    Returns:
+        ExperimentReport containing generation results, evaluation metrics,
+        and metadata.
+
+    Raises:
+        ValueError: If benchmark is unknown or configuration is invalid.
+        RuntimeError: If evaluation fails.
+
+    Example:
+        >>> report = themis.evaluate("math500", model="gpt-4", limit=10)
+        >>> print(f"Accuracy: {report.evaluation_report.metrics['accuracy']:.2%}")
+        Accuracy: 85.00%
+    """
+    logger.info("=" * 60)
+    logger.info("Starting Themis evaluation")
+    logger.info(f"Model: {model}")
+    logger.info(f"Workers: {workers}")
+    logger.info(f"Temperature: {temperature}, Max tokens: {max_tokens}")
+    if "api_base" in kwargs:
+        logger.info(f"Custom API base: {kwargs['api_base']}")
+        if "api_key" in kwargs:
+            logger.info("API key: <provided>")
+        else:
+            logger.warning("⚠️ No api_key provided - may fail for custom API endpoints")
+    logger.info("=" * 60)
+
+    # Import presets system (lazy import to avoid circular dependencies)
+    from themis.presets import get_benchmark_preset, parse_model_name
+
+    # Determine if we're using a benchmark or custom dataset
+    is_benchmark = isinstance(benchmark_or_dataset, str)
+
+    if is_benchmark:
+        benchmark_name = benchmark_or_dataset
+        logger.info(f"Loading benchmark: {benchmark_name}")
+
+        # Get preset configuration
+        try:
+            preset = get_benchmark_preset(benchmark_name)
+        except Exception as e:
+            logger.error(f"❌ Failed to get benchmark preset '{benchmark_name}': {e}")
+            raise
+
+        # Load dataset using preset loader
+        logger.info(f"Loading dataset (limit={limit})...")
+        try:
+            dataset = preset.load_dataset(limit=limit)
+            logger.info(f"✅ Loaded {len(dataset)} samples from {benchmark_name}")
+        except Exception as e:
+            logger.error(f"❌ Failed to load dataset: {e}")
+            raise
+
+        # Use preset prompt if not overridden
+        if prompt is None:
+            prompt_template = preset.prompt_template
+        else:
+            prompt_template = PromptTemplate(name="custom", template=prompt)
+
+        # Use preset metrics if not overridden
+        if metrics is None:
+            metrics_list = preset.metrics
+        else:
+            metrics_list = _resolve_metrics(metrics)
+
+        # Use preset extractor
+        extractor = preset.extractor
+
+        # Use preset metadata fields
+        metadata_fields = preset.metadata_fields
+        reference_field = preset.reference_field
+        dataset_id_field = preset.dataset_id_field
+    else:
+        # Custom dataset
+        logger.info("Using custom dataset")
+        dataset = list(benchmark_or_dataset)
+        logger.info(f"Custom dataset has {len(dataset)} samples")
+
+        # Limit dataset if requested
+        if limit is not None:
+            dataset = dataset[:limit]
+            logger.info(f"Limited to {len(dataset)} samples")
+
+        # Use provided prompt or default
+        if prompt is None:
+            raise ValueError(
+                "Custom datasets require a prompt template. "
+                "Example: prompt='Solve: {question}'"
+            )
+        prompt_template = PromptTemplate(name="custom", template=prompt)
+
+        # Use provided metrics or defaults
+        if metrics is None:
+            metrics_list = _resolve_metrics(["exact_match"])
+        else:
+            metrics_list = _resolve_metrics(metrics)
+
+        # Use identity extractor by default
+        from themis.evaluation.extractors import IdentityExtractor
+        extractor = IdentityExtractor()
+
+        # Use standard field names
+        metadata_fields = ()
+        reference_field = "answer"
+        dataset_id_field = "id"
+
+    # Parse model name to get provider and options
+    logger.info(f"Parsing model configuration...")
+    try:
+        provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+        logger.info(f"Provider: {provider_name}")
+        logger.info(f"Model ID: {model_id}")
+        logger.debug(f"Provider options: {provider_options}")
+    except Exception as e:
+        logger.error(f"❌ Failed to parse model name '{model}': {e}")
+        raise
+
+    # Create model spec
+    model_spec = ModelSpec(
+        identifier=model_id,
+        provider=provider_name,
+    )
+
+    # Create sampling config
+    sampling_config = SamplingConfig(
+        temperature=temperature,
+        top_p=kwargs.get("top_p", 0.95),
+        max_tokens=max_tokens,
+    )
+
+    # Create generation plan
+    plan = GenerationPlan(
+        templates=[prompt_template],
+        models=[model_spec],
+        sampling_parameters=[sampling_config],
+        dataset_id_field=dataset_id_field,
+        reference_field=reference_field,
+        metadata_fields=metadata_fields,
+    )
+
+    # Create provider and router
+    logger.info(f"Creating provider '{provider_name}'...")
+    try:
+        provider = create_provider(provider_name, **provider_options)
+        logger.info(f"✅ Provider created successfully")
+    except KeyError as e:
+        logger.error(f"❌ Provider '{provider_name}' not registered. Available providers: fake, litellm, openai, anthropic, azure, bedrock, gemini, cohere, vllm")
+        logger.error(f" This usually means the provider module wasn't imported.")
+        raise
+    except Exception as e:
+        logger.error(f"❌ Failed to create provider: {e}")
+        raise
+
+    router = ProviderRouter({model_id: provider})
+    logger.debug(f"Router configured for model: {model_id}")
+
+    # Create runner
+    runner = GenerationRunner(provider=router, max_parallel=workers)
+    logger.info(f"Runner configured with {workers} parallel workers")
+
+    # Create evaluation pipeline
+    pipeline = EvaluationPipeline(
+        extractor=extractor,
+        metrics=metrics_list,
+    )
+    logger.info(f"Evaluation metrics: {[m.name for m in metrics_list]}")
+
+    # Determine storage location
+    if storage is None:
+        storage_dir = Path.home() / ".themis" / "runs"
+    else:
+        storage_dir = Path(storage) if not str(storage).startswith(("s3://", "gs://", "azure://")) else storage
+
+    # Generate run ID if not provided
+    if run_id is None:
+        run_id = f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+    logger.info(f"Run ID: {run_id}")
+    logger.info(f"Storage: {storage_dir}")
+    logger.info(f"Resume: {resume}")
+
+    # Create storage backend
+    if isinstance(storage_dir, Path):
+        from themis.experiment.storage import ExperimentStorage
+        storage_backend = ExperimentStorage(storage_dir)
+        logger.debug(f"Storage backend created at {storage_dir}")
+    else:
+        # Cloud storage (to be implemented in Phase 3)
+        raise NotImplementedError(
+            f"Cloud storage not yet implemented. Use local path for now. "
+            f"Requested: {storage_dir}"
+        )
+
+    # Create orchestrator
+    orchestrator = ExperimentOrchestrator(
+        generation_plan=plan,
+        generation_runner=runner,
+        evaluation_pipeline=pipeline,
+        storage=storage_backend,
+    )
+
+    # Run evaluation
+    if distributed:
+        # Distributed execution (to be implemented in Phase 3)
+        raise NotImplementedError(
+            "Distributed execution not yet implemented. "
+            "Set distributed=False to use local execution."
+        )
+
+    # Run locally
+    logger.info("=" * 60)
+    logger.info("🚀 Starting experiment execution...")
+    logger.info("=" * 60)
+
+    try:
+        report = orchestrator.run(
+            dataset=dataset,
+            max_samples=limit,
+            run_id=run_id,
+            resume=resume,
+            on_result=on_result,
+        )
+
+        logger.info("=" * 60)
+        logger.info("✅ Evaluation completed successfully!")
+        logger.info(f" Total samples: {len(report.generation_results)}")
+        logger.info(f" Successful: {report.metadata.get('successful_generations', 0)}")
+        logger.info(f" Failed: {report.metadata.get('failed_generations', 0)}")
+        if report.evaluation_report.metrics:
+            logger.info(f" Metrics: {list(report.evaluation_report.metrics.keys())}")
+        logger.info("=" * 60)
+
+        return report
+    except Exception as e:
+        logger.error("=" * 60)
+        logger.error(f"❌ Evaluation failed: {e}")
+        logger.error("=" * 60)
+        raise
+
+
+def _resolve_metrics(metric_names: list[str]) -> list:
+    """Resolve metric names to metric instances.
+
+    Args:
+        metric_names: List of metric names (e.g., ["exact_match", "bleu"])
+
+    Returns:
+        List of metric instances
+
+    Raises:
+        ValueError: If a metric name is unknown
+    """
+    from themis.evaluation.metrics.exact_match import ExactMatch
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+    from themis.evaluation.metrics.response_length import ResponseLength
+
+    # NLP metrics (Phase 2)
+    try:
+        from themis.evaluation.metrics.nlp import BLEU, ROUGE, BERTScore, METEOR, ROUGEVariant
+        nlp_available = True
+    except ImportError:
+        nlp_available = False
+
+    # Metric registry
+    METRICS_REGISTRY = {
+        # Core metrics
+        "exact_match": ExactMatch,
+        "math_verify": MathVerifyAccuracy,
+        "response_length": ResponseLength,
+    }
+
+    # Add NLP metrics if available
+    if nlp_available:
+        METRICS_REGISTRY.update({
+            "bleu": BLEU,
+            "rouge1": lambda: ROUGE(variant=ROUGEVariant.ROUGE_1),
+            "rouge2": lambda: ROUGE(variant=ROUGEVariant.ROUGE_2),
+            "rougeL": lambda: ROUGE(variant=ROUGEVariant.ROUGE_L),
+            "bertscore": BERTScore,
+            "meteor": METEOR,
+        })
+
+    # Code metrics (to be added later in Phase 2)
+    # "pass_at_k": PassAtK,
+    # "codebleu": CodeBLEU,
+
+    metrics = []
+    for name in metric_names:
+        if name not in METRICS_REGISTRY:
+            available = ", ".join(sorted(METRICS_REGISTRY.keys()))
+            raise ValueError(
+                f"Unknown metric: {name}. "
+                f"Available metrics: {available}"
+            )
+
+        metric_cls = METRICS_REGISTRY[name]
+        # Handle both class and lambda factory
+        if callable(metric_cls) and not isinstance(metric_cls, type):
+            metrics.append(metric_cls())
+        else:
+            metrics.append(metric_cls())
+
+    return metrics
+
+
+__all__ = ["evaluate"]
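
A minimal sketch of the custom-dataset path shown above (not taken from the package): the prompt template is mandatory for custom data, the metric name must appear in the registry built by `_resolve_metrics`, and storage must be a local path since cloud URIs currently raise NotImplementedError. Model naming and credentials (here an OpenAI-style model with the key read from the environment) are assumptions.

    import themis

    dataset = [
        {"id": "1", "question": "2 + 2 = ?", "answer": "4"},
        {"id": "2", "question": "3 * 5 = ?", "answer": "15"},
    ]

    report = themis.evaluate(
        dataset,
        model="gpt-4",                     # provider auto-detected from the name
        prompt="Solve: {question}",        # required for custom datasets
        metrics=["exact_match"],           # resolved via _resolve_metrics()
        storage="./results",               # local path; cloud URIs not yet supported
        on_result=lambda record: print(record),  # per-record callback
    )
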
themis/backends/__init__.py
ADDED
@@ -0,0 +1,17 @@
+"""Backend interfaces for extending Themis.
+
+This module provides abstract interfaces for implementing custom backends:
+- StorageBackend: Custom storage implementations (cloud, databases, etc.)
+- ExecutionBackend: Custom execution strategies (distributed, async, etc.)
+
+These interfaces allow advanced users to extend Themis without modifying core code.
+"""
+
+from themis.backends.execution import ExecutionBackend, LocalExecutionBackend
+from themis.backends.storage import StorageBackend
+
+__all__ = [
+    "StorageBackend",
+    "ExecutionBackend",
+    "LocalExecutionBackend",
+]
themis/backends/execution.py
ADDED
@@ -0,0 +1,197 @@
+"""Execution backend interface for custom execution strategies.
+
+This module defines the abstract interface for execution backends, allowing
+users to implement custom execution strategies (distributed, GPU-accelerated,
+async, etc.) without modifying Themis core code.
+
+Example implementations:
+- RayExecutionBackend: Distributed execution with Ray
+- DaskExecutionBackend: Distributed execution with Dask
+- AsyncExecutionBackend: Async/await based execution
+- GPUBatchExecutionBackend: Batched GPU execution for vLLM
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Callable, Iterable, Iterator, TypeVar
+
+T = TypeVar("T")
+R = TypeVar("R")
+
+
+class ExecutionBackend(ABC):
+    """Abstract interface for execution backends.
+
+    Implement this interface to create custom execution strategies.
+
+    Example:
+        >>> class RayExecutionBackend(ExecutionBackend):
+        ...     def __init__(self, num_cpus: int = 4):
+        ...         import ray
+        ...         if not ray.is_initialized():
+        ...             ray.init(num_cpus=num_cpus)
+        ...
+        ...     def map(self, func, items, max_workers=None):
+        ...         import ray
+        ...         # Convert to Ray remote function
+        ...         remote_func = ray.remote(func)
+        ...         # Submit all tasks
+        ...         futures = [remote_func.remote(item) for item in items]
+        ...         # Get results as they complete
+        ...         for future in futures:
+        ...             yield ray.get(future)
+    """
+
+    @abstractmethod
+    def map(
+        self,
+        func: Callable[[T], R],
+        items: Iterable[T],
+        *,
+        max_workers: int | None = None,
+        timeout: float | None = None,
+        **kwargs: Any,
+    ) -> Iterator[R]:
+        """Execute function over items in parallel.
+
+        Args:
+            func: Function to apply to each item
+            items: Iterable of items to process
+            max_workers: Maximum number of parallel workers
+            timeout: Timeout for each execution (seconds)
+            **kwargs: Additional backend-specific options
+
+        Yields:
+            Results as they complete
+
+        Note:
+            Results may be yielded in any order (not necessarily input order).
+            Implementation should handle errors gracefully.
+        """
+        pass
+
+    @abstractmethod
+    def shutdown(self) -> None:
+        """Shutdown the execution backend and release resources.
+
+        Called when execution is complete. Should cleanup workers,
+        connections, and other resources.
+        """
+        pass
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit."""
+        self.shutdown()
+        return False
+
+
+class LocalExecutionBackend(ExecutionBackend):
+    """Local multi-threaded execution using ThreadPoolExecutor.
+
+    This is the default execution backend, using Python's built-in
+    ThreadPoolExecutor for parallel execution.
+
+    Attributes:
+        executor: ThreadPoolExecutor instance
+    """
+
+    def __init__(self, max_workers: int = 4):
+        """Initialize with number of workers.
+
+        Args:
+            max_workers: Maximum number of worker threads
+        """
+        self._max_workers = max_workers
+        self._executor: ThreadPoolExecutor | None = None
+
+    def map(
+        self,
+        func: Callable[[T], R],
+        items: Iterable[T],
+        *,
+        max_workers: int | None = None,
+        timeout: float | None = None,
+        **kwargs: Any,
+    ) -> Iterator[R]:
+        """Execute function over items using ThreadPoolExecutor.
+
+        Args:
+            func: Function to apply to each item
+            items: Iterable of items to process
+            max_workers: Override default max_workers
+            timeout: Timeout for each task (seconds)
+            **kwargs: Ignored (for interface compatibility)
+
+        Yields:
+            Results as they complete
+        """
+        workers = max_workers or self._max_workers
+
+        # Create executor if not exists
+        if self._executor is None:
+            self._executor = ThreadPoolExecutor(max_workers=workers)
+
+        # Submit all tasks
+        items_list = list(items)  # Materialize iterator
+        futures = [self._executor.submit(func, item) for item in items_list]
+
+        # Yield results as they complete
+        for future in as_completed(futures, timeout=timeout):
+            result = future.result()
+            yield result
+
+    def shutdown(self) -> None:
+        """Shutdown the executor."""
+        if self._executor is not None:
+            self._executor.shutdown(wait=True)
+            self._executor = None
+
+
+class SequentialExecutionBackend(ExecutionBackend):
+    """Sequential execution backend for debugging.
+
+    Executes tasks one at a time without parallelism.
+    Useful for debugging, testing, or when parallelism causes issues.
+    """
+
+    def map(
+        self,
+        func: Callable[[T], R],
+        items: Iterable[T],
+        *,
+        max_workers: int | None = None,
+        timeout: float | None = None,
+        **kwargs: Any,
+    ) -> Iterator[R]:
+        """Execute function sequentially.
+
+        Args:
+            func: Function to apply to each item
+            items: Iterable of items to process
+            max_workers: Ignored (no parallelism)
+            timeout: Timeout for each task (seconds)
+            **kwargs: Ignored
+
+        Yields:
+            Results in input order
+        """
+        for item in items:
+            result = func(item)
+            yield result
+
+    def shutdown(self) -> None:
+        """No-op for sequential execution."""
+        pass
+
+
+__all__ = [
+    "ExecutionBackend",
+    "LocalExecutionBackend",
+    "SequentialExecutionBackend",
+]
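
As a usage sketch of the interface above (not part of the diff): both concrete backends can be driven through the context-manager protocol defined on the base class, and their `map()` returns a generator. The `square` helper is purely illustrative; note that only `LocalExecutionBackend` is re-exported from `themis.backends`, so `SequentialExecutionBackend` is imported from the module directly.

    from themis.backends.execution import (
        LocalExecutionBackend,
        SequentialExecutionBackend,
    )

    def square(x: int) -> int:
        return x * x

    # Thread-pool execution; results may arrive out of order (see map() docstring),
    # so sort them if order matters. __exit__ calls shutdown() automatically.
    with LocalExecutionBackend(max_workers=4) as backend:
        results = sorted(backend.map(square, range(10)))

    # Sequential execution preserves input order - useful for debugging.
    with SequentialExecutionBackend() as backend:
        ordered = list(backend.map(square, range(10)))
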