themis-eval 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +48 -6
  27. themis/experiment/storage.py +1313 -110
  28. themis/integrations/huggingface.py +12 -1
  29. themis/integrations/wandb.py +13 -1
  30. themis/interfaces/__init__.py +86 -0
  31. themis/presets/__init__.py +10 -0
  32. themis/presets/benchmarks.py +354 -0
  33. themis/presets/models.py +190 -0
  34. themis/server/__init__.py +28 -0
  35. themis/server/app.py +337 -0
  36. themis_eval-0.2.0.dist-info/METADATA +596 -0
  37. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/RECORD +40 -17
  38. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  39. themis_eval-0.1.1.dist-info/METADATA +0 -758
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  41. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/__init__.py CHANGED
@@ -1,14 +1,25 @@
- """Themis experiment platform."""
+ """Themis experiment platform - Dead simple LLM evaluation.
+
+ The primary interface is the `evaluate()` function:
+
+     import themis
+     report = themis.evaluate("math500", model="gpt-4", limit=100)
+ """
 
  from themis import config, core, evaluation, experiment, generation, project
  from themis._version import __version__
+ from themis.api import evaluate
 
  __all__ = [
+     # Main API
+     "evaluate",
+     # Submodules
      "config",
      "core",
      "evaluation",
      "experiment",
      "generation",
      "project",
+     # Version
      "__version__",
  ]
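A minimal sketch of the new top-level entry point this change exposes; the benchmark name and model string simply mirror the docstring above, and actual availability depends on the installed presets and providers.

```python
# Hedged usage sketch: `evaluate` is now re-exported at package level.
import themis

report = themis.evaluate("math500", model="gpt-4", limit=100)
print(themis.__version__)  # version string resolved in themis/_version.py
```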
themis/_version.py CHANGED
@@ -7,9 +7,9 @@ from importlib import metadata
 
  def _detect_version() -> str:
      try:
-         return metadata.version("themis")
+         return metadata.version("themis-eval")
      except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-         return "0.0.0"
+         return "0.2.0"  # Fallback for development
 
 
  __version__ = _detect_version()
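The lookup key changes because `importlib.metadata` resolves *distribution* names (the name published to PyPI, here `themis-eval`), not import package names (`themis`). A small sketch, with the printed value assumed for an installed 0.2.0 wheel:

```python
# Assumes themis-eval 0.2.0 is installed; otherwise the fallback branch runs.
from importlib import metadata

try:
    print(metadata.version("themis-eval"))  # e.g. "0.2.0"
except metadata.PackageNotFoundError:
    print("source checkout without installed dist-info; fallback version is used")
```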
themis/api.py ADDED
@@ -0,0 +1,343 @@
+ """Unified API for Themis - The primary interface for all evaluations.
+
+ This module provides the main entry point for running evaluations:
+ - Simple one-liner for benchmarks
+ - Custom datasets with minimal configuration
+ - Distributed execution and cloud storage support
+ - Auto-configuration of prompts, metrics, and extractors
+
+ Example:
+     ```python
+     import themis
+
+     # Simple benchmark evaluation
+     report = themis.evaluate("math500", model="gpt-4", limit=100)
+
+     # Custom dataset
+     report = themis.evaluate(
+         dataset=[{"id": "1", "question": "...", "answer": "..."}],
+         model="claude-3-opus",
+         prompt="Solve: {question}"
+     )
+
+     # Distributed with cloud storage
+     report = themis.evaluate(
+         "gsm8k",
+         model="gpt-4",
+         distributed=True,
+         workers=8,
+         storage="s3://my-bucket/experiments"
+     )
+     ```
+ """
+
+ from __future__ import annotations
+
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Callable, Sequence
+
+ from themis.core.entities import (
+     ExperimentReport,
+     GenerationRecord,
+     ModelSpec,
+     PromptSpec,
+     SamplingConfig,
+ )
+ from themis.evaluation.pipeline import EvaluationPipeline
+ from themis.experiment.orchestrator import ExperimentOrchestrator
+ from themis.generation.plan import GenerationPlan
+ from themis.generation.router import ProviderRouter
+ from themis.generation.runner import GenerationRunner
+ from themis.generation.templates import PromptTemplate
+ from themis.providers import create_provider
+
+
+ def evaluate(
+     benchmark_or_dataset: str | Sequence[dict[str, Any]],
+     *,
+     model: str,
+     limit: int | None = None,
+     prompt: str | None = None,
+     metrics: list[str] | None = None,
+     temperature: float = 0.0,
+     max_tokens: int = 512,
+     num_samples: int = 1,
+     distributed: bool = False,
+     workers: int = 4,
+     storage: str | Path | None = None,
+     run_id: str | None = None,
+     resume: bool = True,
+     on_result: Callable[[GenerationRecord], None] | None = None,
+     **kwargs: Any,
+ ) -> ExperimentReport:
+     """Run an LLM evaluation with automatic configuration.
+
+     This is the primary API for Themis. It auto-configures prompts, metrics,
+     and extractors based on the benchmark name, or allows full customization
+     for custom datasets.
+
+     Args:
+         benchmark_or_dataset: Either a benchmark name (e.g., "math500", "gsm8k")
+             or a list of dataset samples as dictionaries. For custom datasets,
+             each dict should have: prompt/question (input), answer/reference (output),
+             and optionally id (unique identifier).
+         model: Model identifier for LiteLLM (e.g., "gpt-4", "claude-3-opus-20240229",
+             "azure/gpt-4", "ollama/llama3"). Provider is auto-detected from the name.
+         limit: Maximum number of samples to evaluate. Use for testing or when you
+             want to evaluate a subset. None means evaluate all samples.
+         prompt: Custom prompt template using Python format strings. Variables like
+             {prompt}, {question}, {context} will be replaced with dataset fields.
+             If None, uses the benchmark's default prompt template.
+         metrics: List of metric names to compute. Available: "ExactMatch", "MathVerify",
+             "BLEU", "ROUGE", "BERTScore", "METEOR", "PassAtK", "CodeBLEU",
+             "ExecutionAccuracy". If None, uses benchmark defaults.
+         temperature: Sampling temperature (0.0 = deterministic/greedy, 1.0 = standard,
+             2.0 = very random). Recommended: 0.0 for evaluation reproducibility.
+         max_tokens: Maximum tokens in model response. Typical values: 256 for short
+             answers, 512 for medium, 2048 for long explanations or code.
+         num_samples: Number of responses to generate per prompt. Use >1 for Pass@K
+             metrics, ensembling, or measuring response variance.
+         distributed: Whether to use distributed execution. Currently a placeholder
+             for future Ray integration.
+         workers: Number of parallel workers for generation. Higher = faster but may
+             hit rate limits. Recommended: 4-16 for APIs, 32+ for local models.
+         storage: Storage location for results and cache. Defaults to ".cache/experiments".
+             Can be a local path or (future) cloud storage URI.
+         run_id: Unique identifier for this run. If None, auto-generated from timestamp
+             (e.g., "run-2024-01-15-123456"). Use meaningful IDs for tracking experiments.
+         resume: Whether to resume from cached results.
+         on_result: Optional callback function called for each result.
+         **kwargs: Additional provider-specific options.
+
+     Returns:
+         ExperimentReport containing generation results, evaluation metrics,
+         and metadata.
+
+     Raises:
+         ValueError: If benchmark is unknown or configuration is invalid.
+         RuntimeError: If evaluation fails.
+
+     Example:
+         >>> report = themis.evaluate("math500", model="gpt-4", limit=10)
+         >>> print(f"Accuracy: {report.evaluation_report.metrics['accuracy']:.2%}")
+         Accuracy: 85.00%
+     """
+     # Import presets system (lazy import to avoid circular dependencies)
+     from themis.presets import get_benchmark_preset, parse_model_name
+
+     # Determine if we're using a benchmark or custom dataset
+     is_benchmark = isinstance(benchmark_or_dataset, str)
+
+     if is_benchmark:
+         benchmark_name = benchmark_or_dataset
+         # Get preset configuration
+         preset = get_benchmark_preset(benchmark_name)
+
+         # Load dataset using preset loader
+         dataset = preset.load_dataset(limit=limit)
+
+         # Use preset prompt if not overridden
+         if prompt is None:
+             prompt_template = preset.prompt_template
+         else:
+             prompt_template = PromptTemplate(name="custom", template=prompt)
+
+         # Use preset metrics if not overridden
+         if metrics is None:
+             metrics_list = preset.metrics
+         else:
+             metrics_list = _resolve_metrics(metrics)
+
+         # Use preset extractor
+         extractor = preset.extractor
+
+         # Use preset metadata fields
+         metadata_fields = preset.metadata_fields
+         reference_field = preset.reference_field
+         dataset_id_field = preset.dataset_id_field
+     else:
+         # Custom dataset
+         dataset = list(benchmark_or_dataset)
+
+         # Limit dataset if requested
+         if limit is not None:
+             dataset = dataset[:limit]
+
+         # Use provided prompt or default
+         if prompt is None:
+             raise ValueError(
+                 "Custom datasets require a prompt template. "
+                 "Example: prompt='Solve: {question}'"
+             )
+         prompt_template = PromptTemplate(name="custom", template=prompt)
+
+         # Use provided metrics or defaults
+         if metrics is None:
+             metrics_list = _resolve_metrics(["exact_match"])
+         else:
+             metrics_list = _resolve_metrics(metrics)
+
+         # Use identity extractor by default
+         from themis.evaluation.extractors import IdentityExtractor
+         extractor = IdentityExtractor()
+
+         # Use standard field names
+         metadata_fields = ()
+         reference_field = "answer"
+         dataset_id_field = "id"
+
+     # Parse model name to get provider and options
+     provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+
+     # Create model spec
+     model_spec = ModelSpec(
+         identifier=model_id,
+         provider=provider_name,
+     )
+
+     # Create sampling config
+     sampling_config = SamplingConfig(
+         temperature=temperature,
+         top_p=kwargs.get("top_p", 0.95),
+         max_tokens=max_tokens,
+     )
+
+     # Create generation plan
+     plan = GenerationPlan(
+         templates=[prompt_template],
+         models=[model_spec],
+         sampling_parameters=[sampling_config],
+         dataset_id_field=dataset_id_field,
+         reference_field=reference_field,
+         metadata_fields=metadata_fields,
+     )
+
+     # Create provider and router
+     provider = create_provider(provider_name, **provider_options)
+     router = ProviderRouter({model_id: provider})
+
+     # Create runner
+     runner = GenerationRunner(provider=router)
+
+     # Create evaluation pipeline
+     pipeline = EvaluationPipeline(
+         extractor=extractor,
+         metrics=metrics_list,
+     )
+
+     # Determine storage location
+     if storage is None:
+         storage_dir = Path.home() / ".themis" / "runs"
+     else:
+         storage_dir = Path(storage) if not str(storage).startswith(("s3://", "gs://", "azure://")) else storage
+
+     # Generate run ID if not provided
+     if run_id is None:
+         run_id = f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+
+     # Create storage backend
+     if isinstance(storage_dir, Path):
+         from themis.experiment.storage import ExperimentStorage
+         storage_backend = ExperimentStorage(storage_dir)
+     else:
+         # Cloud storage (to be implemented in Phase 3)
+         raise NotImplementedError(
+             f"Cloud storage not yet implemented. Use local path for now. "
+             f"Requested: {storage_dir}"
+         )
+
+     # Create orchestrator
+     orchestrator = ExperimentOrchestrator(
+         generation_plan=plan,
+         generation_runner=runner,
+         evaluation_pipeline=pipeline,
+         storage=storage_backend,
+     )
+
+     # Run evaluation
+     if distributed:
+         # Distributed execution (to be implemented in Phase 3)
+         raise NotImplementedError(
+             "Distributed execution not yet implemented. "
+             "Set distributed=False to use local execution."
+         )
+
+     # Run locally
+     report = orchestrator.run(
+         dataset=dataset,
+         max_samples=limit,
+         run_id=run_id,
+         resume=resume,
+         on_result=on_result,
+     )
+
+     return report
+
+
+ def _resolve_metrics(metric_names: list[str]) -> list:
+     """Resolve metric names to metric instances.
+
+     Args:
+         metric_names: List of metric names (e.g., ["exact_match", "bleu"])
+
+     Returns:
+         List of metric instances
+
+     Raises:
+         ValueError: If a metric name is unknown
+     """
+     from themis.evaluation.metrics.exact_match import ExactMatch
+     from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+     from themis.evaluation.metrics.response_length import ResponseLength
+
+     # NLP metrics (Phase 2)
+     try:
+         from themis.evaluation.metrics.nlp import BLEU, ROUGE, BERTScore, METEOR, ROUGEVariant
+         nlp_available = True
+     except ImportError:
+         nlp_available = False
+
+     # Metric registry
+     METRICS_REGISTRY = {
+         # Core metrics
+         "exact_match": ExactMatch,
+         "math_verify": MathVerifyAccuracy,
+         "response_length": ResponseLength,
+     }
+
+     # Add NLP metrics if available
+     if nlp_available:
+         METRICS_REGISTRY.update({
+             "bleu": BLEU,
+             "rouge1": lambda: ROUGE(variant=ROUGEVariant.ROUGE_1),
+             "rouge2": lambda: ROUGE(variant=ROUGEVariant.ROUGE_2),
+             "rougeL": lambda: ROUGE(variant=ROUGEVariant.ROUGE_L),
+             "bertscore": BERTScore,
+             "meteor": METEOR,
+         })
+
+     # Code metrics (to be added later in Phase 2)
+     # "pass_at_k": PassAtK,
+     # "codebleu": CodeBLEU,
+
+     metrics = []
+     for name in metric_names:
+         if name not in METRICS_REGISTRY:
+             available = ", ".join(sorted(METRICS_REGISTRY.keys()))
+             raise ValueError(
+                 f"Unknown metric: {name}. "
+                 f"Available metrics: {available}"
+             )
+
+         metric_cls = METRICS_REGISTRY[name]
+         # Handle both class and lambda factory
+         if callable(metric_cls) and not isinstance(metric_cls, type):
+             metrics.append(metric_cls())
+         else:
+             metrics.append(metric_cls())
+
+     return metrics
+
+
+ __all__ = ["evaluate"]
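A minimal custom-dataset sketch against the signature above. Note that the first parameter is `benchmark_or_dataset` and the only positional argument, so the sample list is passed positionally (the module docstring's `dataset=` keyword does not match the signature); metric names use the registry keys from `_resolve_metrics` ("exact_match"), not the class-style names listed in the Args section. The model string and storage path are placeholders.

```python
# Hedged sketch: custom dataset, explicit prompt and metric, local storage.
import themis

samples = [
    {"id": "1", "question": "2 + 2 = ?", "answer": "4"},
    {"id": "2", "question": "3 * 3 = ?", "answer": "9"},
]

report = themis.evaluate(
    samples,                      # passed positionally as benchmark_or_dataset
    model="gpt-4",
    prompt="Solve: {question}",   # custom datasets require a prompt template
    metrics=["exact_match"],      # registry key, resolved by _resolve_metrics
    storage="./.themis-runs",     # local path; cloud URIs raise NotImplementedError
    run_id="smoke-test",
)
```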
themis/backends/__init__.py ADDED
@@ -0,0 +1,17 @@
+ """Backend interfaces for extending Themis.
+
+ This module provides abstract interfaces for implementing custom backends:
+ - StorageBackend: Custom storage implementations (cloud, databases, etc.)
+ - ExecutionBackend: Custom execution strategies (distributed, async, etc.)
+
+ These interfaces allow advanced users to extend Themis without modifying core code.
+ """
+
+ from themis.backends.execution import ExecutionBackend, LocalExecutionBackend
+ from themis.backends.storage import StorageBackend
+
+ __all__ = [
+     "StorageBackend",
+     "ExecutionBackend",
+     "LocalExecutionBackend",
+ ]
themis/backends/execution.py ADDED
@@ -0,0 +1,197 @@
+ """Execution backend interface for custom execution strategies.
+
+ This module defines the abstract interface for execution backends, allowing
+ users to implement custom execution strategies (distributed, GPU-accelerated,
+ async, etc.) without modifying Themis core code.
+
+ Example implementations:
+ - RayExecutionBackend: Distributed execution with Ray
+ - DaskExecutionBackend: Distributed execution with Dask
+ - AsyncExecutionBackend: Async/await based execution
+ - GPUBatchExecutionBackend: Batched GPU execution for vLLM
+ """
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Any, Callable, Iterable, Iterator, TypeVar
+
+ T = TypeVar("T")
+ R = TypeVar("R")
+
+
+ class ExecutionBackend(ABC):
+     """Abstract interface for execution backends.
+
+     Implement this interface to create custom execution strategies.
+
+     Example:
+         >>> class RayExecutionBackend(ExecutionBackend):
+         ...     def __init__(self, num_cpus: int = 4):
+         ...         import ray
+         ...         if not ray.is_initialized():
+         ...             ray.init(num_cpus=num_cpus)
+         ...
+         ...     def map(self, func, items, max_workers=None):
+         ...         import ray
+         ...         # Convert to Ray remote function
+         ...         remote_func = ray.remote(func)
+         ...         # Submit all tasks
+         ...         futures = [remote_func.remote(item) for item in items]
+         ...         # Get results as they complete
+         ...         for future in futures:
+         ...             yield ray.get(future)
+     """
+
+     @abstractmethod
+     def map(
+         self,
+         func: Callable[[T], R],
+         items: Iterable[T],
+         *,
+         max_workers: int | None = None,
+         timeout: float | None = None,
+         **kwargs: Any,
+     ) -> Iterator[R]:
+         """Execute function over items in parallel.
+
+         Args:
+             func: Function to apply to each item
+             items: Iterable of items to process
+             max_workers: Maximum number of parallel workers
+             timeout: Timeout for each execution (seconds)
+             **kwargs: Additional backend-specific options
+
+         Yields:
+             Results as they complete
+
+         Note:
+             Results may be yielded in any order (not necessarily input order).
+             Implementation should handle errors gracefully.
+         """
+         pass
+
+     @abstractmethod
+     def shutdown(self) -> None:
+         """Shutdown the execution backend and release resources.
+
+         Called when execution is complete. Should cleanup workers,
+         connections, and other resources.
+         """
+         pass
+
+     def __enter__(self):
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit."""
+         self.shutdown()
+         return False
+
+
+ class LocalExecutionBackend(ExecutionBackend):
+     """Local multi-threaded execution using ThreadPoolExecutor.
+
+     This is the default execution backend, using Python's built-in
+     ThreadPoolExecutor for parallel execution.
+
+     Attributes:
+         executor: ThreadPoolExecutor instance
+     """
+
+     def __init__(self, max_workers: int = 4):
+         """Initialize with number of workers.
+
+         Args:
+             max_workers: Maximum number of worker threads
+         """
+         self._max_workers = max_workers
+         self._executor: ThreadPoolExecutor | None = None
+
+     def map(
+         self,
+         func: Callable[[T], R],
+         items: Iterable[T],
+         *,
+         max_workers: int | None = None,
+         timeout: float | None = None,
+         **kwargs: Any,
+     ) -> Iterator[R]:
+         """Execute function over items using ThreadPoolExecutor.
+
+         Args:
+             func: Function to apply to each item
+             items: Iterable of items to process
+             max_workers: Override default max_workers
+             timeout: Timeout for each task (seconds)
+             **kwargs: Ignored (for interface compatibility)
+
+         Yields:
+             Results as they complete
+         """
+         workers = max_workers or self._max_workers
+
+         # Create executor if not exists
+         if self._executor is None:
+             self._executor = ThreadPoolExecutor(max_workers=workers)
+
+         # Submit all tasks
+         items_list = list(items)  # Materialize iterator
+         futures = [self._executor.submit(func, item) for item in items_list]
+
+         # Yield results as they complete
+         for future in as_completed(futures, timeout=timeout):
+             result = future.result()
+             yield result
+
+     def shutdown(self) -> None:
+         """Shutdown the executor."""
+         if self._executor is not None:
+             self._executor.shutdown(wait=True)
+             self._executor = None
+
+
+ class SequentialExecutionBackend(ExecutionBackend):
+     """Sequential execution backend for debugging.
+
+     Executes tasks one at a time without parallelism.
+     Useful for debugging, testing, or when parallelism causes issues.
+     """
+
+     def map(
+         self,
+         func: Callable[[T], R],
+         items: Iterable[T],
+         *,
+         max_workers: int | None = None,
+         timeout: float | None = None,
+         **kwargs: Any,
+     ) -> Iterator[R]:
+         """Execute function sequentially.
+
+         Args:
+             func: Function to apply to each item
+             items: Iterable of items to process
+             max_workers: Ignored (no parallelism)
+             timeout: Timeout for each task (seconds)
+             **kwargs: Ignored
+
+         Yields:
+             Results in input order
+         """
+         for item in items:
+             result = func(item)
+             yield result
+
+     def shutdown(self) -> None:
+         """No-op for sequential execution."""
+         pass
+
+
+ __all__ = [
+     "ExecutionBackend",
+     "LocalExecutionBackend",
+     "SequentialExecutionBackend",
+ ]
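A sketch of a third-party backend built against the interface above. `ProcessPoolExecutionBackend` is hypothetical (not part of the package); it relies only on the `ExecutionBackend` contract shown in this diff and on the context-manager behavior provided by the base class.

```python
# Hedged sketch: a process-based backend conforming to ExecutionBackend.
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Any, Callable, Iterable, Iterator, TypeVar

from themis.backends import ExecutionBackend  # exported by themis/backends/__init__.py

T = TypeVar("T")
R = TypeVar("R")


class ProcessPoolExecutionBackend(ExecutionBackend):
    """CPU-parallel execution using processes instead of threads (hypothetical)."""

    def __init__(self, max_workers: int = 4):
        self._pool = ProcessPoolExecutor(max_workers=max_workers)

    def map(
        self,
        func: Callable[[T], R],
        items: Iterable[T],
        *,
        max_workers: int | None = None,
        timeout: float | None = None,
        **kwargs: Any,
    ) -> Iterator[R]:
        # Submit everything, then yield results in completion order,
        # matching the "any order" note in the interface docstring.
        futures = [self._pool.submit(func, item) for item in items]
        for future in as_completed(futures, timeout=timeout):
            yield future.result()

    def shutdown(self) -> None:
        self._pool.shutdown(wait=True)


if __name__ == "__main__":  # guard required for process pools on spawn-based platforms
    # __enter__/__exit__ come from ExecutionBackend, so shutdown() runs on exit.
    with ProcessPoolExecutionBackend(max_workers=2) as backend:
        print(sorted(backend.map(abs, [-3, -1, -2])))  # [1, 2, 3]
```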