themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +5 -2
- themis/_version.py +14 -1
- themis/api.py +83 -145
- themis/backends/storage.py +5 -0
- themis/cli/commands/info.py +2 -11
- themis/cli/main.py +231 -40
- themis/comparison/engine.py +7 -13
- themis/core/entities.py +4 -0
- themis/evaluation/metric_pipeline.py +12 -0
- themis/evaluation/pipeline.py +22 -0
- themis/evaluation/pipelines/__init__.py +4 -0
- themis/evaluation/pipelines/composable_pipeline.py +55 -0
- themis/evaluation/pipelines/standard_pipeline.py +16 -0
- themis/experiment/__init__.py +2 -2
- themis/experiment/cache_manager.py +15 -1
- themis/experiment/definitions.py +1 -1
- themis/experiment/orchestrator.py +21 -11
- themis/experiment/share.py +264 -0
- themis/experiment/storage.py +345 -298
- themis/generation/router.py +22 -4
- themis/generation/runner.py +16 -1
- themis/presets/benchmarks.py +602 -17
- themis/server/app.py +38 -26
- themis/session.py +125 -0
- themis/specs/__init__.py +7 -0
- themis/specs/execution.py +26 -0
- themis/specs/experiment.py +33 -0
- themis/specs/storage.py +18 -0
- themis/storage/__init__.py +6 -0
- themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
- themis/experiment/builder.py +0 -151
- themis/experiment/export_csv.py +0 -159
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/server/app.py
CHANGED
```diff
@@ -17,7 +17,8 @@ from pydantic import BaseModel, Field
 
 from themis.comparison import compare_runs
 from themis.comparison.statistics import StatisticalTest
-from themis.
+from themis.storage import ExperimentStorage
+from themis._version import __version__
 
 
 class RunSummary(BaseModel):
@@ -71,7 +72,7 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     app = FastAPI(
         title="Themis API",
         description="REST API for Themis experiment management",
-        version=
+        version=__version__,
     )
 
     # Enable CORS for web dashboard
@@ -117,13 +118,16 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
         return {
             "status": "ok",
             "service": "themis-api",
-            "version":
+            "version": __version__,
         }
 
     @app.get("/api/runs", response_model=List[RunSummary], tags=["runs"])
     async def list_runs():
         """List all experiment runs."""
-
+        run_entries = storage.list_runs()
+        run_ids = [
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        ]
 
         summaries = []
         for run_id in run_ids:
@@ -133,15 +137,12 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
             # Calculate average metrics
             metrics_dict: Dict[str, List[float]] = {}
             for record in eval_records.values():
-                for
+                for score_obj in record.scores:
+                    metric_name = score_obj.metric_name
                     if metric_name not in metrics_dict:
                         metrics_dict[metric_name] = []
-
-
-                    if hasattr(score_obj, 'value'):
-                        metrics_dict[metric_name].append(score_obj.value)
-                    elif isinstance(score_obj, (int, float)):
-                        metrics_dict[metric_name].append(float(score_obj))
+
+                    metrics_dict[metric_name].append(score_obj.value)
 
             # Average metrics
             avg_metrics = {
@@ -162,7 +163,11 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     @app.get("/api/runs/{run_id}", response_model=RunDetail, tags=["runs"])
     async def get_run(run_id: str):
         """Get detailed information about a run."""
-
+        run_entries = storage.list_runs()
+        run_ids = [
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        ]
+        if run_id not in run_ids:
             raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")
 
         # Load records
@@ -179,25 +184,25 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
 
             # Extract scores
             scores = {}
-            for
-
-
-
-                    value = float(score_obj)
-                else:
-                    continue
-
+            for score_obj in eval_record.scores:
+                metric_name = score_obj.metric_name
+                value = score_obj.value
+
                 scores[metric_name] = value
-
+
                 if metric_name not in metrics_dict:
                     metrics_dict[metric_name] = []
                 metrics_dict[metric_name].append(value)
 
             # Build sample
+            sample_id = eval_record.sample_id
+            if sample_id is None and gen_record is not None:
+                sample_id = gen_record.task.metadata.get("dataset_id")
+
             sample = {
-                "id":
-                "prompt": gen_record.prompt if gen_record else "",
-                "response": gen_record.
+                "id": sample_id or cache_key,
+                "prompt": gen_record.task.prompt.text if gen_record else "",
+                "response": gen_record.output.text if gen_record and gen_record.output else "",
                 "scores": scores,
             }
             samples.append(sample)
@@ -220,7 +225,11 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     @app.delete("/api/runs/{run_id}", tags=["runs"])
     async def delete_run(run_id: str):
         """Delete a run."""
-
+        run_entries = storage.list_runs()
+        run_ids = [
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        ]
+        if run_id not in run_ids:
             raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")
 
         # Note: Current storage doesn't implement delete
@@ -234,7 +243,10 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     async def compare_runs_api(request: ComparisonRequest):
         """Compare multiple runs."""
         # Validate runs exist
-
+        run_entries = storage.list_runs()
+        existing_runs = set(
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        )
        for run_id in request.run_ids:
            if run_id not in existing_runs:
                raise HTTPException(
```
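For orientation, the run endpoints touched above could be exercised with a small client sketch. Assumptions not shown in the diff: the API is served locally at `http://localhost:8000` (e.g. via `themis serve`) and `RunDetail` exposes the `samples` list built in `get_run`; only the `/api/runs` routes and the 404 behaviour come from the code above.

```python
import httpx

BASE_URL = "http://localhost:8000"  # assumed host/port, not part of the diff

# GET /api/runs -> list of RunSummary objects
runs = httpx.get(f"{BASE_URL}/api/runs").json()
print(f"{len(runs)} runs stored")

# GET /api/runs/{run_id} -> RunDetail; "my-experiment" is a placeholder run id
resp = httpx.get(f"{BASE_URL}/api/runs/my-experiment")
if resp.status_code == 404:
    print(resp.json()["detail"])  # "Run not found: my-experiment"
else:
    # "samples" field name is an assumption based on the samples list built above
    for sample in resp.json()["samples"]:
        print(sample["id"], sample["scores"])
```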
themis/session.py
ADDED
```python
"""Experiment session orchestration for vNext workflows."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Sequence

from themis.core.entities import ExperimentReport, ModelSpec, SamplingConfig
from themis.evaluation.pipeline import EvaluationPipelineContract
from themis.experiment.orchestrator import ExperimentOrchestrator
from themis.generation.plan import GenerationPlan
from themis.generation.router import ProviderRouter
from themis.generation.runner import GenerationRunner
from themis.generation.templates import PromptTemplate
from themis.interfaces import DatasetAdapter
from themis.presets import parse_model_name
from themis.providers import create_provider
from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec


@dataclass
class ExperimentSession:
    """Main entry point for running experiments with vNext specs."""

    def run(
        self,
        spec: ExperimentSpec,
        *,
        execution: ExecutionSpec | None = None,
        storage: StorageSpec | None = None,
    ) -> ExperimentReport:
        execution = execution or ExecutionSpec()
        storage = storage or StorageSpec()

        pipeline = spec.pipeline
        if not isinstance(pipeline, EvaluationPipelineContract):
            raise TypeError(
                "spec.pipeline must implement EvaluationPipelineContract."
            )

        dataset = _resolve_dataset(spec.dataset)

        provider_name, model_id, provider_options = _parse_model(spec.model)
        model_spec = ModelSpec(identifier=model_id, provider=provider_name)
        sampling = _build_sampling(spec.sampling)

        plan = GenerationPlan(
            templates=[PromptTemplate(name="default", template=spec.prompt)],
            models=[model_spec],
            sampling_parameters=[sampling],
            dataset_id_field="id",
            reference_field="answer",
        )

        provider = create_provider(provider_name, **provider_options)
        router = ProviderRouter({(provider_name, model_id): provider})

        runner = GenerationRunner(
            provider=router,
            max_parallel=execution.workers,
            max_retries=execution.max_retries,
            retry_initial_delay=execution.retry_initial_delay,
            retry_backoff_multiplier=execution.retry_backoff_multiplier,
            retry_max_delay=execution.retry_max_delay,
            execution_backend=execution.backend,
        )

        storage_backend = _resolve_storage(storage)

        orchestrator = ExperimentOrchestrator(
            generation_plan=plan,
            generation_runner=runner,
            evaluation_pipeline=pipeline,
            storage=storage_backend,
        )

        return orchestrator.run(
            dataset=dataset,
            run_id=spec.run_id,
            resume=storage.cache,
            cache_results=storage.cache,
        )


def _parse_model(model: str) -> tuple[str, str, dict]:
    if ":" in model:
        provider_name, model_id = model.split(":", 1)
        return provider_name, model_id, {}
    return parse_model_name(model)


def _build_sampling(data: dict) -> SamplingConfig:
    return SamplingConfig(
        temperature=float(data.get("temperature", 0.0)),
        top_p=float(data.get("top_p", 0.95)),
        max_tokens=int(data.get("max_tokens", 512)),
    )


def _resolve_dataset(dataset: object) -> list[dict]:
    if isinstance(dataset, DatasetAdapter):
        return list(dataset.iter_samples())
    if isinstance(dataset, Iterable):
        return list(dataset)  # type: ignore[arg-type]
    raise TypeError("spec.dataset must be iterable or implement DatasetAdapter.")


def _resolve_storage(storage: StorageSpec):
    if storage.backend is not None:
        backend = storage.backend
        if hasattr(backend, "experiment_storage"):
            return backend.experiment_storage
        if not hasattr(backend, "start_run"):
            raise TypeError(
                "storage.backend must be ExperimentStorage-compatible."
            )
        return backend
    root = Path(storage.path) if storage.path is not None else Path(".cache/experiments")
    from themis.storage import ExperimentStorage

    return ExperimentStorage(root)


__all__ = ["ExperimentSession"]
```
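Read together with the spec dataclasses below, the new session module implies roughly the following workflow. This is a minimal wiring sketch, assuming a dataset with `id`/`answer` fields (the defaults wired into `GenerationPlan` above), a `{question}`-style prompt placeholder, and some object `pipeline` that implements `EvaluationPipelineContract`; the concrete pipeline constructor is not part of this diff.

```python
from themis.session import ExperimentSession
from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec

pipeline = ...  # placeholder: must implement EvaluationPipelineContract (not shown in this diff)

spec = ExperimentSpec(
    dataset=[{"id": "1", "question": "What is 2 + 3?", "answer": "5"}],
    prompt="Solve the problem: {question}",  # template syntax is an assumption
    model="openai:gpt-4o-mini",              # "provider:model" splits on the first ":"
    sampling={"temperature": 0.0, "max_tokens": 256},
    pipeline=pipeline,
    run_id="session-demo",
)

report = ExperimentSession().run(
    spec,
    execution=ExecutionSpec(workers=8),                           # parallelism + retry policy
    storage=StorageSpec(path=".cache/experiments", cache=True),   # resume/cache behaviour
)
```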
themis/specs/execution.py
ADDED
```python
"""Execution specification for vNext workflows."""

from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class ExecutionSpec:
    """Execution configuration for running experiments."""

    backend: object | None = None
    workers: int = 4
    max_retries: int = 3
    retry_initial_delay: float = 0.5
    retry_backoff_multiplier: float = 2.0
    retry_max_delay: float | None = 2.0

    def __post_init__(self) -> None:
        if self.workers < 1:
            raise ValueError("ExecutionSpec.workers must be >= 1.")
        if self.max_retries < 1:
            raise ValueError("ExecutionSpec.max_retries must be >= 1.")


__all__ = ["ExecutionSpec"]
```
themis/specs/experiment.py
ADDED
```python
"""Experiment specification for vNext workflows."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Mapping


@dataclass(frozen=True)
class ExperimentSpec:
    """Canonical experiment specification.

    This spec is the single source of truth for the experiment's
    dataset, prompt, model, sampling config, and evaluation pipeline.
    """

    dataset: object
    prompt: str
    model: str
    sampling: Mapping[str, Any] = field(default_factory=dict)
    pipeline: object | None = None
    run_id: str | None = None

    def __post_init__(self) -> None:
        if not self.prompt:
            raise ValueError("ExperimentSpec.prompt must be a non-empty string.")
        if not self.model:
            raise ValueError("ExperimentSpec.model must be a non-empty string.")
        if self.pipeline is None:
            raise ValueError("ExperimentSpec.pipeline must be provided.")


__all__ = ["ExperimentSpec"]
```
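Both spec dataclasses validate eagerly in `__post_init__`, so misconfigured experiments fail at construction time rather than mid-run. A small runnable check, using only the fields shown above:

```python
from themis.specs import ExecutionSpec, ExperimentSpec

try:
    ExecutionSpec(workers=0)
except ValueError as err:
    print(err)  # ExecutionSpec.workers must be >= 1.

try:
    ExperimentSpec(dataset=[], prompt="Answer: {q}", model="gpt-4")  # no pipeline given
except ValueError as err:
    print(err)  # ExperimentSpec.pipeline must be provided.
```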
themis/specs/storage.py
ADDED
```python
"""Storage specification for vNext workflows."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class StorageSpec:
    """Storage configuration for experiment persistence and caching."""

    backend: object | None = None
    path: str | Path | None = None
    cache: bool = True


__all__ = ["StorageSpec"]
```
themis/storage/__init__.py
ADDED
```python
"""Storage backends and adapters for vNext workflows."""

from themis.backends.storage import LocalFileStorageBackend, StorageBackend
from themis.storage.experiment_storage import ExperimentStorage

__all__ = ["StorageBackend", "LocalFileStorageBackend", "ExperimentStorage"]
```
{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.
+Version: 1.0.0
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -100,13 +100,14 @@ pip install themis-eval[math,nlp,code,server]
 from themis import evaluate
 
 # Evaluate any model on any benchmark
-
-
+report = evaluate(
+    "gsm8k",
     model="gpt-4",
-    limit=100
+    limit=100,
 )
 
-
+accuracy = report.evaluation_report.metrics["ExactMatch"].mean
+print(f"Accuracy: {accuracy:.2%}")
 ```
 
 ### CLI Usage
@@ -122,6 +123,9 @@ themis compare gpt4-run claude-run
 
 # Start web dashboard
 themis serve
+
+# Share a run
+themis share gpt4-run --output-dir share
 ```
 
 ---
@@ -130,20 +134,28 @@ themis serve
 
 ### 🎯 Built-in Benchmarks
 
-Themis includes
+Themis includes 19 built-in benchmarks out-of-the-box:
 
 ```python
 # Math reasoning
-evaluate(
-evaluate(
-evaluate(
+evaluate("gsm8k", model="gpt-4", limit=100)
+evaluate("math500", model="gpt-4", limit=50)
+evaluate("aime24", model="gpt-4")
 
 # General knowledge
-evaluate(
-evaluate(
+evaluate("mmlu-pro", model="gpt-4", limit=1000)
+evaluate("supergpqa", model="gpt-4")
+
+# Science & medical
+evaluate("gpqa", model="gpt-4", limit=200)
+evaluate("medmcqa", model="gpt-4", limit=200)
+
+# Commonsense & conversational
+evaluate("commonsense_qa", model="gpt-4", limit=200)
+evaluate("coqa", model="gpt-4", limit=200)
 
 # Quick testing
-evaluate(
+evaluate("demo", model="fake-math-llm", limit=10)
 ```
 
 **See all available benchmarks:**
@@ -165,8 +177,7 @@ themis list benchmarks
 
 ```python
 # Use specific metrics
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     metrics=["exact_match", "bleu", "rouge1"],
 )
@@ -192,7 +203,7 @@ print(report.summary())
 
 **CLI:**
 ```bash
-themis compare run-1 run-2 --
+themis compare run-1 run-2 --output comparison.html
 ```
 
 ### 🌐 Web Dashboard
@@ -218,19 +229,19 @@ Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider sup
 
 ```python
 # OpenAI
-evaluate(
+evaluate("gsm8k", model="gpt-4")
 
 # Anthropic
-evaluate(
+evaluate("gsm8k", model="claude-3-opus-20240229")
 
 # Azure OpenAI
-evaluate(
+evaluate("gsm8k", model="azure/gpt-4")
 
 # Local models (vLLM, Ollama, etc.)
-evaluate(
+evaluate("gsm8k", model="ollama/llama3")
 
 # AWS Bedrock
-evaluate(
+evaluate("gsm8k", model="bedrock/anthropic.claude-3")
 ```
 
 ### 💾 Smart Caching
@@ -239,8 +250,7 @@ Themis automatically caches results and resumes failed runs:
 
 ```python
 # Run with caching
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     limit=1000,
     run_id="my-experiment",
@@ -275,14 +285,13 @@ result = evaluate(
     metrics=["exact_match"],
 )
 
-print(result.
+print(result.evaluation_report.metrics["ExactMatch"].mean)
 ```
 
 ### Advanced Configuration
 
 ```python
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     temperature=0.7,
     max_tokens=512,
@@ -335,7 +344,7 @@ Themis is built on a clean, modular architecture:
      │            │
 ┌────▼─────┐ ┌────▼─────┐
 │Benchmarks│ │Evaluation│
-│(
+│(19 built-│ │ Pipeline │
 │   in)    │ └────┬─────┘
 └──────────┘      │
              ┌────▼─────┐
@@ -359,7 +368,7 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[
+- **[Backends API](docs/api/backends.md)** - Custom storage and execution
 - **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
 - **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
@@ -382,14 +391,13 @@ class S3StorageBackend(StorageBackend):
     # ... implement other methods
 
 # Use custom backend
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     storage_backend=S3StorageBackend(bucket="my-bucket")
 )
 ```
 
-See [docs/
+See [docs/api/backends.md](docs/api/backends.md) for details.
 
 ### Distributed Execution
 
@@ -401,8 +409,7 @@ class RayExecutionBackend(ExecutionBackend):
     """Distributed execution with Ray"""
     # ... implementation
 
-result = evaluate(
-    benchmark="math500",
+result = evaluate("math500",
     model="gpt-4",
     execution_backend=RayExecutionBackend(num_cpus=32)
 )
@@ -454,10 +461,10 @@ themis eval <benchmark> --model <model> [options]
 themis compare <run-id-1> <run-id-2> [run-id-3...] [options]
 
 # Options:
+#   --metric NAME     Restrict to one metric
 #   --storage PATH    Storage directory
-#   --test STR        Statistical test: t_test, bootstrap, permutation
-#   --alpha FLOAT     Significance level (default: 0.05)
 #   --output FILE     Export report (.json, .html, .md)
+#   --show-diff       Include detailed per-sample differences in summary
 ```
 
 ### Server
@@ -539,6 +546,12 @@ uv run python examples-simple/04_comparison.py
 
 # API server example
 uv run python examples-simple/05_api_server.py
+
+# Resume/cache example
+uv run python examples-simple/08_resume_cache.py
+
+# End-to-end research loop example
+uv run python examples-simple/09_research_loop.py
 ```
 
 ---
````