themis-eval 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis_eval-0.2.0/PKG-INFO +596 -0
- themis_eval-0.2.0/README.md +538 -0
- {themis_eval-0.1.0 → themis_eval-0.2.0}/pyproject.toml +28 -6
- themis_eval-0.2.0/themis/__init__.py +25 -0
- {themis_eval-0.1.0 → themis_eval-0.2.0}/themis/_version.py +2 -2
- themis_eval-0.2.0/themis/api.py +343 -0
- themis_eval-0.2.0/themis/backends/__init__.py +17 -0
- themis_eval-0.2.0/themis/backends/execution.py +197 -0
- themis_eval-0.2.0/themis/backends/storage.py +260 -0
- themis_eval-0.2.0/themis/cli/__init__.py +5 -0
- themis_eval-0.2.0/themis/cli/__main__.py +6 -0
- themis_eval-0.2.0/themis/cli/commands/__init__.py +19 -0
- themis_eval-0.2.0/themis/cli/commands/benchmarks.py +221 -0
- themis_eval-0.2.0/themis/cli/commands/comparison.py +394 -0
- themis_eval-0.2.0/themis/cli/commands/config_commands.py +244 -0
- themis_eval-0.2.0/themis/cli/commands/cost.py +214 -0
- themis_eval-0.2.0/themis/cli/commands/demo.py +68 -0
- themis_eval-0.2.0/themis/cli/commands/info.py +90 -0
- themis_eval-0.2.0/themis/cli/commands/leaderboard.py +362 -0
- themis_eval-0.2.0/themis/cli/commands/math_benchmarks.py +318 -0
- themis_eval-0.2.0/themis/cli/commands/mcq_benchmarks.py +207 -0
- themis_eval-0.2.0/themis/cli/commands/results.py +252 -0
- themis_eval-0.2.0/themis/cli/commands/sample_run.py +244 -0
- themis_eval-0.2.0/themis/cli/commands/visualize.py +299 -0
- themis_eval-0.2.0/themis/cli/main.py +463 -0
- themis_eval-0.2.0/themis/cli/new_project.py +33 -0
- themis_eval-0.2.0/themis/cli/utils.py +51 -0
- themis_eval-0.2.0/themis/comparison/__init__.py +25 -0
- themis_eval-0.2.0/themis/comparison/engine.py +348 -0
- themis_eval-0.2.0/themis/comparison/reports.py +283 -0
- themis_eval-0.2.0/themis/comparison/statistics.py +402 -0
- themis_eval-0.2.0/themis/config/__init__.py +19 -0
- themis_eval-0.2.0/themis/config/loader.py +27 -0
- themis_eval-0.2.0/themis/config/registry.py +34 -0
- themis_eval-0.2.0/themis/config/runtime.py +214 -0
- themis_eval-0.2.0/themis/config/schema.py +112 -0
- themis_eval-0.2.0/themis/core/__init__.py +5 -0
- themis_eval-0.2.0/themis/core/conversation.py +354 -0
- themis_eval-0.2.0/themis/core/entities.py +184 -0
- themis_eval-0.2.0/themis/core/serialization.py +231 -0
- themis_eval-0.2.0/themis/core/tools.py +393 -0
- themis_eval-0.2.0/themis/core/types.py +141 -0
- themis_eval-0.2.0/themis/datasets/__init__.py +273 -0
- themis_eval-0.2.0/themis/datasets/base.py +264 -0
- themis_eval-0.2.0/themis/datasets/commonsense_qa.py +174 -0
- themis_eval-0.2.0/themis/datasets/competition_math.py +265 -0
- themis_eval-0.2.0/themis/datasets/coqa.py +133 -0
- themis_eval-0.2.0/themis/datasets/gpqa.py +190 -0
- themis_eval-0.2.0/themis/datasets/gsm8k.py +123 -0
- themis_eval-0.2.0/themis/datasets/gsm_symbolic.py +124 -0
- themis_eval-0.2.0/themis/datasets/math500.py +122 -0
- themis_eval-0.2.0/themis/datasets/med_qa.py +179 -0
- themis_eval-0.2.0/themis/datasets/medmcqa.py +169 -0
- themis_eval-0.2.0/themis/datasets/mmlu_pro.py +262 -0
- themis_eval-0.2.0/themis/datasets/piqa.py +146 -0
- themis_eval-0.2.0/themis/datasets/registry.py +201 -0
- themis_eval-0.2.0/themis/datasets/schema.py +245 -0
- themis_eval-0.2.0/themis/datasets/sciq.py +150 -0
- themis_eval-0.2.0/themis/datasets/social_i_qa.py +151 -0
- themis_eval-0.2.0/themis/datasets/super_gpqa.py +263 -0
- themis_eval-0.2.0/themis/evaluation/__init__.py +1 -0
- themis_eval-0.2.0/themis/evaluation/conditional.py +410 -0
- themis_eval-0.2.0/themis/evaluation/extractors/__init__.py +19 -0
- themis_eval-0.2.0/themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis_eval-0.2.0/themis/evaluation/extractors/exceptions.py +7 -0
- themis_eval-0.2.0/themis/evaluation/extractors/identity_extractor.py +29 -0
- themis_eval-0.2.0/themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis_eval-0.2.0/themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis_eval-0.2.0/themis/evaluation/extractors/regex_extractor.py +43 -0
- themis_eval-0.2.0/themis/evaluation/math_verify_utils.py +87 -0
- themis_eval-0.2.0/themis/evaluation/metrics/__init__.py +21 -0
- themis_eval-0.2.0/themis/evaluation/metrics/code/__init__.py +19 -0
- themis_eval-0.2.0/themis/evaluation/metrics/code/codebleu.py +144 -0
- themis_eval-0.2.0/themis/evaluation/metrics/code/execution.py +280 -0
- themis_eval-0.2.0/themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis_eval-0.2.0/themis/evaluation/metrics/composite_metric.py +47 -0
- themis_eval-0.2.0/themis/evaluation/metrics/consistency_metric.py +80 -0
- themis_eval-0.2.0/themis/evaluation/metrics/exact_match.py +51 -0
- themis_eval-0.2.0/themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis_eval-0.2.0/themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis_eval-0.2.0/themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis_eval-0.2.0/themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis_eval-0.2.0/themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis_eval-0.2.0/themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis_eval-0.2.0/themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis_eval-0.2.0/themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis_eval-0.2.0/themis/evaluation/metrics/response_length.py +33 -0
- themis_eval-0.2.0/themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis_eval-0.2.0/themis/evaluation/pipeline.py +49 -0
- themis_eval-0.2.0/themis/evaluation/pipelines/__init__.py +15 -0
- themis_eval-0.2.0/themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis_eval-0.2.0/themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis_eval-0.2.0/themis/evaluation/reports.py +293 -0
- themis_eval-0.2.0/themis/evaluation/statistics/__init__.py +53 -0
- themis_eval-0.2.0/themis/evaluation/statistics/bootstrap.py +79 -0
- themis_eval-0.2.0/themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis_eval-0.2.0/themis/evaluation/statistics/distributions.py +207 -0
- themis_eval-0.2.0/themis/evaluation/statistics/effect_sizes.py +124 -0
- themis_eval-0.2.0/themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis_eval-0.2.0/themis/evaluation/statistics/types.py +139 -0
- themis_eval-0.2.0/themis/evaluation/strategies/__init__.py +13 -0
- themis_eval-0.2.0/themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis_eval-0.2.0/themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis_eval-0.2.0/themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis_eval-0.2.0/themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis_eval-0.2.0/themis/experiment/__init__.py +5 -0
- themis_eval-0.2.0/themis/experiment/builder.py +151 -0
- themis_eval-0.2.0/themis/experiment/cache_manager.py +134 -0
- themis_eval-0.2.0/themis/experiment/comparison.py +631 -0
- themis_eval-0.2.0/themis/experiment/cost.py +310 -0
- themis_eval-0.2.0/themis/experiment/definitions.py +62 -0
- themis_eval-0.2.0/themis/experiment/export.py +798 -0
- themis_eval-0.2.0/themis/experiment/export_csv.py +159 -0
- themis_eval-0.2.0/themis/experiment/integration_manager.py +104 -0
- themis_eval-0.2.0/themis/experiment/math.py +192 -0
- themis_eval-0.2.0/themis/experiment/mcq.py +169 -0
- themis_eval-0.2.0/themis/experiment/orchestrator.py +415 -0
- themis_eval-0.2.0/themis/experiment/pricing.py +317 -0
- themis_eval-0.2.0/themis/experiment/storage.py +1458 -0
- themis_eval-0.2.0/themis/experiment/visualization.py +588 -0
- themis_eval-0.2.0/themis/generation/__init__.py +1 -0
- themis_eval-0.2.0/themis/generation/agentic_runner.py +420 -0
- themis_eval-0.2.0/themis/generation/batching.py +254 -0
- themis_eval-0.2.0/themis/generation/clients.py +143 -0
- themis_eval-0.2.0/themis/generation/conversation_runner.py +236 -0
- themis_eval-0.2.0/themis/generation/plan.py +456 -0
- themis_eval-0.2.0/themis/generation/providers/litellm_provider.py +221 -0
- themis_eval-0.2.0/themis/generation/providers/vllm_provider.py +135 -0
- themis_eval-0.2.0/themis/generation/router.py +34 -0
- themis_eval-0.2.0/themis/generation/runner.py +207 -0
- themis_eval-0.2.0/themis/generation/strategies.py +98 -0
- themis_eval-0.2.0/themis/generation/templates.py +71 -0
- themis_eval-0.2.0/themis/generation/turn_strategies.py +393 -0
- themis_eval-0.2.0/themis/generation/types.py +9 -0
- themis_eval-0.2.0/themis/integrations/huggingface.py +72 -0
- themis_eval-0.2.0/themis/integrations/wandb.py +77 -0
- themis_eval-0.2.0/themis/interfaces/__init__.py +169 -0
- themis_eval-0.2.0/themis/presets/__init__.py +10 -0
- themis_eval-0.2.0/themis/presets/benchmarks.py +354 -0
- themis_eval-0.2.0/themis/presets/models.py +190 -0
- themis_eval-0.2.0/themis/project/__init__.py +20 -0
- themis_eval-0.2.0/themis/project/definitions.py +98 -0
- themis_eval-0.2.0/themis/project/patterns.py +230 -0
- themis_eval-0.2.0/themis/providers/__init__.py +5 -0
- themis_eval-0.2.0/themis/providers/registry.py +39 -0
- themis_eval-0.2.0/themis/py.typed +0 -0
- themis_eval-0.2.0/themis/server/__init__.py +28 -0
- themis_eval-0.2.0/themis/server/app.py +337 -0
- themis_eval-0.2.0/themis/utils/api_generator.py +379 -0
- themis_eval-0.2.0/themis/utils/cost_tracking.py +376 -0
- themis_eval-0.2.0/themis/utils/dashboard.py +452 -0
- themis_eval-0.2.0/themis/utils/logging_utils.py +41 -0
- themis_eval-0.2.0/themis/utils/progress.py +58 -0
- themis_eval-0.2.0/themis/utils/tracing.py +320 -0
- themis_eval-0.2.0/themis_eval.egg-info/PKG-INFO +596 -0
- themis_eval-0.2.0/themis_eval.egg-info/SOURCES.txt +161 -0
- themis_eval-0.2.0/themis_eval.egg-info/requires.txt +47 -0
- themis_eval-0.1.0/PKG-INFO +0 -758
- themis_eval-0.1.0/README.md +0 -718
- themis_eval-0.1.0/themis/__init__.py +0 -14
- themis_eval-0.1.0/themis_eval.egg-info/PKG-INFO +0 -758
- themis_eval-0.1.0/themis_eval.egg-info/SOURCES.txt +0 -12
- themis_eval-0.1.0/themis_eval.egg-info/requires.txt +0 -25
- {themis_eval-0.1.0 → themis_eval-0.2.0}/LICENSE +0 -0
- {themis_eval-0.1.0 → themis_eval-0.2.0}/setup.cfg +0 -0
- {themis_eval-0.1.0 → themis_eval-0.2.0}/tests/test_package_metadata.py +0 -0
- /themis_eval-0.1.0/themis/py.typed → /themis_eval-0.2.0/themis/integrations/__init__.py +0 -0
- {themis_eval-0.1.0 → themis_eval-0.2.0}/themis_eval.egg-info/dependency_links.txt +0 -0
- {themis_eval-0.1.0 → themis_eval-0.2.0}/themis_eval.egg-info/top_level.txt +0 -0
@@ -0,0 +1,596 @@
Metadata-Version: 2.4
Name: themis-eval
Version: 0.2.0
Summary: Lightweight evaluation platform for LLM experiments
Author: Pittawat Taveekitworachai
License: MIT
Project-URL: Resources, https://github.com/Pittawat2542/themis
Project-URL: Homepage, https://pittawat2542.github.io/themis/
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.12
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: pydantic>=2.12.5
Requires-Dist: cyclopts>=4.0.0
Requires-Dist: hydra-core>=1.3
Requires-Dist: tqdm>=4.67
Requires-Dist: httpx>=0.27
Requires-Dist: litellm>=1.81.0
Requires-Dist: tabulate>=0.9.0
Requires-Dist: tenacity>=9.1.2
Requires-Dist: plotly>=6.5.0
Requires-Dist: math-verify>=0.8.0
Provides-Extra: dev
Requires-Dist: pytest>=8.0; extra == "dev"
Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
Requires-Dist: pytest-timeout>=2.3.1; extra == "dev"
Requires-Dist: pytest-asyncio>=0.24.0; extra == "dev"
Requires-Dist: ruff>=0.8.5; extra == "dev"
Requires-Dist: mypy>=1.14.0; extra == "dev"
Provides-Extra: math
Requires-Dist: datasets>=2.20.0; extra == "math"
Requires-Dist: math-verify>=0.8.0; extra == "math"
Provides-Extra: nlp
Requires-Dist: sacrebleu>=2.4.0; extra == "nlp"
Requires-Dist: rouge-score>=0.1.2; extra == "nlp"
Requires-Dist: bert-score>=0.3.13; extra == "nlp"
Requires-Dist: nltk>=3.8.0; extra == "nlp"
Provides-Extra: code
Requires-Dist: codebleu>=0.7.0; extra == "code"
Provides-Extra: viz
Requires-Dist: plotly>=5.18.0; extra == "viz"
Provides-Extra: server
Requires-Dist: fastapi>=0.128.0; extra == "server"
Requires-Dist: uvicorn[standard]>=0.32.0; extra == "server"
Requires-Dist: websockets>=14.0; extra == "server"
Provides-Extra: docs
Requires-Dist: mkdocs>=1.6.0; extra == "docs"
Requires-Dist: mkdocs-material>=9.5.0; extra == "docs"
Requires-Dist: mkdocstrings[python]>=0.25.0; extra == "docs"
Provides-Extra: all
Requires-Dist: themis-eval[code,docs,math,nlp,server,viz]; extra == "all"
Dynamic: license-file

# Themis

> **Modern LLM evaluation framework for researchers and practitioners**

Themis makes it easy to evaluate language models systematically with one-liner Python APIs, built-in benchmarks, statistical comparisons, and a web dashboard.

[Python 3.12+](https://www.python.org/downloads/)
[License: MIT](https://opensource.org/licenses/MIT)

---

## Why Themis?

- **🚀 Simple**: One-line Python API or CLI commands—no configuration files needed
- **📊 Comprehensive**: 100+ LLM providers, built-in benchmarks, NLP & code metrics
- **🔬 Statistical**: Compare runs with t-tests, bootstrap, and permutation tests
- **💾 Reliable**: Automatic caching, resume failed runs, smart cache invalidation
- **🌐 Visual**: Web dashboard for exploring results and comparisons
- **🔌 Extensible**: Pluggable backends for custom storage and execution

---

## Quick Start

### Installation

```bash
# Using pip
pip install themis-eval

# Or with uv (recommended)
uv pip install themis-eval

# With optional features (quoted so shells like zsh don't expand the brackets)
pip install "themis-eval[math,nlp,code,server]"
```

### One-Liner Evaluation

```python
from themis import evaluate

# Evaluate any model on any benchmark
result = evaluate(
    benchmark="gsm8k",
    model="gpt-4",
    limit=100
)

print(f"Accuracy: {result.metrics['exact_match']:.2%}")
```

### CLI Usage

```bash
# Evaluate a model
themis eval gsm8k --model gpt-4 --limit 100

# Compare two models
themis eval gsm8k --model gpt-4 --limit 100 --run-id gpt4-run
themis eval gsm8k --model claude-3-opus --limit 100 --run-id claude-run
themis compare gpt4-run claude-run

# Start web dashboard
themis serve
```

---

## Features

### 🎯 Built-in Benchmarks

Themis includes six popular benchmarks out of the box:

```python
# Math reasoning
evaluate(benchmark="gsm8k", model="gpt-4", limit=100)
evaluate(benchmark="math500", model="gpt-4", limit=50)
evaluate(benchmark="aime24", model="gpt-4")

# General knowledge
evaluate(benchmark="mmlu_pro", model="gpt-4", limit=1000)
evaluate(benchmark="supergpqa", model="gpt-4")

# Quick testing
evaluate(benchmark="demo", model="fake-math-llm", limit=10)
```

**See all available benchmarks:**
```bash
themis list benchmarks
```

### 📈 Rich Metrics

**Math Metrics:**
- Exact Match
- Math Verification (symbolic & numeric)

**NLP Metrics:**
- BLEU, ROUGE, BERTScore, METEOR

**Code Metrics:**
- Pass@k, CodeBLEU, Execution Accuracy (a Pass@k sketch follows the example below)

```python
# Use specific metrics
result = evaluate(
    benchmark="gsm8k",
    model="gpt-4",
    metrics=["exact_match", "bleu", "rouge1"],
)
```
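
Pass@k is typically reported with the unbiased estimator from Chen et al. (2021) rather than a naive average over samples. As a reference for what the metric computes, here is that estimator in plain Python; it is a sketch of the standard formula, not necessarily the exact code behind Themis's `pass_at_k` metric:

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator (Chen et al., 2021).

    n: samples generated per problem
    c: samples that passed
    k: evaluation budget

    Returns the probability that at least one of k samples drawn
    uniformly from the n generated ones passes.
    """
    if n - c < k:
        return 1.0  # fewer failures than draws, so a pass is guaranteed
    return 1.0 - comb(n - c, k) / comb(n, k)

print(pass_at_k(n=10, c=3, k=5))  # ≈ 0.917
```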

### 🔬 Statistical Comparison

Compare multiple runs with statistical significance testing:

```python
from themis.comparison import compare_runs

report = compare_runs(
    run_ids=["gpt4-run", "claude-run"],
    storage_path=".cache/experiments",
    statistical_test="bootstrap",
    alpha=0.05
)

print(report.summary())
# Shows: win/loss matrices, p-values, effect sizes
```

**CLI:**
```bash
themis compare run-1 run-2 --test bootstrap --output comparison.html
```

### 🌐 Web Dashboard

Start the API server and view results in your browser:

```bash
themis serve

# Open http://localhost:8080/dashboard
# API docs at http://localhost:8080/docs
```

**Features:**
- List all experiment runs
- View detailed results
- Compare multiple runs
- REST API + WebSocket support

### 🔌 100+ LLM Providers

Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider support:

```python
# OpenAI
evaluate(benchmark="gsm8k", model="gpt-4")

# Anthropic
evaluate(benchmark="gsm8k", model="claude-3-opus-20240229")

# Azure OpenAI
evaluate(benchmark="gsm8k", model="azure/gpt-4")

# Local models (vLLM, Ollama, etc.)
evaluate(benchmark="gsm8k", model="ollama/llama3")

# AWS Bedrock
evaluate(benchmark="gsm8k", model="bedrock/anthropic.claude-3")
```

### 💾 Smart Caching

Themis automatically caches results and resumes failed runs:

```python
# Run with caching
result = evaluate(
    benchmark="gsm8k",
    model="gpt-4",
    limit=1000,
    run_id="my-experiment",
    resume=True  # Skip already-evaluated samples
)
```

Cache invalidation is automatic when you change any of the following (see the sketch after this list):
- Model parameters (temperature, max_tokens, etc.)
- Prompt template
- Evaluation metrics
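
As a concrete illustration, the sketch below walks through the resume workflow using only the `evaluate()` arguments documented above; the comments spell out the cache behavior described in this section, not a guaranteed specification:

```python
from themis import evaluate

# First attempt: some samples may fail (rate limits, timeouts, ...).
evaluate(benchmark="gsm8k", model="gpt-4", limit=1000,
         run_id="my-experiment", resume=True)

# Same run_id, same settings: completed samples are loaded from
# the cache and only the failed ones are re-queried.
evaluate(benchmark="gsm8k", model="gpt-4", limit=1000,
         run_id="my-experiment", resume=True)

# Changing a model parameter invalidates the cache, so this run
# regenerates from scratch.
evaluate(benchmark="gsm8k", model="gpt-4", limit=1000,
         temperature=0.7, run_id="my-experiment", resume=True)
```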

---

## Examples

### Custom Dataset

```python
from themis import evaluate

# Your own data
dataset = [
    {"prompt": "What is 2+2?", "answer": "4"},
    {"prompt": "What is 3+3?", "answer": "6"},
]

result = evaluate(
    dataset,
    model="gpt-4",
    prompt="Answer this math question: {prompt}",
    metrics=["exact_match"],
)

print(result.report)
```

### Advanced Configuration

```python
result = evaluate(
    benchmark="gsm8k",
    model="gpt-4",
    temperature=0.7,
    max_tokens=512,
    num_samples=3,  # Sample 3 responses per prompt
    workers=8,      # Parallel execution
    storage=".cache/my-experiments",
    run_id="experiment-2024-01",
)
```

### Programmatic Comparison

```python
from themis.comparison.statistics import bootstrap_confidence_interval

# Model A scores
scores_a = [0.85, 0.87, 0.83, 0.90, 0.82]
# Model B scores
scores_b = [0.78, 0.80, 0.79, 0.82, 0.77]

# Statistical test
result = bootstrap_confidence_interval(
    scores_a, scores_b,
    n_bootstrap=10000,
    confidence_level=0.95
)

print(f"Significant: {result.significant}")
print(f"CI: {result.confidence_interval}")
```
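
Conceptually, the bootstrap test resamples both score lists with replacement many times and checks whether the confidence interval for the difference in means excludes zero. Here is that idea as a self-contained sketch in plain Python, shown to illustrate the technique rather than Themis's actual implementation:

```python
import random

def bootstrap_mean_diff_ci(a, b, n_bootstrap=10_000, confidence=0.95, seed=0):
    """Percentile-bootstrap CI for mean(a) - mean(b).

    If the interval excludes 0, the gap between the two models is
    significant at roughly the (1 - confidence) level.
    """
    rng = random.Random(seed)
    diffs = []
    for _ in range(n_bootstrap):
        resample_a = [rng.choice(a) for _ in a]  # sample with replacement
        resample_b = [rng.choice(b) for _ in b]
        diffs.append(sum(resample_a) / len(a) - sum(resample_b) / len(b))
    diffs.sort()
    lower = diffs[int(n_bootstrap * (1 - confidence) / 2)]
    upper = diffs[int(n_bootstrap * (1 + confidence) / 2)]
    return lower, upper

lower, upper = bootstrap_mean_diff_ci(
    [0.85, 0.87, 0.83, 0.90, 0.82],
    [0.78, 0.80, 0.79, 0.82, 0.77],
)
print(f"95% CI for the accuracy gap: [{lower:.3f}, {upper:.3f}]")
```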

---

## Architecture

Themis is built on a clean, modular architecture:

```
┌─────────────────────────────────────────┐
│           themis.evaluate()             │  ← Simple API
│    (One-line evaluation interface)      │
└─────────────────┬───────────────────────┘
                  │
         ┌────────┴────────┐
         │                 │
    ┌────▼─────┐      ┌────▼─────┐
    │ Presets  │      │Generation│
    │  System  │      │ Pipeline │
    └────┬─────┘      └────┬─────┘
         │                 │
    ┌────▼─────┐      ┌────▼─────┐
    │Benchmarks│      │Evaluation│
    │(6 built- │      │ Pipeline │
    │   in)    │      └────┬─────┘
    └──────────┘           │
                      ┌────▼─────┐
                      │ Storage  │
                      │   (V2)   │
                      └──────────┘
```

**Key Components:**

- **Presets**: Pre-configured benchmarks with prompts, metrics, and datasets
- **Generation**: Model inference with caching and resume
- **Evaluation**: Metric computation with smart cache invalidation
- **Storage**: Atomic writes, file locking, SQLite metadata (see the sketch after this list)
- **Comparison**: Statistical tests, win/loss matrices
- **Server**: REST API and WebSocket for web dashboard
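
As background on the "atomic writes" guarantee, the sketch below shows the standard write-to-temp-then-rename pattern in plain Python. It illustrates the general technique only; the function name and details are ours, not Themis's actual storage code:

```python
import json
import os
import tempfile

def atomic_write_json(path: str, payload: dict) -> None:
    """Write JSON to a temp file in the same directory, then rename.

    os.replace() is atomic on POSIX and Windows, so readers never
    observe a half-written file even if the writer crashes.
    """
    directory = os.path.dirname(path) or "."
    fd, tmp_path = tempfile.mkstemp(dir=directory, suffix=".tmp")
    try:
        with os.fdopen(fd, "w") as handle:
            json.dump(payload, handle)
            handle.flush()
            os.fsync(handle.fileno())  # flush bytes to disk before the rename
        os.replace(tmp_path, path)  # atomically swap the file into place
    except BaseException:
        os.unlink(tmp_path)
        raise
```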

---

## Documentation

- **[API Reference](docs/index.md)** - Detailed API documentation
- **[Examples](examples-simple/)** - Runnable code examples
- **[Extending Backends](docs/EXTENDING_BACKENDS.md)** - Custom storage and execution
- **[API Server](docs/API_SERVER.md)** - Web dashboard and REST API
- **[Comparison Engine](docs/COMPARISON.md)** - Statistical testing guide

---

## Advanced Usage

### Custom Backends

Implement custom storage or execution strategies:

```python
from themis.backends import StorageBackend

class S3StorageBackend(StorageBackend):
    """Store results in AWS S3"""

    def save_generation_record(self, run_id, record):
        # Upload to S3
        pass

    # ... implement other methods

# Use custom backend
result = evaluate(
    benchmark="gsm8k",
    model="gpt-4",
    storage_backend=S3StorageBackend(bucket="my-bucket")
)
```

See [EXTENDING_BACKENDS.md](docs/EXTENDING_BACKENDS.md) for details.

### Distributed Execution

```python
from themis.backends import ExecutionBackend
import ray

class RayExecutionBackend(ExecutionBackend):
    """Distributed execution with Ray"""
    # ... implementation

result = evaluate(
    benchmark="math500",
    model="gpt-4",
    execution_backend=RayExecutionBackend(num_cpus=32)
)
```

### Monitoring & Observability

Connect to the WebSocket endpoint for real-time updates:

```python
import asyncio
import websockets
import json

async def monitor():
    async with websockets.connect("ws://localhost:8080/ws") as ws:
        await ws.send(json.dumps({"type": "subscribe", "run_id": "my-run"}))
        async for message in ws:
            print(json.loads(message))

asyncio.run(monitor())
```

---

## CLI Reference

### Evaluation

```bash
# Basic evaluation
themis eval <benchmark> --model <model> [options]

# Options:
#   --limit N            Evaluate first N samples
#   --temperature FLOAT  Sampling temperature (default: 0.0)
#   --max-tokens INT     Maximum tokens (default: 512)
#   --workers INT        Parallel workers (default: 4)
#   --run-id STR         Run identifier
#   --storage PATH       Storage directory
#   --resume             Resume from cache
#   --output FILE        Export results (.json, .csv, .html)
```

### Comparison

```bash
# Compare two or more runs
themis compare <run-id-1> <run-id-2> [run-id-3...] [options]

# Options:
#   --storage PATH  Storage directory
#   --test STR      Statistical test: t_test, bootstrap, permutation
#   --alpha FLOAT   Significance level (default: 0.05)
#   --output FILE   Export report (.json, .html, .md)
```

### Server

```bash
# Start API server
themis serve [options]

# Options:
#   --port INT      Port (default: 8080)
#   --host STR      Host (default: 127.0.0.1)
#   --storage PATH  Storage directory
#   --reload        Auto-reload (dev mode)
```

### List

```bash
# List available resources
themis list <what>

# Values for <what>:
#   runs        List all experiment runs
#   benchmarks  List available benchmarks
#   metrics     List available metrics
```

---

## Development

### Setup

```bash
# Clone repository
git clone https://github.com/Pittawat2542/themis.git
cd themis

# Install with dev dependencies
uv pip install -e ".[dev,math,nlp,code,server]"

# Run tests
uv run pytest

# Run specific test
uv run pytest tests/comparison/test_statistics.py -v
```

### Project Structure

```
themis/
├── themis/
│   ├── api.py         # Main evaluate() function
│   ├── presets/       # Benchmark presets
│   ├── generation/    # Model inference
│   ├── evaluation/    # Metrics & evaluation
│   ├── comparison/    # Statistical comparison
│   ├── backends/      # Pluggable backends
│   ├── server/        # FastAPI server
│   └── cli/           # CLI commands
├── tests/             # Test suite
├── examples-simple/   # Minimal examples
├── docs/              # Documentation
└── pyproject.toml     # Package configuration
```

### Running Examples

```bash
# Simple quickstart
uv run python examples-simple/01_quickstart.py

# Custom dataset
uv run python examples-simple/02_custom_dataset.py

# Comparison example
uv run python examples-simple/04_comparison.py

# API server example
uv run python examples-simple/05_api_server.py
```

---

## Contributing

Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.

Areas where we'd love help:
- Additional benchmark presets
- New evaluation metrics
- Backend implementations (Ray, S3, etc.)
- Documentation improvements
- Bug reports and feature requests

---

## Citation

If you use Themis in your research, please cite:

```bibtex
@software{themis2024,
  title = {Themis: Modern LLM Evaluation Framework},
  author = {Taveekitworachai, Pittawat},
  year = {2024},
  url = {https://github.com/Pittawat2542/themis}
}
```

---

## License

MIT License - see [LICENSE](LICENSE) for details.

---

## Acknowledgments

- Built on [LiteLLM](https://github.com/BerriAI/litellm) for provider support
- Inspired by [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)
- Statistical methods from established research practices

---

## Support

- **Documentation**: [docs/index.md](docs/index.md)
- **Examples**: [examples-simple/](examples-simple/)
- **Issues**: [GitHub Issues](https://github.com/Pittawat2542/themis/issues)
- **Discussions**: [GitHub Discussions](https://github.com/Pittawat2542/themis/discussions)

---

**Made with ❤️ for the LLM research community**