themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,758 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: themis-eval
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Summary: Lightweight evaluation platform for LLM experiments
|
|
5
|
-
Author: Pittawat Taveekitworachai
|
|
6
|
-
License: MIT
|
|
7
|
-
Project-URL: Resources, https://github.com/Pittawat2542/themis
|
|
8
|
-
Project-URL: Homepage, https://pittawat2542.github.io/themis/
|
|
9
|
-
Classifier: Development Status :: 3 - Alpha
|
|
10
|
-
Classifier: Intended Audience :: Developers
|
|
11
|
-
Classifier: Intended Audience :: Science/Research
|
|
12
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
-
Requires-Python: >=3.12
|
|
16
|
-
Description-Content-Type: text/markdown
|
|
17
|
-
License-File: LICENSE
|
|
18
|
-
Requires-Dist: pydantic>=2.7
|
|
19
|
-
Requires-Dist: cyclopts>=2.9
|
|
20
|
-
Requires-Dist: hydra-core>=1.3
|
|
21
|
-
Requires-Dist: tqdm>=4.67
|
|
22
|
-
Requires-Dist: httpx>=0.27
|
|
23
|
-
Requires-Dist: litellm>=1.79.0
|
|
24
|
-
Requires-Dist: tabulate>=0.9.0
|
|
25
|
-
Requires-Dist: tenacity>=9.1.2
|
|
26
|
-
Requires-Dist: plotly>=6.5.0
|
|
27
|
-
Requires-Dist: math-verify>=0.8.0
|
|
28
|
-
Provides-Extra: dev
|
|
29
|
-
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
30
|
-
Provides-Extra: math
|
|
31
|
-
Requires-Dist: datasets>=2.20.0; extra == "math"
|
|
32
|
-
Requires-Dist: math-verify>=0.8.0; extra == "math"
|
|
33
|
-
Provides-Extra: viz
|
|
34
|
-
Requires-Dist: plotly>=5.18.0; extra == "viz"
|
|
35
|
-
Provides-Extra: docs
|
|
36
|
-
Requires-Dist: mkdocs>=1.6.0; extra == "docs"
|
|
37
|
-
Requires-Dist: mkdocs-material>=9.5.0; extra == "docs"
|
|
38
|
-
Requires-Dist: mkdocstrings[python]>=0.25.0; extra == "docs"
|
|
39
|
-
Dynamic: license-file
|
|
40
|
-
|
|
41
|
-
# Themis
|
|
42
|
-
|
|
43
|
-
> **Lightweight evaluation platform for LLM experiments**
|
|
44
|
-
|
|
45
|
-
Themis orchestrates prompt templates, LLM providers, generation strategies, evaluation metrics, and storage into reproducible, resumable pipelines for systematic LLM experimentation.
|
|
46
|
-
|
|
47
|
-
[Python 3.12+](https://www.python.org/downloads/)
|
|
48
|
-
[License: MIT](https://opensource.org/licenses/MIT)
|
|
49
|
-
|
|
50
|
-
---
|
|
51
|
-
|
|
52
|
-
## Why Themis?
|
|
53
|
-
|
|
54
|
-
- **🎯 Config-driven**: Define experiments in JSON/YAML, run them with a single command
|
|
55
|
-
- **🔄 Resumable**: Automatic caching and resume—never lose your expensive LLM runs
|
|
56
|
-
- **📊 Systematic**: Grid search over models × prompts × sampling strategies
|
|
57
|
-
- **🔌 Provider-agnostic**: Works with 100+ LLM providers via LiteLLM (OpenAI, Anthropic, Azure, AWS Bedrock, Google AI, local LLMs, and more)
|
|
58
|
-
- **📈 Built-in evaluation**: Exact match, math verification, custom metrics
|
|
59
|
-
- **🎓 Production-ready**: Type-safe configs, structured logging, progress tracking
|
|
60
|
-
|
|
61
|
-
---
|
|
62
|
-
|
|
63
|
-
## Table of Contents
|
|
64
|
-
|
|
65
|
-
1. [Installation](#installation)
|
|
66
|
-
2. [Quick Start](#quick-start)
|
|
67
|
-
3. [Examples & Tutorials](#examples--tutorials)
|
|
68
|
-
4. [Core Concepts](#core-concepts)
|
|
69
|
-
5. [CLI Reference](#cli-reference)
|
|
70
|
-
6. [Configuration](#configuration)
|
|
71
|
-
7. [Architecture](#architecture)
|
|
72
|
-
8. [Development](#development)
|
|
73
|
-
9. [Documentation](#documentation)
|
|
74
|
-
|
|
75
|
-
---
|
|
76
|
-
|
|
77
|
-
## Installation
|
|
78
|
-
|
|
79
|
-
### Using uv (Recommended)
|
|
80
|
-
|
|
81
|
-
```bash
|
|
82
|
-
# Clone the repository
|
|
83
|
-
git clone https://github.com/Pittawat2542/themis.git
|
|
84
|
-
cd themis
|
|
85
|
-
|
|
86
|
-
# Install with uv
|
|
87
|
-
uv sync
|
|
88
|
-
|
|
89
|
-
# Verify installation
|
|
90
|
-
uv run python -m themis.cli --version
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
### Using pip
|
|
94
|
-
|
|
95
|
-
```bash
|
|
96
|
-
# Basic installation
|
|
97
|
-
pip install -e .
|
|
98
|
-
|
|
99
|
-
# With development tools
|
|
100
|
-
pip install -e ".[dev]"
|
|
101
|
-
|
|
102
|
-
# With math evaluation support
|
|
103
|
-
pip install -e ".[math]"
|
|
104
|
-
|
|
105
|
-
# Full installation
|
|
106
|
-
pip install -e ".[dev,math]"
|
|
107
|
-
```
|
|
108
|
-
|
|
109
|
-
### Requirements
|
|
110
|
-
|
|
111
|
-
- Python 3.12+
|
|
112
|
-
- Dependencies: `pydantic`, `cyclopts`, `hydra-core`, `tqdm`, `httpx`, `litellm`, `tabulate`, `tenacity`, `plotly`, `math-verify`
|
|
113
|
-
|
|
114
|
-
---
|
|
115
|
-
|
|
116
|
-
## Quick Start
|
|
117
|
-
|
|
118
|
-
### 1. Explore Available Components
|
|
119
|
-
|
|
120
|
-
See what's available in your installation:
|
|
121
|
-
|
|
122
|
-
```bash
|
|
123
|
-
# Show system info and quick start
|
|
124
|
-
uv run python -m themis.cli info
|
|
125
|
-
|
|
126
|
-
# List available LLM providers
|
|
127
|
-
uv run python -m themis.cli list-providers --verbose
|
|
128
|
-
|
|
129
|
-
# List available benchmarks
|
|
130
|
-
uv run python -m themis.cli list-benchmarks --verbose
|
|
131
|
-
```
|
|
132
|
-
|
|
133
|
-
### 2. Run the Built-in Demo
|
|
134
|
-
|
|
135
|
-
Test your installation with the fake model provider:
|
|
136
|
-
|
|
137
|
-
```bash
|
|
138
|
-
# Run a quick smoke test
|
|
139
|
-
uv run python -m themis.cli demo
|
|
140
|
-
|
|
141
|
-
# See what's happening
|
|
142
|
-
uv run python -m themis.cli demo --log-level info
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
This runs two inline math prompts through a fake LLM provider to verify the pipeline works end-to-end.
|
|
146
|
-
|
|
147
|
-
### 3. Try Your First Real Experiment
|
|
148
|
-
|
|
149
|
-
Start with the comprehensive examples cookbook:
|
|
150
|
-
|
|
151
|
-
```bash
|
|
152
|
-
# Your first experiment (15 minutes)
|
|
153
|
-
uv run python -m examples.getting_started.cli run
|
|
154
|
-
|
|
155
|
-
# Preview what will happen
|
|
156
|
-
uv run python -m examples.getting_started.cli run --dry-run
|
|
157
|
-
|
|
158
|
-
# Export results
|
|
159
|
-
uv run python -m examples.getting_started.cli run --csv-output results.csv
|
|
160
|
-
```
|
|
161
|
-
|
|
162
|
-
### 4. Connect to a Real LLM
|
|
163
|
-
|
|
164
|
-
Use any OpenAI-compatible endpoint (LM Studio, Ollama, vLLM, OpenAI):
|
|
165
|
-
|
|
166
|
-
```bash
|
|
167
|
-
# Generate a config file
|
|
168
|
-
uv run python -m themis.cli init --template math500 --output my_config.yaml
|
|
169
|
-
|
|
170
|
-
# Edit my_config.yaml: set base_url, api_key, model name
|
|
171
|
-
# Then validate it
|
|
172
|
-
uv run python -m themis.cli validate-config --config my_config.yaml
|
|
173
|
-
|
|
174
|
-
# Run the experiment
|
|
175
|
-
uv run python -m themis.cli run-config --config my_config.yaml
|
|
176
|
-
```
|
|
177
|
-
|
|
178
|
-
---
|
|
179
|
-
|
|
180
|
-
## Examples & Tutorials
|
|
181
|
-
|
|
182
|
-
**👉 Start here: [`examples/README.md`](examples/README.md)**
|
|
183
|
-
|
|
184
|
-
A comprehensive, hands-on cookbook with 5 progressive tutorials:
|
|
185
|
-
|
|
186
|
-
### Learning Path
|
|
187
|
-
|
|
188
|
-
| Example | Focus | Time | What You'll Learn |
|
|
189
|
-
|---------|-------|------|-------------------|
|
|
190
|
-
| **[getting_started](examples/getting_started/)** | Basics | 15 min | Prompts, models, sampling, evaluation |
|
|
191
|
-
| **[config_file](examples/config_file/)** | Configuration | 20 min | JSON configs, grid searches, resumability |
|
|
192
|
-
| **[prompt_engineering](examples/prompt_engineering/)** | Prompt Strategies | 25 min | Zero-shot, few-shot, chain-of-thought, systematic comparison |
|
|
193
|
-
| **[projects](examples/projects/)** | Organization | 45 min | Multi-experiment projects, research workflows |
|
|
194
|
-
| **[advanced](examples/advanced/)** | Customization | 60 min | Custom runners, pipelines, metrics, agentic workflows |
|
|
195
|
-
|
|
196
|
-
### Quick Reference
|
|
197
|
-
|
|
198
|
-
**🚀 [COOKBOOK.md](COOKBOOK.md)** - Cheat sheet with common patterns and troubleshooting
|
|
199
|
-
|
|
200
|
-
**Example commands:**
|
|
201
|
-
```bash
|
|
202
|
-
# Basic experiment
|
|
203
|
-
uv run python -m examples.getting_started.cli run
|
|
204
|
-
|
|
205
|
-
# Grid search (2 models × 3 temperatures)
|
|
206
|
-
uv run python -m examples.config_file.cli run --config-path grid_search.json
|
|
207
|
-
|
|
208
|
-
# Prompt engineering experiment
|
|
209
|
-
uv run python -m examples.prompt_engineering.cli run
|
|
210
|
-
|
|
211
|
-
# Multi-experiment project
|
|
212
|
-
uv run python -m examples.projects.cli list-experiments
|
|
213
|
-
uv run python -m examples.projects.cli run --experiment zero-shot
|
|
214
|
-
|
|
215
|
-
# Custom behavior
|
|
216
|
-
uv run python -m examples.advanced.cli run --enable-subject-breakdown
|
|
217
|
-
```
|
|
218
|
-
|
|
219
|
-
---
|
|
220
|
-
|
|
221
|
-
## Core Concepts
|
|
222
|
-
|
|
223
|
-
### Three-Layer Architecture
|
|
224
|
-
|
|
225
|
-
```
|
|
226
|
-
╭─────────────────────────────────────────────────────────╮
|
|
227
|
-
│ Configuration Layer (JSON/YAML/CLI) │
|
|
228
|
-
│ • Dataset specs • Models • Sampling • Storage │
|
|
229
|
-
╰─────────────────────────────────────────────────────────╯
|
|
230
|
-
↓
|
|
231
|
-
╭─────────────────────────────────────────────────────────╮
|
|
232
|
-
│ Experiment Layer (Orchestration) │
|
|
233
|
-
│ • Builder patterns • Runner coordination │
|
|
234
|
-
│ • Progress tracking • Caching & resume │
|
|
235
|
-
╰─────────────────────────────────────────────────────────╯
|
|
236
|
-
↓
|
|
237
|
-
╭───────────────────────╮ ╭────────────────────────────╮
|
|
238
|
-
│ Generation Domain │ │ Evaluation Domain │
|
|
239
|
-
│ • Prompts │───▶│ • Extractors (JSON/math) │
|
|
240
|
-
│ • Providers │ │ • Metrics (exact/custom) │
|
|
241
|
-
│ • Sampling plans │ │ • Aggregation │
|
|
242
|
-
│ • Retry logic │ │ │
|
|
243
|
-
╰───────────────────────╯ ╰────────────────────────────╯
|
|
244
|
-
```
|
|
245
|
-
|
|
246
|
-
### Key Components
|
|
247
|
-
|
|
248
|
-
- **Generation**: Prompt templates → Provider routing → Sampling strategies → Retry/backoff
|
|
249
|
-
- **Evaluation**: Response extraction → Metric computation → Multi-attempt scoring → Aggregation
|
|
250
|
-
- **Experiment**: Dataset loading → Generation plans → Runner execution → Storage → Reporting
|
|
251
|
-
|
|
252
|
-
See [`docs/DIAGRAM.md`](docs/DIAGRAM.md) for detailed architecture diagrams.
|
|
253
|
-
|
|
254
|
-
---
|
|
255
|
-
|
|
256
|
-
## CLI Reference
|
|
257
|
-
|
|
258
|
-
Themis provides experiment commands and utility commands:
|
|
259
|
-
|
|
260
|
-
### Utility Commands
|
|
261
|
-
|
|
262
|
-
Quick commands for discovering and configuring Themis:
|
|
263
|
-
|
|
264
|
-
#### `info` - System Information
|
|
265
|
-
|
|
266
|
-
Show installed components and quick start guide:
|
|
267
|
-
|
|
268
|
-
```bash
|
|
269
|
-
uv run python -m themis.cli info
|
|
270
|
-
```
|
|
271
|
-
|
|
272
|
-
Displays:
|
|
273
|
-
- Version and Python info
|
|
274
|
-
- Installed providers
|
|
275
|
-
- Available benchmarks
|
|
276
|
-
- Example locations
|
|
277
|
-
- Documentation links
|
|
278
|
-
|
|
279
|
-
#### `list-providers` - Available LLM Providers
|
|
280
|
-
|
|
281
|
-
List all registered LLM providers:
|
|
282
|
-
|
|
283
|
-
```bash
|
|
284
|
-
uv run python -m themis.cli list-providers
|
|
285
|
-
|
|
286
|
-
# Show detailed information
|
|
287
|
-
uv run python -m themis.cli list-providers --verbose
|
|
288
|
-
```
|
|
289
|
-
|
|
290
|
-
Shows built-in providers (fake, litellm, vllm) and any custom registered providers.
|
|
291
|
-
|
|
292
|
-
**Note:** The `litellm` provider supports 100+ LLM providers including OpenAI, Anthropic, Azure OpenAI, AWS Bedrock, Google AI, Cohere, and many more. See [docs/LITELLM_PROVIDER.md](docs/LITELLM_PROVIDER.md) for details.
|
|
293
|
-
|
|
294
|
-
#### `list-benchmarks` - Available Datasets
|
|
295
|
-
|
|
296
|
-
List available datasets and benchmarks:
|
|
297
|
-
|
|
298
|
-
```bash
|
|
299
|
-
uv run python -m themis.cli list-benchmarks
|
|
300
|
-
|
|
301
|
-
# Show details with subjects and commands
|
|
302
|
-
uv run python -m themis.cli list-benchmarks --verbose
|
|
303
|
-
```
|
|
304
|
-
|
|
305
|
-
Shows math500, competition math benchmarks (aime24/aime25/amc23/olympiadbench/beyondaime), supergpqa, mmlu-pro, demo, and inline dataset options.
|
|
306
|
-
|
|
307
|
-
#### `init` - Generate Config File
|
|
308
|
-
|
|
309
|
-
Create a sample configuration file:
|
|
310
|
-
|
|
311
|
-
```bash
|
|
312
|
-
# Basic template
|
|
313
|
-
uv run python -m themis.cli init
|
|
314
|
-
|
|
315
|
-
# MATH-500 with OpenAI-compatible endpoint
|
|
316
|
-
uv run python -m themis.cli init --template math500 --output my_config.yaml
|
|
317
|
-
|
|
318
|
-
# Inline dataset template
|
|
319
|
-
uv run python -m themis.cli init --template inline --output custom.yaml
|
|
320
|
-
```
|
|
321
|
-
|
|
322
|
-
Available templates:
|
|
323
|
-
- `basic` - Simple fake provider setup for testing
|
|
324
|
-
- `math500` - MATH-500 benchmark with OpenAI-compatible provider
|
|
325
|
-
- `inline` - Custom inline dataset with examples
|
|
326
|
-
|
|
327
|
-
#### `validate-config` - Validate Configuration
|
|
328
|
-
|
|
329
|
-
Check a config file for errors without running:
|
|
330
|
-
|
|
331
|
-
```bash
|
|
332
|
-
uv run python -m themis.cli validate-config --config my_config.yaml
|
|
333
|
-
```
|
|
334
|
-
|
|
335
|
-
Shows parsed configuration and identifies errors before running expensive experiments.
|
|
336
|
-
|
|
337
|
-
#### `new-project` - Create a New Project
|
|
338
|
-
|
|
339
|
-
Scaffold a new project directory:
|
|
340
|
-
|
|
341
|
-
```bash
|
|
342
|
-
uv run python -m themis.cli new-project --project-name my-new-project
|
|
343
|
-
```
|
|
344
|
-
|
|
345
|
-
This will create a new directory called `my-new-project` with a basic project structure, including a sample configuration file, a CLI script, and a README.
|
|
346
|
-
|
|
347
|
-
---
|
|
348
|
-
|
|
349
|
-
### Experiment Commands
|
|
350
|
-
|
|
351
|
-
Commands for running experiments:
|
|
352
|
-
|
|
353
|
-
#### `demo` - Built-in Smoke Test
|
|
354
|
-
|
|
355
|
-
Quick verification using fake providers:
|
|
356
|
-
|
|
357
|
-
```bash
|
|
358
|
-
uv run python -m themis.cli demo [OPTIONS]
|
|
359
|
-
|
|
360
|
-
Options:
|
|
361
|
-
--log-level TEXT Logging verbosity: critical|error|warning|info|debug|trace
|
|
362
|
-
--help Show help message
|
|
363
|
-
```
|
|
364
|
-
|
|
365
|
-
#### `math500` - MATH-500 Benchmark
|
|
366
|
-
|
|
367
|
-
Zero-shot evaluation on the MATH-500 dataset:
|
|
368
|
-
|
|
369
|
-
```bash
|
|
370
|
-
uv run python -m themis.cli math500 [OPTIONS]
|
|
371
|
-
|
|
372
|
-
Options:
|
|
373
|
-
--source TEXT Dataset source: 'huggingface' (default) or 'local'
|
|
374
|
-
--data-dir PATH Local MATH-500 directory (if --source local)
|
|
375
|
-
--limit INTEGER Limit number of samples (for testing)
|
|
376
|
-
--storage PATH Cache directory (default: .cache/themis)
|
|
377
|
-
--run-id TEXT Unique run identifier for resumability
|
|
378
|
-
--temperature FLOAT Model temperature (default: 0.0)
|
|
379
|
-
--log-level TEXT Logging verbosity
|
|
380
|
-
--resume / --no-resume Resume from cache (default: true)
|
|
381
|
-
|
|
382
|
-
Examples:
|
|
383
|
-
# Quick test with 50 samples
|
|
384
|
-
uv run python -m themis.cli math500 --limit 50
|
|
385
|
-
|
|
386
|
-
# Full evaluation with custom storage
|
|
387
|
-
uv run python -m themis.cli math500 \
|
|
388
|
-
--storage .cache/math500-eval \
|
|
389
|
-
--run-id run-2024-01-15 \
|
|
390
|
-
--temperature 0.0
|
|
391
|
-
|
|
392
|
-
# Use local MATH-500 dataset
|
|
393
|
-
uv run python -m themis.cli math500 \
|
|
394
|
-
--source local \
|
|
395
|
-
--data-dir /path/to/MATH-500 \
|
|
396
|
-
--limit 100
|
|
397
|
-
```
|
|
398
|
-
|
|
399
|
-
#### Competition Math Benchmarks (`aime24`, `aime25`, `amc23`, `olympiadbench`, `beyondaime`)
|
|
400
|
-
|
|
401
|
-
Zero-shot evaluation across multiple competition datasets:
|
|
402
|
-
|
|
403
|
-
```bash
|
|
404
|
-
uv run python -m themis.cli <benchmark> [OPTIONS]
|
|
405
|
-
|
|
406
|
-
Benchmarks:
|
|
407
|
-
aime24, aime25, amc23, olympiadbench, beyondaime
|
|
408
|
-
|
|
409
|
-
Options:
|
|
410
|
-
--source TEXT Dataset source: 'huggingface' (default) or 'local'
|
|
411
|
-
--split TEXT Dataset split to load (default: test)
|
|
412
|
-
--data-dir PATH Local dataset directory (if --source local)
|
|
413
|
-
--limit INTEGER Limit number of samples (for smoke tests)
|
|
414
|
-
--subjects TEXT... Optional subject filters (if provided by the dataset)
|
|
415
|
-
--storage PATH Cache directory for generations
|
|
416
|
-
--run-id TEXT Unique run identifier for resumability
|
|
417
|
-
--temperature FLOAT Model temperature (default: 0.0)
|
|
418
|
-
--log-level TEXT Logging verbosity
|
|
419
|
-
--resume / --no-resume Resume from cache (default: true)
|
|
420
|
-
```
|
|
421
|
-
|
|
422
|
-
#### `supergpqa` - SuperGPQA Benchmark
|
|
423
|
-
|
|
424
|
-
Multiple-choice evaluation on the SuperGPQA dataset:
|
|
425
|
-
|
|
426
|
-
```bash
|
|
427
|
-
uv run python -m themis.cli supergpqa [OPTIONS]
|
|
428
|
-
|
|
429
|
-
Options:
|
|
430
|
-
--source TEXT Dataset source: 'huggingface' (default) or 'local'
|
|
431
|
-
--split TEXT Dataset split to load (default: test)
|
|
432
|
-
--data-dir PATH Local dataset directory (if --source local)
|
|
433
|
-
--limit INTEGER Limit number of samples (for quick smoke tests)
|
|
434
|
-
--subjects TEXT... Optional category filters
|
|
435
|
-
--storage PATH Cache directory for generations
|
|
436
|
-
--run-id TEXT Unique run identifier for resumability
|
|
437
|
-
--temperature FLOAT Model temperature (default: 0.0)
|
|
438
|
-
--log-level TEXT Logging verbosity
|
|
439
|
-
--resume / --no-resume Resume from cache (default: true)
|
|
440
|
-
```
|
|
441
|
-
|
|
442
|
-
#### `mmlu-pro` - MMLU-Pro Benchmark
|
|
443
|
-
|
|
444
|
-
Professional-level multiple-choice evaluation with refined distractors:
|
|
445
|
-
|
|
446
|
-
```bash
|
|
447
|
-
uv run python -m themis.cli mmlu-pro [OPTIONS]
|
|
448
|
-
|
|
449
|
-
Options:
|
|
450
|
-
--source TEXT Dataset source: 'huggingface' (default) or 'local'
|
|
451
|
-
--split TEXT Dataset split to load (default: test)
|
|
452
|
-
--data-dir PATH Local dataset directory (if --source local)
|
|
453
|
-
--limit INTEGER Limit number of samples
|
|
454
|
-
--subjects TEXT... Optional subject filters
|
|
455
|
-
--storage PATH Cache directory for generations
|
|
456
|
-
--run-id TEXT Unique run identifier for resumability
|
|
457
|
-
--temperature FLOAT Model temperature (default: 0.0)
|
|
458
|
-
--log-level TEXT Logging verbosity
|
|
459
|
-
--resume / --no-resume Resume from cache (default: true)
|
|
460
|
-
```
|
|
461
|
-
|
|
462
|
-
#### `run-config` - Config-Driven Experiments
|
|
463
|
-
|
|
464
|
-
Execute experiments defined in YAML config files:
|
|
465
|
-
|
|
466
|
-
```bash
|
|
467
|
-
uv run python -m themis.cli run-config [OPTIONS]
|
|
468
|
-
|
|
469
|
-
Options:
|
|
470
|
-
--config PATH Path to YAML configuration file
|
|
471
|
-
--overrides TEXT Hydra-style overrides (space-separated)
|
|
472
|
-
--log-level TEXT Logging verbosity
|
|
473
|
-
|
|
474
|
-
Examples:
|
|
475
|
-
# Generate a config file first
|
|
476
|
-
uv run python -m themis.cli init --output my_config.yaml
|
|
477
|
-
|
|
478
|
-
# Run from config
|
|
479
|
-
uv run python -m themis.cli run-config --config my_config.yaml
|
|
480
|
-
|
|
481
|
-
# Override specific parameters
|
|
482
|
-
uv run python -m themis.cli run-config \
|
|
483
|
-
--config my_config.yaml \
|
|
484
|
-
--overrides "generation.sampling.temperature=0.2 max_samples=100"
|
|
485
|
-
|
|
486
|
-
# Multiple overrides
|
|
487
|
-
uv run python -m themis.cli run-config \
|
|
488
|
-
--config my_config.yaml \
|
|
489
|
-
--overrides "storage.run_id=new-run dataset.limit=50" \
|
|
490
|
-
--log-level debug
|
|
491
|
-
```
|
|
492
|
-
|
|
493
|
-
---
|
|
494
|
-
|
|
495
|
-
## Configuration
|
|
496
|
-
|
|
497
|
-
### Config File Structure
|
|
498
|
-
|
|
499
|
-
Themis uses JSON or YAML for configuration. Here's a complete example:
|
|
500
|
-
|
|
501
|
-
```json
|
|
502
|
-
{
|
|
503
|
-
"run_id": "my-experiment",
|
|
504
|
-
"storage_dir": ".cache/my-experiment",
|
|
505
|
-
"resume": true,
|
|
506
|
-
"models": [
|
|
507
|
-
{
|
|
508
|
-
"name": "gpt-4",
|
|
509
|
-
"provider": "litellm",
|
|
510
|
-
"provider_options": {
|
|
511
|
-
"api_key": "sk-...",
|
|
512
|
-
"timeout": 60,
|
|
513
|
-
"n_parallel": 10
|
|
514
|
-
}
|
|
515
|
-
},
|
|
516
|
-
{
|
|
517
|
-
"name": "claude-3-opus-20240229",
|
|
518
|
-
"provider": "anthropic",
|
|
519
|
-
"provider_options": {
|
|
520
|
-
"timeout": 120
|
|
521
|
-
}
|
|
522
|
-
}
|
|
523
|
-
],
|
|
524
|
-
"samplings": [
|
|
525
|
-
{"name": "greedy", "temperature": 0.0, "max_tokens": 512},
|
|
526
|
-
{"name": "creative", "temperature": 0.8, "max_tokens": 512}
|
|
527
|
-
],
|
|
528
|
-
"datasets": [
|
|
529
|
-
{
|
|
530
|
-
"name": "math500",
|
|
531
|
-
"kind": "math500_hf",
|
|
532
|
-
"limit": 50
|
|
533
|
-
}
|
|
534
|
-
]
|
|
535
|
-
}
|
|
536
|
-
```
|
|
537
|
-
|
|
538
|
-
### Configuration Options
|
|
539
|
-
|
|
540
|
-
**Core settings:**
|
|
541
|
-
- `run_id`: Unique identifier for caching and resumability
|
|
542
|
-
- `storage_dir`: Where to cache generations and results
|
|
543
|
-
- `resume`: Continue from previous runs (default: true)
|
|
544
|
-
|
|
545
|
-
**Models:**
|
|
546
|
-
- `name`: Model identifier
|
|
547
|
-
- `provider`: `litellm`, `openai`, `anthropic`, `azure`, `bedrock`, `fake`, or custom
|
|
548
|
-
- `provider_options`: Provider-specific configuration (API keys, endpoints, timeouts)
|
|
549
|
-
|
|
550
|
-
**Samplings:**
|
|
551
|
-
- `name`: Sampling strategy name
|
|
552
|
-
- `temperature`: Randomness (0.0 = deterministic, 1.0+ = creative)
|
|
553
|
-
- `max_tokens`: Maximum response length
|
|
554
|
-
- `top_p`, `top_k`: Nucleus/top-k sampling (optional)
|
|
555
|
-
|
|
556
|
-
**Datasets:**
|
|
557
|
-
- `name`: Dataset identifier
|
|
558
|
-
- `kind`: `demo`, `math500_hf`, `math500_local`, `inline`, or custom
|
|
559
|
-
- `limit`: Maximum samples (for testing)
|
|
560
|
-
- `source_path`: Path for local datasets (optional)
|
|
561
|
-
|
|
562
|
-
See [`docs/CONFIGURATION.md`](docs/CONFIGURATION.md) for the complete schema and [`docs/EXAMPLES.md`](docs/EXAMPLES.md) for common recipes.
|
|
563
|
-
|
|
564
|
-
---
|
|
565
|
-
|
|
566
|
-
## Architecture
|
|
567
|
-
|
|
568
|
-
### Module Organization
|
|
569
|
-
|
|
570
|
-
```
|
|
571
|
-
themis/
|
|
572
|
-
├── cli/ # Command-line interface (Cyclopts)
|
|
573
|
-
├── config/ # Configuration schema & loader (Pydantic, Hydra)
|
|
574
|
-
├── core/ # Core entities (prompts, sampling specs, results)
|
|
575
|
-
├── datasets/ # Dataset loaders (inline, HuggingFace, local)
|
|
576
|
-
├── evaluation/ # Extractors, metrics, evaluation strategies
|
|
577
|
-
├── experiment/ # Orchestration, builder patterns, storage
|
|
578
|
-
├── generation/ # Generation strategies, runners, retry logic
|
|
579
|
-
├── interfaces/ # Abstract base classes
|
|
580
|
-
├── project/ # Multi-experiment project management
|
|
581
|
-
├── providers/ # LLM provider implementations
|
|
582
|
-
└── utils/ # Logging, progress tracking, helpers
|
|
583
|
-
```
|
|
584
|
-
|
|
585
|
-
### Extension Points
|
|
586
|
-
|
|
587
|
-
Themis is designed for extensibility:
|
|
588
|
-
|
|
589
|
-
- **Custom providers**: Implement `Provider` interface for new LLM APIs
|
|
590
|
-
- **Custom datasets**: Implement `DatasetLoader` for new data sources
|
|
591
|
-
- **Custom metrics**: Implement `Metric` interface for domain-specific evaluation
|
|
592
|
-
- **Custom runners**: Override generation loops for specialized workflows
|
|
593
|
-
- **Custom pipelines**: Build evaluation pipelines with custom extractors
|
|
594
|
-
|
|
595
|
-
See [`docs/ADDING_COMPONENTS.md`](docs/ADDING_COMPONENTS.md) for detailed extension guides and [`examples/advanced/`](examples/advanced/) for working examples.
|
|
596
|
-
|
|
597
|
-
---
|
|
598
|
-
|
|
599
|
-
## Development
|
|
600
|
-
|
|
601
|
-
### Running Tests
|
|
602
|
-
|
|
603
|
-
```bash
|
|
604
|
-
# Full test suite
|
|
605
|
-
uv run pytest
|
|
606
|
-
|
|
607
|
-
# Specific test file
|
|
608
|
-
uv run pytest tests/generation/test_strategies.py
|
|
609
|
-
|
|
610
|
-
# With coverage
|
|
611
|
-
uv run pytest --cov=themis --cov-report=html
|
|
612
|
-
|
|
613
|
-
# Verbose output
|
|
614
|
-
uv run pytest -v
|
|
615
|
-
```
|
|
616
|
-
|
|
617
|
-
### Project Commands
|
|
618
|
-
|
|
619
|
-
```bash
|
|
620
|
-
# Smoke test core CLI
|
|
621
|
-
uv run python -m themis.cli demo
|
|
622
|
-
|
|
623
|
-
# Test example pipelines
|
|
624
|
-
uv run python -m examples.getting_started.cli run --dry-run
|
|
625
|
-
|
|
626
|
-
# Run with specific storage (keeps cache local)
|
|
627
|
-
uv run python -m themis.cli math500 \
|
|
628
|
-
--storage .cache/dev-test \
|
|
629
|
-
--limit 5
|
|
630
|
-
```
|
|
631
|
-
|
|
632
|
-
### Code Style
|
|
633
|
-
|
|
634
|
-
- Python 3.12+, PEP 8 (4-space indent)
|
|
635
|
-
- Type hints throughout (mypy-compatible)
|
|
636
|
-
- Dataclasses and Pydantic models for configs/entities
|
|
637
|
-
- File names: `snake_case`
|
|
638
|
-
- Classes: `PascalCase`
|
|
639
|
-
- CLI commands: `dashed-names` (handled by Cyclopts)
|
|
640
|
-
|
|
641
|
-
### Contributing
|
|
642
|
-
|
|
643
|
-
1. Fork the repository
|
|
644
|
-
2. Create a feature branch
|
|
645
|
-
3. Make your changes with tests
|
|
646
|
-
4. Run `uv run pytest` to verify
|
|
647
|
-
5. Submit a pull request with:
|
|
648
|
-
- Summary of changes
|
|
649
|
-
- Test evidence
|
|
650
|
-
- Documentation updates (if applicable)
|
|
651
|
-
- Links to related issues
|
|
652
|
-
|
|
653
|
-
---
|
|
654
|
-
|
|
655
|
-
## Documentation
|
|
656
|
-
|
|
657
|
-
### Core Documentation
|
|
658
|
-
|
|
659
|
-
- **[examples/README.md](examples/README.md)** - Comprehensive tutorial cookbook (START HERE!)
|
|
660
|
-
- **[COOKBOOK.md](COOKBOOK.md)** - Quick reference and cheat sheet
|
|
661
|
-
- **[docs/CONFIGURATION.md](docs/CONFIGURATION.md)** - Complete configuration schema
|
|
662
|
-
- **[docs/ADDING_COMPONENTS.md](docs/ADDING_COMPONENTS.md)** - Extension guide
|
|
663
|
-
- **[docs/DIAGRAM.md](docs/DIAGRAM.md)** - Architecture diagrams
|
|
664
|
-
- **[docs/EXAMPLES.md](docs/EXAMPLES.md)** - Additional recipes and patterns
|
|
665
|
-
- **[AGENTS.md](AGENTS.md)** - Repository guidelines for AI agents
|
|
666
|
-
|
|
667
|
-
### Key Features
|
|
668
|
-
|
|
669
|
-
✅ **Resumability**: Automatic caching by `run_id`—interrupted runs pick up where they left off
|
|
670
|
-
✅ **Grid Search**: Cartesian product over models × samplings × prompts
|
|
671
|
-
✅ **Progress Tracking**: tqdm progress bars and structured logging
|
|
672
|
-
✅ **Type Safety**: Pydantic validation for configs and runtime entities
|
|
673
|
-
✅ **Provider Agnostic**: Unified interface for 100+ providers via LiteLLM (OpenAI, Anthropic, Azure, Bedrock, local LLMs, etc.)
|
|
674
|
-
✅ **Math Evaluation**: Built-in math-verify integration for numeric correctness
|
|
675
|
-
✅ **Export Options**: CSV, JSON, HTML output formats
|
|
676
|
-
✅ **Multi-Experiment Projects**: Organize related experiments with shared configs
|
|
677
|
-
✅ **Integrations**: Log results to Weights & Biases and upload artifacts to Hugging Face Hub
|
|
678
|
-
|
|
679
|
-
---
|
|
680
|
-
|
|
681
|
-
## Use Cases
|
|
682
|
-
|
|
683
|
-
### Academic Research
|
|
684
|
-
- Systematic prompt engineering experiments
|
|
685
|
-
- Model comparison studies
|
|
686
|
-
- Benchmark evaluations (MATH, GSM8K, etc.)
|
|
687
|
-
- Reproducible experiment pipelines
|
|
688
|
-
|
|
689
|
-
### LLM Development
|
|
690
|
-
- Testing model variants during training
|
|
691
|
-
- Evaluating fine-tuned models
|
|
692
|
-
- Comparing sampling strategies
|
|
693
|
-
- A/B testing prompts and templates
|
|
694
|
-
|
|
695
|
-
### Production Monitoring
|
|
696
|
-
- Regression testing for model updates
|
|
697
|
-
- Quality assurance on real-world examples
|
|
698
|
-
- Performance benchmarking
|
|
699
|
-
- Cost/latency analysis
|
|
700
|
-
|
|
701
|
-
---
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
---
|
|
705
|
-
|
|
706
|
-
## Citation
|
|
707
|
-
|
|
708
|
-
If you use Themis in your research, please cite it using the following metadata:
|
|
709
|
-
|
|
710
|
-
```bibtex
|
|
711
|
-
@software{themis2025,
|
|
712
|
-
author = {Pittawat Taveekitworachai},
|
|
713
|
-
title = {Themis: Lightweight evaluation platform for LLM experiments},
|
|
714
|
-
year = {2025},
|
|
715
|
-
url = {https://github.com/Pittawat2542/themis}
|
|
716
|
-
}
|
|
717
|
-
```
|
|
718
|
-
|
|
719
|
-
See [`CITATION.cff`](CITATION.cff) for more details.
|
|
720
|
-
|
|
721
|
-
---
|
|
722
|
-
|
|
723
|
-
## Contributing
|
|
724
|
-
|
|
725
|
-
We welcome contributions! Please see [`CONTRIBUTING.md`](CONTRIBUTING.md) for details on how to get started, run tests, and submit pull requests.
|
|
726
|
-
|
|
727
|
-
---
|
|
728
|
-
|
|
729
|
-
## License
|
|
730
|
-
|
|
731
|
-
MIT License - see [LICENSE](LICENSE) file for details
|
|
732
|
-
|
|
733
|
-
---
|
|
734
|
-
|
|
735
|
-
## Getting Help
|
|
736
|
-
|
|
737
|
-
- **Start with examples**: [`examples/README.md`](examples/README.md) has comprehensive tutorials
|
|
738
|
-
- **Quick reference**: [`COOKBOOK.md`](COOKBOOK.md) for common patterns
|
|
739
|
-
- **Check docs**: [`docs/`](docs/) directory has detailed guides
|
|
740
|
-
- **Search issues**: Look for similar problems in GitHub issues
|
|
741
|
-
- **Create issue**: Open a new issue with your question or bug report
|
|
742
|
-
|
|
743
|
-
---
|
|
744
|
-
|
|
745
|
-
## Roadmap
|
|
746
|
-
|
|
747
|
-
- [x] Universal provider support via LiteLLM (100+ providers)
|
|
748
|
-
- [ ] Web UI for experiment visualization
|
|
749
|
-
- [ ] Distributed generation support
|
|
750
|
-
- [ ] Advanced metrics (BLEU, ROUGE, semantic similarity)
|
|
751
|
-
- [ ] Experiment comparison dashboard
|
|
752
|
-
- [ ] Cost tracking and optimization
|
|
753
|
-
|
|
754
|
-
---
|
|
755
|
-
|
|
756
|
-
**Happy experimenting!** 🚀
|
|
757
|
-
|
|
758
|
-
For a guided introduction, start with the [examples](examples/README.md). For quick lookups, check the [cookbook](COOKBOOK.md).
|