themis-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
themis/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ """Themis experiment platform."""
2
+
3
+ from themis import config, core, evaluation, experiment, generation, project
4
+ from themis._version import __version__
5
+
6
+ __all__ = [
7
+ "config",
8
+ "core",
9
+ "evaluation",
10
+ "experiment",
11
+ "generation",
12
+ "project",
13
+ "__version__",
14
+ ]
themis/_version.py ADDED
@@ -0,0 +1,17 @@
1
+ """Package version helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from importlib import metadata
6
+
7
+
8
def _detect_version(dist_name: str = "themis-eval") -> str:
    """Return the installed version of this project's distribution.

    The wheel is published under the distribution name ``themis-eval`` while
    the importable package is ``themis``. ``importlib.metadata`` resolves
    versions by *distribution* name, so querying ``"themis"`` never matches
    an installed ``themis-eval`` wheel and the fallback would always be used.

    Args:
        dist_name: Distribution name to look up. Defaults to the published
            name, ``"themis-eval"``.

    Returns:
        The version string recorded in the installed distribution metadata,
        or ``"0.0.0"`` when no such distribution is installed (for example
        when running from a source checkout that was never installed).
    """
    try:
        return metadata.version(dist_name)
    except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
        # Not installed as a distribution: report a placeholder instead of
        # failing at import time.
        return "0.0.0"
13
+
14
+
15
+ __version__ = _detect_version()
16
+
17
+ __all__ = ["__version__"]
themis/py.typed ADDED
File without changes
@@ -0,0 +1,758 @@
1
+ Metadata-Version: 2.4
2
+ Name: themis-eval
3
+ Version: 0.1.0
4
+ Summary: Lightweight evaluation platform for LLM experiments
5
+ Author: Pittawat Taveekitworachai
6
+ License: MIT
7
+ Project-URL: Resources, https://github.com/Pittawat2542/themis
8
+ Project-URL: Homepage, https://pittawat2542.github.io/themis/
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: >=3.12
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: pydantic>=2.7
19
+ Requires-Dist: cyclopts>=2.9
20
+ Requires-Dist: hydra-core>=1.3
21
+ Requires-Dist: tqdm>=4.67
22
+ Requires-Dist: httpx>=0.27
23
+ Requires-Dist: litellm>=1.79.0
24
+ Requires-Dist: tabulate>=0.9.0
25
+ Requires-Dist: tenacity>=9.1.2
26
+ Requires-Dist: plotly>=6.5.0
27
+ Requires-Dist: math-verify>=0.8.0
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=8.0; extra == "dev"
30
+ Provides-Extra: math
31
+ Requires-Dist: datasets>=2.20.0; extra == "math"
32
+ Requires-Dist: math-verify>=0.8.0; extra == "math"
33
+ Provides-Extra: viz
34
+ Requires-Dist: plotly>=5.18.0; extra == "viz"
35
+ Provides-Extra: docs
36
+ Requires-Dist: mkdocs>=1.6.0; extra == "docs"
37
+ Requires-Dist: mkdocs-material>=9.5.0; extra == "docs"
38
+ Requires-Dist: mkdocstrings[python]>=0.25.0; extra == "docs"
39
+ Dynamic: license-file
40
+
41
+ # Themis
42
+
43
+ > **Lightweight evaluation platform for LLM experiments**
44
+
45
+ Themis orchestrates prompt templates, LLM providers, generation strategies, evaluation metrics, and storage into reproducible, resumable pipelines for systematic LLM experimentation.
46
+
47
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
48
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
49
+
50
+ ---
51
+
52
+ ## Why Themis?
53
+
54
+ - **🎯 Config-driven**: Define experiments in JSON/YAML, run them with a single command
55
+ - **🔄 Resumable**: Automatic caching and resume—never lose your expensive LLM runs
56
+ - **📊 Systematic**: Grid search over models × prompts × sampling strategies
57
+ - **🔌 Provider-agnostic**: Works with 100+ LLM providers via LiteLLM (OpenAI, Anthropic, Azure, AWS Bedrock, Google AI, local LLMs, and more)
58
+ - **📈 Built-in evaluation**: Exact match, math verification, custom metrics
59
+ - **🎓 Production-ready**: Type-safe configs, structured logging, progress tracking
60
+
61
+ ---
62
+
63
+ ## Table of Contents
64
+
65
+ 1. [Installation](#installation)
66
+ 2. [Quick Start](#quick-start)
67
+ 3. [Examples & Tutorials](#examples--tutorials)
68
+ 4. [Core Concepts](#core-concepts)
69
+ 5. [CLI Reference](#cli-reference)
70
+ 6. [Configuration](#configuration)
71
+ 7. [Architecture](#architecture)
72
+ 8. [Development](#development)
73
+ 9. [Documentation](#documentation)
74
+
75
+ ---
76
+
77
+ ## Installation
78
+
79
+ ### Using uv (Recommended)
80
+
81
+ ```bash
82
+ # Clone the repository
83
+ git clone https://github.com/Pittawat2542/themis.git
84
+ cd themis
85
+
86
+ # Install with uv
87
+ uv sync
88
+
89
+ # Verify installation
90
+ uv run python -m themis.cli --version
91
+ ```
92
+
93
+ ### Using pip
94
+
95
+ ```bash
96
+ # Basic installation
97
+ pip install -e .
98
+
99
+ # With development tools
100
+ pip install -e ".[dev]"
101
+
102
+ # With math evaluation support
103
+ pip install -e ".[math]"
104
+
105
+ # Development tools and math evaluation support together (the `viz` and `docs` extras are separate)
106
+ pip install -e ".[dev,math]"
107
+ ```
108
+
109
+ ### Requirements
110
+
111
+ - Python 3.12+
112
+ - Core dependencies: `pydantic`, `cyclopts`, `hydra-core`, `tqdm`, `httpx`, `litellm`, `tabulate`, `tenacity`, `plotly`, `math-verify`
113
+
114
+ ---
115
+
116
+ ## Quick Start
117
+
118
+ ### 1. Explore Available Components
119
+
120
+ See what's available in your installation:
121
+
122
+ ```bash
123
+ # Show system info and quick start
124
+ uv run python -m themis.cli info
125
+
126
+ # List available LLM providers
127
+ uv run python -m themis.cli list-providers --verbose
128
+
129
+ # List available benchmarks
130
+ uv run python -m themis.cli list-benchmarks --verbose
131
+ ```
132
+
133
+ ### 2. Run the Built-in Demo
134
+
135
+ Test your installation with the fake model provider:
136
+
137
+ ```bash
138
+ # Run a quick smoke test
139
+ uv run python -m themis.cli demo
140
+
141
+ # See what's happening
142
+ uv run python -m themis.cli demo --log-level info
143
+ ```
144
+
145
+ This runs two inline math prompts through a fake LLM provider to verify the pipeline works end-to-end.
146
+
147
+ ### 3. Try Your First Real Experiment
148
+
149
+ Start with the comprehensive examples cookbook:
150
+
151
+ ```bash
152
+ # Your first experiment (15 minutes)
153
+ uv run python -m examples.getting_started.cli run
154
+
155
+ # Preview what will happen
156
+ uv run python -m examples.getting_started.cli run --dry-run
157
+
158
+ # Export results
159
+ uv run python -m examples.getting_started.cli run --csv-output results.csv
160
+ ```
161
+
162
+ ### 4. Connect to a Real LLM
163
+
164
+ Use any OpenAI-compatible endpoint (LM Studio, Ollama, vLLM, OpenAI):
165
+
166
+ ```bash
167
+ # Generate a config file
168
+ uv run python -m themis.cli init --template math500 --output my_config.yaml
169
+
170
+ # Edit my_config.yaml: set base_url, api_key, model name
171
+ # Then validate it
172
+ uv run python -m themis.cli validate-config --config my_config.yaml
173
+
174
+ # Run the experiment
175
+ uv run python -m themis.cli run-config --config my_config.yaml
176
+ ```
177
+
178
+ ---
179
+
180
+ ## Examples & Tutorials
181
+
182
+ **👉 Start here: [`examples/README.md`](examples/README.md)**
183
+
184
+ A comprehensive, hands-on cookbook with 5 progressive tutorials:
185
+
186
+ ### Learning Path
187
+
188
+ | Example | Focus | Time | What You'll Learn |
189
+ |---------|-------|------|-------------------|
190
+ | **[getting_started](examples/getting_started/)** | Basics | 15 min | Prompts, models, sampling, evaluation |
191
+ | **[config_file](examples/config_file/)** | Configuration | 20 min | JSON configs, grid searches, resumability |
192
+ | **[prompt_engineering](examples/prompt_engineering/)** | Prompt Strategies | 25 min | Zero-shot, few-shot, chain-of-thought, systematic comparison |
193
+ | **[projects](examples/projects/)** | Organization | 45 min | Multi-experiment projects, research workflows |
194
+ | **[advanced](examples/advanced/)** | Customization | 60 min | Custom runners, pipelines, metrics, agentic workflows |
195
+
196
+ ### Quick Reference
197
+
198
+ **🚀 [COOKBOOK.md](COOKBOOK.md)** - Cheat sheet with common patterns and troubleshooting
199
+
200
+ **Example commands:**
201
+ ```bash
202
+ # Basic experiment
203
+ uv run python -m examples.getting_started.cli run
204
+
205
+ # Grid search (2 models × 3 temperatures)
206
+ uv run python -m examples.config_file.cli run --config-path grid_search.json
207
+
208
+ # Prompt engineering experiment
209
+ uv run python -m examples.prompt_engineering.cli run
210
+
211
+ # Multi-experiment project
212
+ uv run python -m examples.projects.cli list-experiments
213
+ uv run python -m examples.projects.cli run --experiment zero-shot
214
+
215
+ # Custom behavior
216
+ uv run python -m examples.advanced.cli run --enable-subject-breakdown
217
+ ```
218
+
219
+ ---
220
+
221
+ ## Core Concepts
222
+
223
+ ### Three-Layer Architecture
224
+
225
+ ```
226
+ ╭─────────────────────────────────────────────────────────╮
227
+ │ Configuration Layer (JSON/YAML/CLI) │
228
+ │ • Dataset specs • Models • Sampling • Storage │
229
+ ╰─────────────────────────────────────────────────────────╯
230
+
231
+ ╭─────────────────────────────────────────────────────────╮
232
+ │ Experiment Layer (Orchestration) │
233
+ │ • Builder patterns • Runner coordination │
234
+ │ • Progress tracking • Caching & resume │
235
+ ╰─────────────────────────────────────────────────────────╯
236
+
237
+ ╭───────────────────────╮ ╭────────────────────────────╮
238
+ │ Generation Domain │ │ Evaluation Domain │
239
+ │ • Prompts │───▶│ • Extractors (JSON/math) │
240
+ │ • Providers │ │ • Metrics (exact/custom) │
241
+ │ • Sampling plans │ │ • Aggregation │
242
+ │ • Retry logic │ │ │
243
+ ╰───────────────────────╯ ╰────────────────────────────╯
244
+ ```
245
+
246
+ ### Key Components
247
+
248
+ - **Generation**: Prompt templates → Provider routing → Sampling strategies → Retry/backoff
249
+ - **Evaluation**: Response extraction → Metric computation → Multi-attempt scoring → Aggregation
250
+ - **Experiment**: Dataset loading → Generation plans → Runner execution → Storage → Reporting
251
+
252
+ See [`docs/DIAGRAM.md`](docs/DIAGRAM.md) for detailed architecture diagrams.
253
+
254
+ ---
255
+
256
+ ## CLI Reference
257
+
258
+ Themis provides experiment commands and utility commands:
259
+
260
+ ### Utility Commands
261
+
262
+ Quick commands for discovering and configuring Themis:
263
+
264
+ #### `info` - System Information
265
+
266
+ Show installed components and quick start guide:
267
+
268
+ ```bash
269
+ uv run python -m themis.cli info
270
+ ```
271
+
272
+ Displays:
273
+ - Version and Python info
274
+ - Installed providers
275
+ - Available benchmarks
276
+ - Example locations
277
+ - Documentation links
278
+
279
+ #### `list-providers` - Available LLM Providers
280
+
281
+ List all registered LLM providers:
282
+
283
+ ```bash
284
+ uv run python -m themis.cli list-providers
285
+
286
+ # Show detailed information
287
+ uv run python -m themis.cli list-providers --verbose
288
+ ```
289
+
290
+ Shows built-in providers (fake, litellm, vllm) and any custom registered providers.
291
+
292
+ **Note:** The `litellm` provider supports 100+ LLM providers including OpenAI, Anthropic, Azure OpenAI, AWS Bedrock, Google AI, Cohere, and many more. See [docs/LITELLM_PROVIDER.md](docs/LITELLM_PROVIDER.md) for details.
293
+
294
+ #### `list-benchmarks` - Available Datasets
295
+
296
+ List available datasets and benchmarks:
297
+
298
+ ```bash
299
+ uv run python -m themis.cli list-benchmarks
300
+
301
+ # Show details with subjects and commands
302
+ uv run python -m themis.cli list-benchmarks --verbose
303
+ ```
304
+
305
+ Shows math500, competition math benchmarks (aime24/aime25/amc23/olympiadbench/beyondaime), supergpqa, mmlu-pro, demo, and inline dataset options.
306
+
307
+ #### `init` - Generate Config File
308
+
309
+ Create a sample configuration file:
310
+
311
+ ```bash
312
+ # Basic template
313
+ uv run python -m themis.cli init
314
+
315
+ # MATH-500 with OpenAI-compatible endpoint
316
+ uv run python -m themis.cli init --template math500 --output my_config.yaml
317
+
318
+ # Inline dataset template
319
+ uv run python -m themis.cli init --template inline --output custom.yaml
320
+ ```
321
+
322
+ Available templates:
323
+ - `basic` - Simple fake provider setup for testing
324
+ - `math500` - MATH-500 benchmark with OpenAI-compatible provider
325
+ - `inline` - Custom inline dataset with examples
326
+
327
+ #### `validate-config` - Validate Configuration
328
+
329
+ Check a config file for errors without running:
330
+
331
+ ```bash
332
+ uv run python -m themis.cli validate-config --config my_config.yaml
333
+ ```
334
+
335
+ Shows parsed configuration and identifies errors before running expensive experiments.
336
+
337
+ #### `new-project` - Create a New Project
338
+
339
+ Scaffold a new project directory:
340
+
341
+ ```bash
342
+ uv run python -m themis.cli new-project --project-name my-new-project
343
+ ```
344
+
345
+ This will create a new directory called `my-new-project` with a basic project structure, including a sample configuration file, a CLI script, and a README.
346
+
347
+ ---
348
+
349
+ ### Experiment Commands
350
+
351
+ Commands for running experiments:
352
+
353
+ #### `demo` - Built-in Smoke Test
354
+
355
+ Quick verification using fake providers:
356
+
357
+ ```bash
358
+ uv run python -m themis.cli demo [OPTIONS]
359
+
360
+ Options:
361
+ --log-level TEXT Logging verbosity: critical|error|warning|info|debug|trace
362
+ --help Show help message
363
+ ```
364
+
365
+ #### `math500` - MATH-500 Benchmark
366
+
367
+ Zero-shot evaluation on the MATH-500 dataset:
368
+
369
+ ```bash
370
+ uv run python -m themis.cli math500 [OPTIONS]
371
+
372
+ Options:
373
+ --source TEXT Dataset source: 'huggingface' (default) or 'local'
374
+ --data-dir PATH Local MATH-500 directory (if --source local)
375
+ --limit INTEGER Limit number of samples (for testing)
376
+ --storage PATH Cache directory (default: .cache/themis)
377
+ --run-id TEXT Unique run identifier for resumability
378
+ --temperature FLOAT Model temperature (default: 0.0)
379
+ --log-level TEXT Logging verbosity
380
+ --resume / --no-resume Resume from cache (default: true)
381
+
382
+ Examples:
383
+ # Quick test with 50 samples
384
+ uv run python -m themis.cli math500 --limit 50
385
+
386
+ # Full evaluation with custom storage
387
+ uv run python -m themis.cli math500 \
388
+ --storage .cache/math500-eval \
389
+ --run-id run-2024-01-15 \
390
+ --temperature 0.0
391
+
392
+ # Use local MATH-500 dataset
393
+ uv run python -m themis.cli math500 \
394
+ --source local \
395
+ --data-dir /path/to/MATH-500 \
396
+ --limit 100
397
+ ```
398
+
399
+ #### Competition Math Benchmarks (`aime24`, `aime25`, `amc23`, `olympiadbench`, `beyondaime`)
400
+
401
+ Zero-shot evaluation across multiple competition datasets:
402
+
403
+ ```bash
404
+ uv run python -m themis.cli <benchmark> [OPTIONS]
405
+
406
+ Benchmarks:
407
+ aime24, aime25, amc23, olympiadbench, beyondaime
408
+
409
+ Options:
410
+ --source TEXT Dataset source: 'huggingface' (default) or 'local'
411
+ --split TEXT Dataset split to load (default: test)
412
+ --data-dir PATH Local dataset directory (if --source local)
413
+ --limit INTEGER Limit number of samples (for smoke tests)
414
+ --subjects TEXT... Optional subject filters (if provided by the dataset)
415
+ --storage PATH Cache directory for generations
416
+ --run-id TEXT Unique run identifier for resumability
417
+ --temperature FLOAT Model temperature (default: 0.0)
418
+ --log-level TEXT Logging verbosity
419
+ --resume / --no-resume Resume from cache (default: true)
420
+ ```
421
+
422
+ #### `supergpqa` - SuperGPQA Benchmark
423
+
424
+ Multiple-choice evaluation on the SuperGPQA dataset:
425
+
426
+ ```bash
427
+ uv run python -m themis.cli supergpqa [OPTIONS]
428
+
429
+ Options:
430
+ --source TEXT Dataset source: 'huggingface' (default) or 'local'
431
+ --split TEXT Dataset split to load (default: test)
432
+ --data-dir PATH Local dataset directory (if --source local)
433
+ --limit INTEGER Limit number of samples (for quick smoke tests)
434
+ --subjects TEXT... Optional category filters
435
+ --storage PATH Cache directory for generations
436
+ --run-id TEXT Unique run identifier for resumability
437
+ --temperature FLOAT Model temperature (default: 0.0)
438
+ --log-level TEXT Logging verbosity
439
+ --resume / --no-resume Resume from cache (default: true)
440
+ ```
441
+
442
+ #### `mmlu-pro` - MMLU-Pro Benchmark
443
+
444
+ Professional-level multiple-choice evaluation with refined distractors:
445
+
446
+ ```bash
447
+ uv run python -m themis.cli mmlu-pro [OPTIONS]
448
+
449
+ Options:
450
+ --source TEXT Dataset source: 'huggingface' (default) or 'local'
451
+ --split TEXT Dataset split to load (default: test)
452
+ --data-dir PATH Local dataset directory (if --source local)
453
+ --limit INTEGER Limit number of samples
454
+ --subjects TEXT... Optional subject filters
455
+ --storage PATH Cache directory for generations
456
+ --run-id TEXT Unique run identifier for resumability
457
+ --temperature FLOAT Model temperature (default: 0.0)
458
+ --log-level TEXT Logging verbosity
459
+ --resume / --no-resume Resume from cache (default: true)
460
+ ```
461
+
462
+ #### `run-config` - Config-Driven Experiments
463
+
464
+ Execute experiments defined in YAML config files:
465
+
466
+ ```bash
467
+ uv run python -m themis.cli run-config [OPTIONS]
468
+
469
+ Options:
470
+ --config PATH Path to YAML configuration file
471
+ --overrides TEXT Hydra-style overrides (space-separated)
472
+ --log-level TEXT Logging verbosity
473
+
474
+ Examples:
475
+ # Generate a config file first
476
+ uv run python -m themis.cli init --output my_config.yaml
477
+
478
+ # Run from config
479
+ uv run python -m themis.cli run-config --config my_config.yaml
480
+
481
+ # Override specific parameters
482
+ uv run python -m themis.cli run-config \
483
+ --config my_config.yaml \
484
+ --overrides "generation.sampling.temperature=0.2 max_samples=100"
485
+
486
+ # Multiple overrides
487
+ uv run python -m themis.cli run-config \
488
+ --config my_config.yaml \
489
+ --overrides "storage.run_id=new-run dataset.limit=50" \
490
+ --log-level debug
491
+ ```
492
+
493
+ ---
494
+
495
+ ## Configuration
496
+
497
+ ### Config File Structure
498
+
499
+ Themis uses JSON or YAML for configuration. Here's a complete example:
500
+
501
+ ```json
502
+ {
503
+ "run_id": "my-experiment",
504
+ "storage_dir": ".cache/my-experiment",
505
+ "resume": true,
506
+ "models": [
507
+ {
508
+ "name": "gpt-4",
509
+ "provider": "litellm",
510
+ "provider_options": {
511
+ "api_key": "sk-...",
512
+ "timeout": 60,
513
+ "n_parallel": 10
514
+ }
515
+ },
516
+ {
517
+ "name": "claude-3-opus-20240229",
518
+ "provider": "anthropic",
519
+ "provider_options": {
520
+ "timeout": 120
521
+ }
522
+ }
523
+ ],
524
+ "samplings": [
525
+ {"name": "greedy", "temperature": 0.0, "max_tokens": 512},
526
+ {"name": "creative", "temperature": 0.8, "max_tokens": 512}
527
+ ],
528
+ "datasets": [
529
+ {
530
+ "name": "math500",
531
+ "kind": "math500_hf",
532
+ "limit": 50
533
+ }
534
+ ]
535
+ }
536
+ ```
537
+
538
+ ### Configuration Options
539
+
540
+ **Core settings:**
541
+ - `run_id`: Unique identifier for caching and resumability
542
+ - `storage_dir`: Where to cache generations and results
543
+ - `resume`: Continue from previous runs (default: true)
544
+
545
+ **Models:**
546
+ - `name`: Model identifier
547
+ - `provider`: `litellm`, `openai`, `anthropic`, `azure`, `bedrock`, `fake`, or custom
548
+ - `provider_options`: Provider-specific configuration (API keys, endpoints, timeouts)
549
+
550
+ **Samplings:**
551
+ - `name`: Sampling strategy name
552
+ - `temperature`: Randomness (0.0 = deterministic, 1.0+ = creative)
553
+ - `max_tokens`: Maximum response length
554
+ - `top_p`, `top_k`: Nucleus/top-k sampling (optional)
555
+
556
+ **Datasets:**
557
+ - `name`: Dataset identifier
558
+ - `kind`: `demo`, `math500_hf`, `math500_local`, `inline`, or custom
559
+ - `limit`: Maximum samples (for testing)
560
+ - `source_path`: Path for local datasets (optional)
561
+
562
+ See [`docs/CONFIGURATION.md`](docs/CONFIGURATION.md) for the complete schema and [`docs/EXAMPLES.md`](docs/EXAMPLES.md) for common recipes.
563
+
564
+ ---
565
+
566
+ ## Architecture
567
+
568
+ ### Module Organization
569
+
570
+ ```
571
+ themis/
572
+ ├── cli/ # Command-line interface (Cyclopts)
573
+ ├── config/ # Configuration schema & loader (Pydantic, Hydra)
574
+ ├── core/ # Core entities (prompts, sampling specs, results)
575
+ ├── datasets/ # Dataset loaders (inline, HuggingFace, local)
576
+ ├── evaluation/ # Extractors, metrics, evaluation strategies
577
+ ├── experiment/ # Orchestration, builder patterns, storage
578
+ ├── generation/ # Generation strategies, runners, retry logic
579
+ ├── interfaces/ # Abstract base classes
580
+ ├── project/ # Multi-experiment project management
581
+ ├── providers/ # LLM provider implementations
582
+ └── utils/ # Logging, progress tracking, helpers
583
+ ```
584
+
585
+ ### Extension Points
586
+
587
+ Themis is designed for extensibility:
588
+
589
+ - **Custom providers**: Implement `Provider` interface for new LLM APIs
590
+ - **Custom datasets**: Implement `DatasetLoader` for new data sources
591
+ - **Custom metrics**: Implement `Metric` interface for domain-specific evaluation
592
+ - **Custom runners**: Override generation loops for specialized workflows
593
+ - **Custom pipelines**: Build evaluation pipelines with custom extractors
594
+
595
+ See [`docs/ADDING_COMPONENTS.md`](docs/ADDING_COMPONENTS.md) for detailed extension guides and [`examples/advanced/`](examples/advanced/) for working examples.
596
+
597
+ ---
598
+
599
+ ## Development
600
+
601
+ ### Running Tests
602
+
603
+ ```bash
604
+ # Full test suite
605
+ uv run pytest
606
+
607
+ # Specific test file
608
+ uv run pytest tests/generation/test_strategies.py
609
+
610
+ # With coverage
611
+ uv run pytest --cov=themis --cov-report=html
612
+
613
+ # Verbose output
614
+ uv run pytest -v
615
+ ```
616
+
617
+ ### Project Commands
618
+
619
+ ```bash
620
+ # Smoke test core CLI
621
+ uv run python -m themis.cli demo
622
+
623
+ # Test example pipelines
624
+ uv run python -m examples.getting_started.cli run --dry-run
625
+
626
+ # Run with specific storage (keeps cache local)
627
+ uv run python -m themis.cli math500 \
628
+ --storage .cache/dev-test \
629
+ --limit 5
630
+ ```
631
+
632
+ ### Code Style
633
+
634
+ - Python 3.12+, PEP 8 (4-space indent)
635
+ - Type hints throughout (mypy-compatible)
636
+ - Dataclasses and Pydantic models for configs/entities
637
+ - File names: `snake_case`
638
+ - Classes: `PascalCase`
639
+ - CLI commands: `dashed-names` (handled by Cyclopts)
640
+
641
+ ### Contributing
642
+
643
+ 1. Fork the repository
644
+ 2. Create a feature branch
645
+ 3. Make your changes with tests
646
+ 4. Run `uv run pytest` to verify
647
+ 5. Submit a pull request with:
648
+ - Summary of changes
649
+ - Test evidence
650
+ - Documentation updates (if applicable)
651
+ - Links to related issues
652
+
653
+ ---
654
+
655
+ ## Documentation
656
+
657
+ ### Core Documentation
658
+
659
+ - **[examples/README.md](examples/README.md)** - Comprehensive tutorial cookbook (START HERE!)
660
+ - **[COOKBOOK.md](COOKBOOK.md)** - Quick reference and cheat sheet
661
+ - **[docs/CONFIGURATION.md](docs/CONFIGURATION.md)** - Complete configuration schema
662
+ - **[docs/ADDING_COMPONENTS.md](docs/ADDING_COMPONENTS.md)** - Extension guide
663
+ - **[docs/DIAGRAM.md](docs/DIAGRAM.md)** - Architecture diagrams
664
+ - **[docs/EXAMPLES.md](docs/EXAMPLES.md)** - Additional recipes and patterns
665
+ - **[AGENTS.md](AGENTS.md)** - Repository guidelines for AI agents
666
+
667
+ ### Key Features
668
+
669
+ - ✅ **Resumability**: Automatic caching by `run_id`—interrupted runs pick up where they left off
670
+ - ✅ **Grid Search**: Cartesian product over models × samplings × prompts
671
+ - ✅ **Progress Tracking**: tqdm progress bars and structured logging
672
+ - ✅ **Type Safety**: Pydantic validation for configs and runtime entities
673
+ - ✅ **Provider Agnostic**: Unified interface for 100+ providers via LiteLLM (OpenAI, Anthropic, Azure, Bedrock, local LLMs, etc.)
674
+ - ✅ **Math Evaluation**: Built-in math-verify integration for numeric correctness
675
+ - ✅ **Export Options**: CSV, JSON, HTML output formats
676
+ - ✅ **Multi-Experiment Projects**: Organize related experiments with shared configs
677
+ - ✅ **Integrations**: Log results to Weights & Biases and upload artifacts to Hugging Face Hub
678
+
679
+ ---
680
+
681
+ ## Use Cases
682
+
683
+ ### Academic Research
684
+ - Systematic prompt engineering experiments
685
+ - Model comparison studies
686
+ - Benchmark evaluations (MATH, GSM8K, etc.)
687
+ - Reproducible experiment pipelines
688
+
689
+ ### LLM Development
690
+ - Testing model variants during training
691
+ - Evaluating fine-tuned models
692
+ - Comparing sampling strategies
693
+ - A/B testing prompts and templates
694
+
695
+ ### Production Monitoring
696
+ - Regression testing for model updates
697
+ - Quality assurance on real-world examples
698
+ - Performance benchmarking
699
+ - Cost/latency analysis
700
+
701
+ ---
702
+
703
+
704
+ ---
705
+
706
+ ## Citation
707
+
708
+ If you use Themis in your research, please cite it using the following BibTeX entry:
709
+
710
+ ```bibtex
711
+ @software{themis2025,
712
+ author = {Pittawat Taveekitworachai},
713
+ title = {Themis: Lightweight evaluation platform for LLM experiments},
714
+ year = {2025},
715
+ url = {https://github.com/Pittawat2542/themis}
716
+ }
717
+ ```
718
+
719
+ See [`CITATION.cff`](CITATION.cff) for more details.
720
+
721
+ ---
722
+
723
+ ## Contributing
724
+
725
+ We welcome contributions! Please see [`CONTRIBUTING.md`](CONTRIBUTING.md) for details on how to get started, run tests, and submit pull requests.
726
+
727
+ ---
728
+
729
+ ## License
730
+
731
+ MIT License — see the [LICENSE](LICENSE) file for details.
732
+
733
+ ---
734
+
735
+ ## Getting Help
736
+
737
+ - **Start with examples**: [`examples/README.md`](examples/README.md) has comprehensive tutorials
738
+ - **Quick reference**: [`COOKBOOK.md`](COOKBOOK.md) for common patterns
739
+ - **Check docs**: [`docs/`](docs/) directory has detailed guides
740
+ - **Search issues**: Look for similar problems in GitHub issues
741
+ - **Create issue**: Open a new issue with your question or bug report
742
+
743
+ ---
744
+
745
+ ## Roadmap
746
+
747
+ - [x] Universal provider support via LiteLLM (100+ providers)
748
+ - [ ] Web UI for experiment visualization
749
+ - [ ] Distributed generation support
750
+ - [ ] Advanced metrics (BLEU, ROUGE, semantic similarity)
751
+ - [ ] Experiment comparison dashboard
752
+ - [ ] Cost tracking and optimization
753
+
754
+ ---
755
+
756
+ **Happy experimenting!** 🚀
757
+
758
+ For a guided introduction, start with the [examples](examples/README.md). For quick lookups, check the [cookbook](COOKBOOK.md).
@@ -0,0 +1,8 @@
1
+ themis/__init__.py,sha256=3bKi1PneI5PaTaDPXsArCVvfinkLFDRU91lvZIeg7V0,281
2
+ themis/_version.py,sha256=tc4TJqWVv2dx4UzItLqneMPaG7vM8CQFDNW5pJgNoKg,345
3
+ themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ themis_eval-0.1.0.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
5
+ themis_eval-0.1.0.dist-info/METADATA,sha256=bRdc6UhSKYmptIJVhp4cEK8K2-Vvc77rZnVfYav0uS4,23516
6
+ themis_eval-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ themis_eval-0.1.0.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
8
+ themis_eval-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Themis Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ themis