themis-eval 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. themis_eval-0.2.0/PKG-INFO +596 -0
  2. themis_eval-0.2.0/README.md +538 -0
  3. {themis_eval-0.1.0 → themis_eval-0.2.0}/pyproject.toml +28 -6
  4. themis_eval-0.2.0/themis/__init__.py +25 -0
  5. {themis_eval-0.1.0 → themis_eval-0.2.0}/themis/_version.py +2 -2
  6. themis_eval-0.2.0/themis/api.py +343 -0
  7. themis_eval-0.2.0/themis/backends/__init__.py +17 -0
  8. themis_eval-0.2.0/themis/backends/execution.py +197 -0
  9. themis_eval-0.2.0/themis/backends/storage.py +260 -0
  10. themis_eval-0.2.0/themis/cli/__init__.py +5 -0
  11. themis_eval-0.2.0/themis/cli/__main__.py +6 -0
  12. themis_eval-0.2.0/themis/cli/commands/__init__.py +19 -0
  13. themis_eval-0.2.0/themis/cli/commands/benchmarks.py +221 -0
  14. themis_eval-0.2.0/themis/cli/commands/comparison.py +394 -0
  15. themis_eval-0.2.0/themis/cli/commands/config_commands.py +244 -0
  16. themis_eval-0.2.0/themis/cli/commands/cost.py +214 -0
  17. themis_eval-0.2.0/themis/cli/commands/demo.py +68 -0
  18. themis_eval-0.2.0/themis/cli/commands/info.py +90 -0
  19. themis_eval-0.2.0/themis/cli/commands/leaderboard.py +362 -0
  20. themis_eval-0.2.0/themis/cli/commands/math_benchmarks.py +318 -0
  21. themis_eval-0.2.0/themis/cli/commands/mcq_benchmarks.py +207 -0
  22. themis_eval-0.2.0/themis/cli/commands/results.py +252 -0
  23. themis_eval-0.2.0/themis/cli/commands/sample_run.py +244 -0
  24. themis_eval-0.2.0/themis/cli/commands/visualize.py +299 -0
  25. themis_eval-0.2.0/themis/cli/main.py +463 -0
  26. themis_eval-0.2.0/themis/cli/new_project.py +33 -0
  27. themis_eval-0.2.0/themis/cli/utils.py +51 -0
  28. themis_eval-0.2.0/themis/comparison/__init__.py +25 -0
  29. themis_eval-0.2.0/themis/comparison/engine.py +348 -0
  30. themis_eval-0.2.0/themis/comparison/reports.py +283 -0
  31. themis_eval-0.2.0/themis/comparison/statistics.py +402 -0
  32. themis_eval-0.2.0/themis/config/__init__.py +19 -0
  33. themis_eval-0.2.0/themis/config/loader.py +27 -0
  34. themis_eval-0.2.0/themis/config/registry.py +34 -0
  35. themis_eval-0.2.0/themis/config/runtime.py +214 -0
  36. themis_eval-0.2.0/themis/config/schema.py +112 -0
  37. themis_eval-0.2.0/themis/core/__init__.py +5 -0
  38. themis_eval-0.2.0/themis/core/conversation.py +354 -0
  39. themis_eval-0.2.0/themis/core/entities.py +184 -0
  40. themis_eval-0.2.0/themis/core/serialization.py +231 -0
  41. themis_eval-0.2.0/themis/core/tools.py +393 -0
  42. themis_eval-0.2.0/themis/core/types.py +141 -0
  43. themis_eval-0.2.0/themis/datasets/__init__.py +273 -0
  44. themis_eval-0.2.0/themis/datasets/base.py +264 -0
  45. themis_eval-0.2.0/themis/datasets/commonsense_qa.py +174 -0
  46. themis_eval-0.2.0/themis/datasets/competition_math.py +265 -0
  47. themis_eval-0.2.0/themis/datasets/coqa.py +133 -0
  48. themis_eval-0.2.0/themis/datasets/gpqa.py +190 -0
  49. themis_eval-0.2.0/themis/datasets/gsm8k.py +123 -0
  50. themis_eval-0.2.0/themis/datasets/gsm_symbolic.py +124 -0
  51. themis_eval-0.2.0/themis/datasets/math500.py +122 -0
  52. themis_eval-0.2.0/themis/datasets/med_qa.py +179 -0
  53. themis_eval-0.2.0/themis/datasets/medmcqa.py +169 -0
  54. themis_eval-0.2.0/themis/datasets/mmlu_pro.py +262 -0
  55. themis_eval-0.2.0/themis/datasets/piqa.py +146 -0
  56. themis_eval-0.2.0/themis/datasets/registry.py +201 -0
  57. themis_eval-0.2.0/themis/datasets/schema.py +245 -0
  58. themis_eval-0.2.0/themis/datasets/sciq.py +150 -0
  59. themis_eval-0.2.0/themis/datasets/social_i_qa.py +151 -0
  60. themis_eval-0.2.0/themis/datasets/super_gpqa.py +263 -0
  61. themis_eval-0.2.0/themis/evaluation/__init__.py +1 -0
  62. themis_eval-0.2.0/themis/evaluation/conditional.py +410 -0
  63. themis_eval-0.2.0/themis/evaluation/extractors/__init__.py +19 -0
  64. themis_eval-0.2.0/themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  65. themis_eval-0.2.0/themis/evaluation/extractors/exceptions.py +7 -0
  66. themis_eval-0.2.0/themis/evaluation/extractors/identity_extractor.py +29 -0
  67. themis_eval-0.2.0/themis/evaluation/extractors/json_field_extractor.py +45 -0
  68. themis_eval-0.2.0/themis/evaluation/extractors/math_verify_extractor.py +37 -0
  69. themis_eval-0.2.0/themis/evaluation/extractors/regex_extractor.py +43 -0
  70. themis_eval-0.2.0/themis/evaluation/math_verify_utils.py +87 -0
  71. themis_eval-0.2.0/themis/evaluation/metrics/__init__.py +21 -0
  72. themis_eval-0.2.0/themis/evaluation/metrics/code/__init__.py +19 -0
  73. themis_eval-0.2.0/themis/evaluation/metrics/code/codebleu.py +144 -0
  74. themis_eval-0.2.0/themis/evaluation/metrics/code/execution.py +280 -0
  75. themis_eval-0.2.0/themis/evaluation/metrics/code/pass_at_k.py +181 -0
  76. themis_eval-0.2.0/themis/evaluation/metrics/composite_metric.py +47 -0
  77. themis_eval-0.2.0/themis/evaluation/metrics/consistency_metric.py +80 -0
  78. themis_eval-0.2.0/themis/evaluation/metrics/exact_match.py +51 -0
  79. themis_eval-0.2.0/themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  80. themis_eval-0.2.0/themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  81. themis_eval-0.2.0/themis/evaluation/metrics/nlp/__init__.py +21 -0
  82. themis_eval-0.2.0/themis/evaluation/metrics/nlp/bertscore.py +138 -0
  83. themis_eval-0.2.0/themis/evaluation/metrics/nlp/bleu.py +129 -0
  84. themis_eval-0.2.0/themis/evaluation/metrics/nlp/meteor.py +153 -0
  85. themis_eval-0.2.0/themis/evaluation/metrics/nlp/rouge.py +136 -0
  86. themis_eval-0.2.0/themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  87. themis_eval-0.2.0/themis/evaluation/metrics/response_length.py +33 -0
  88. themis_eval-0.2.0/themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  89. themis_eval-0.2.0/themis/evaluation/pipeline.py +49 -0
  90. themis_eval-0.2.0/themis/evaluation/pipelines/__init__.py +15 -0
  91. themis_eval-0.2.0/themis/evaluation/pipelines/composable_pipeline.py +357 -0
  92. themis_eval-0.2.0/themis/evaluation/pipelines/standard_pipeline.py +348 -0
  93. themis_eval-0.2.0/themis/evaluation/reports.py +293 -0
  94. themis_eval-0.2.0/themis/evaluation/statistics/__init__.py +53 -0
  95. themis_eval-0.2.0/themis/evaluation/statistics/bootstrap.py +79 -0
  96. themis_eval-0.2.0/themis/evaluation/statistics/confidence_intervals.py +121 -0
  97. themis_eval-0.2.0/themis/evaluation/statistics/distributions.py +207 -0
  98. themis_eval-0.2.0/themis/evaluation/statistics/effect_sizes.py +124 -0
  99. themis_eval-0.2.0/themis/evaluation/statistics/hypothesis_tests.py +305 -0
  100. themis_eval-0.2.0/themis/evaluation/statistics/types.py +139 -0
  101. themis_eval-0.2.0/themis/evaluation/strategies/__init__.py +13 -0
  102. themis_eval-0.2.0/themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  103. themis_eval-0.2.0/themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  104. themis_eval-0.2.0/themis/evaluation/strategies/evaluation_strategy.py +24 -0
  105. themis_eval-0.2.0/themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  106. themis_eval-0.2.0/themis/experiment/__init__.py +5 -0
  107. themis_eval-0.2.0/themis/experiment/builder.py +151 -0
  108. themis_eval-0.2.0/themis/experiment/cache_manager.py +134 -0
  109. themis_eval-0.2.0/themis/experiment/comparison.py +631 -0
  110. themis_eval-0.2.0/themis/experiment/cost.py +310 -0
  111. themis_eval-0.2.0/themis/experiment/definitions.py +62 -0
  112. themis_eval-0.2.0/themis/experiment/export.py +798 -0
  113. themis_eval-0.2.0/themis/experiment/export_csv.py +159 -0
  114. themis_eval-0.2.0/themis/experiment/integration_manager.py +104 -0
  115. themis_eval-0.2.0/themis/experiment/math.py +192 -0
  116. themis_eval-0.2.0/themis/experiment/mcq.py +169 -0
  117. themis_eval-0.2.0/themis/experiment/orchestrator.py +415 -0
  118. themis_eval-0.2.0/themis/experiment/pricing.py +317 -0
  119. themis_eval-0.2.0/themis/experiment/storage.py +1458 -0
  120. themis_eval-0.2.0/themis/experiment/visualization.py +588 -0
  121. themis_eval-0.2.0/themis/generation/__init__.py +1 -0
  122. themis_eval-0.2.0/themis/generation/agentic_runner.py +420 -0
  123. themis_eval-0.2.0/themis/generation/batching.py +254 -0
  124. themis_eval-0.2.0/themis/generation/clients.py +143 -0
  125. themis_eval-0.2.0/themis/generation/conversation_runner.py +236 -0
  126. themis_eval-0.2.0/themis/generation/plan.py +456 -0
  127. themis_eval-0.2.0/themis/generation/providers/litellm_provider.py +221 -0
  128. themis_eval-0.2.0/themis/generation/providers/vllm_provider.py +135 -0
  129. themis_eval-0.2.0/themis/generation/router.py +34 -0
  130. themis_eval-0.2.0/themis/generation/runner.py +207 -0
  131. themis_eval-0.2.0/themis/generation/strategies.py +98 -0
  132. themis_eval-0.2.0/themis/generation/templates.py +71 -0
  133. themis_eval-0.2.0/themis/generation/turn_strategies.py +393 -0
  134. themis_eval-0.2.0/themis/generation/types.py +9 -0
  135. themis_eval-0.2.0/themis/integrations/huggingface.py +72 -0
  136. themis_eval-0.2.0/themis/integrations/wandb.py +77 -0
  137. themis_eval-0.2.0/themis/interfaces/__init__.py +169 -0
  138. themis_eval-0.2.0/themis/presets/__init__.py +10 -0
  139. themis_eval-0.2.0/themis/presets/benchmarks.py +354 -0
  140. themis_eval-0.2.0/themis/presets/models.py +190 -0
  141. themis_eval-0.2.0/themis/project/__init__.py +20 -0
  142. themis_eval-0.2.0/themis/project/definitions.py +98 -0
  143. themis_eval-0.2.0/themis/project/patterns.py +230 -0
  144. themis_eval-0.2.0/themis/providers/__init__.py +5 -0
  145. themis_eval-0.2.0/themis/providers/registry.py +39 -0
  146. themis_eval-0.2.0/themis/py.typed +0 -0
  147. themis_eval-0.2.0/themis/server/__init__.py +28 -0
  148. themis_eval-0.2.0/themis/server/app.py +337 -0
  149. themis_eval-0.2.0/themis/utils/api_generator.py +379 -0
  150. themis_eval-0.2.0/themis/utils/cost_tracking.py +376 -0
  151. themis_eval-0.2.0/themis/utils/dashboard.py +452 -0
  152. themis_eval-0.2.0/themis/utils/logging_utils.py +41 -0
  153. themis_eval-0.2.0/themis/utils/progress.py +58 -0
  154. themis_eval-0.2.0/themis/utils/tracing.py +320 -0
  155. themis_eval-0.2.0/themis_eval.egg-info/PKG-INFO +596 -0
  156. themis_eval-0.2.0/themis_eval.egg-info/SOURCES.txt +161 -0
  157. themis_eval-0.2.0/themis_eval.egg-info/requires.txt +47 -0
  158. themis_eval-0.1.0/PKG-INFO +0 -758
  159. themis_eval-0.1.0/README.md +0 -718
  160. themis_eval-0.1.0/themis/__init__.py +0 -14
  161. themis_eval-0.1.0/themis_eval.egg-info/PKG-INFO +0 -758
  162. themis_eval-0.1.0/themis_eval.egg-info/SOURCES.txt +0 -12
  163. themis_eval-0.1.0/themis_eval.egg-info/requires.txt +0 -25
  164. {themis_eval-0.1.0 → themis_eval-0.2.0}/LICENSE +0 -0
  165. {themis_eval-0.1.0 → themis_eval-0.2.0}/setup.cfg +0 -0
  166. {themis_eval-0.1.0 → themis_eval-0.2.0}/tests/test_package_metadata.py +0 -0
  167. /themis_eval-0.1.0/themis/py.typed → /themis_eval-0.2.0/themis/integrations/__init__.py +0 -0
  168. {themis_eval-0.1.0 → themis_eval-0.2.0}/themis_eval.egg-info/dependency_links.txt +0 -0
  169. {themis_eval-0.1.0 → themis_eval-0.2.0}/themis_eval.egg-info/top_level.txt +0 -0
@@ -0,0 +1,596 @@
1
+ Metadata-Version: 2.4
2
+ Name: themis-eval
3
+ Version: 0.2.0
4
+ Summary: Lightweight evaluation platform for LLM experiments
5
+ Author: Pittawat Taveekitworachai
6
+ License: MIT
7
+ Project-URL: Resources, https://github.com/Pittawat2542/themis
8
+ Project-URL: Homepage, https://pittawat2542.github.io/themis/
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: >=3.12
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: pydantic>=2.12.5
19
+ Requires-Dist: cyclopts>=4.0.0
20
+ Requires-Dist: hydra-core>=1.3
21
+ Requires-Dist: tqdm>=4.67
22
+ Requires-Dist: httpx>=0.27
23
+ Requires-Dist: litellm>=1.81.0
24
+ Requires-Dist: tabulate>=0.9.0
25
+ Requires-Dist: tenacity>=9.1.2
26
+ Requires-Dist: plotly>=6.5.0
27
+ Requires-Dist: math-verify>=0.8.0
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=8.0; extra == "dev"
30
+ Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
31
+ Requires-Dist: pytest-timeout>=2.3.1; extra == "dev"
32
+ Requires-Dist: pytest-asyncio>=0.24.0; extra == "dev"
33
+ Requires-Dist: ruff>=0.8.5; extra == "dev"
34
+ Requires-Dist: mypy>=1.14.0; extra == "dev"
35
+ Provides-Extra: math
36
+ Requires-Dist: datasets>=2.20.0; extra == "math"
37
+ Requires-Dist: math-verify>=0.8.0; extra == "math"
38
+ Provides-Extra: nlp
39
+ Requires-Dist: sacrebleu>=2.4.0; extra == "nlp"
40
+ Requires-Dist: rouge-score>=0.1.2; extra == "nlp"
41
+ Requires-Dist: bert-score>=0.3.13; extra == "nlp"
42
+ Requires-Dist: nltk>=3.8.0; extra == "nlp"
43
+ Provides-Extra: code
44
+ Requires-Dist: codebleu>=0.7.0; extra == "code"
45
+ Provides-Extra: viz
46
+ Requires-Dist: plotly>=5.18.0; extra == "viz"
47
+ Provides-Extra: server
48
+ Requires-Dist: fastapi>=0.128.0; extra == "server"
49
+ Requires-Dist: uvicorn[standard]>=0.32.0; extra == "server"
50
+ Requires-Dist: websockets>=14.0; extra == "server"
51
+ Provides-Extra: docs
52
+ Requires-Dist: mkdocs>=1.6.0; extra == "docs"
53
+ Requires-Dist: mkdocs-material>=9.5.0; extra == "docs"
54
+ Requires-Dist: mkdocstrings[python]>=0.25.0; extra == "docs"
55
+ Provides-Extra: all
56
+ Requires-Dist: themis-eval[code,docs,math,nlp,server,viz]; extra == "all"
57
+ Dynamic: license-file
58
+
59
+ # Themis
60
+
61
+ > **Modern LLM evaluation framework for researchers and practitioners**
62
+
63
+ Themis makes it easy to evaluate language models systematically with one-liner Python APIs, built-in benchmarks, statistical comparisons, and a web dashboard.
64
+
65
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
66
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
67
+
68
+ ---
69
+
70
+ ## Why Themis?
71
+
72
+ - **🚀 Simple**: One-line Python API or CLI commands—no configuration files needed
73
+ - **📊 Comprehensive**: 100+ LLM providers, built-in benchmarks, NLP & code metrics
74
+ - **🔬 Statistical**: Compare runs with t-tests, bootstrap, and permutation tests
75
+ - **💾 Reliable**: Automatic caching, resume failed runs, smart cache invalidation
76
+ - **🌐 Visual**: Web dashboard for exploring results and comparisons
77
+ - **🔌 Extensible**: Pluggable backends for custom storage and execution
78
+
79
+ ---
80
+
81
+ ## Quick Start
82
+
83
+ ### Installation
84
+
85
+ ```bash
86
+ # Using pip
87
+ pip install themis-eval
88
+
89
+ # Or with uv (recommended)
90
+ uv pip install themis-eval
91
+
92
+ # With optional features
93
+ pip install "themis-eval[math,nlp,code,server]"
94
+ ```
95
+
96
+ ### One-Liner Evaluation
97
+
98
+ ```python
99
+ from themis import evaluate
100
+
101
+ # Evaluate any model on any benchmark
102
+ result = evaluate(
103
+ benchmark="gsm8k",
104
+ model="gpt-4",
105
+ limit=100
106
+ )
107
+
108
+ print(f"Accuracy: {result.metrics['exact_match']:.2%}")
109
+ ```
110
+
111
+ ### CLI Usage
112
+
113
+ ```bash
114
+ # Evaluate a model
115
+ themis eval gsm8k --model gpt-4 --limit 100
116
+
117
+ # Compare two models
118
+ themis eval gsm8k --model gpt-4 --limit 100 --run-id gpt4-run
119
+ themis eval gsm8k --model claude-3-opus --limit 100 --run-id claude-run
120
+ themis compare gpt4-run claude-run
121
+
122
+ # Start web dashboard
123
+ themis serve
124
+ ```
125
+
126
+ ---
127
+
128
+ ## Features
129
+
130
+ ### 🎯 Built-in Benchmarks
131
+
132
+ Themis includes 6 popular benchmarks out-of-the-box:
133
+
134
+ ```python
135
+ # Math reasoning
136
+ evaluate(benchmark="gsm8k", model="gpt-4", limit=100)
137
+ evaluate(benchmark="math500", model="gpt-4", limit=50)
138
+ evaluate(benchmark="aime24", model="gpt-4")
139
+
140
+ # General knowledge
141
+ evaluate(benchmark="mmlu_pro", model="gpt-4", limit=1000)
142
+ evaluate(benchmark="supergpqa", model="gpt-4")
143
+
144
+ # Quick testing
145
+ evaluate(benchmark="demo", model="fake-math-llm", limit=10)
146
+ ```
147
+
148
+ **See all available benchmarks:**
149
+ ```bash
150
+ themis list benchmarks
151
+ ```
152
+
153
+ ### 📈 Rich Metrics
154
+
155
+ **Math Metrics:**
156
+ - Exact Match
157
+ - Math Verification (symbolic & numeric)
158
+
159
+ **NLP Metrics:**
160
+ - BLEU, ROUGE, BERTScore, METEOR
161
+
162
+ **Code Metrics:**
163
+ - Pass@k, CodeBLEU, Execution Accuracy
164
+
165
+ ```python
166
+ # Use specific metrics
167
+ result = evaluate(
168
+ benchmark="gsm8k",
169
+ model="gpt-4",
170
+ metrics=["exact_match", "bleu", "rouge1"],
171
+ )
172
+ ```
173
+
174
+ ### 🔬 Statistical Comparison
175
+
176
+ Compare multiple runs with statistical significance testing:
177
+
178
+ ```python
179
+ from themis.comparison import compare_runs
180
+
181
+ report = compare_runs(
182
+ run_ids=["gpt4-run", "claude-run"],
183
+ storage_path=".cache/experiments",
184
+ statistical_test="bootstrap",
185
+ alpha=0.05
186
+ )
187
+
188
+ print(report.summary())
189
+ # Shows: win/loss matrices, p-values, effect sizes
190
+ ```
191
+
192
+ **CLI:**
193
+ ```bash
194
+ themis compare run-1 run-2 --test bootstrap --output comparison.html
195
+ ```
196
+
197
+ ### 🌐 Web Dashboard
198
+
199
+ Start the API server and view results in your browser:
200
+
201
+ ```bash
202
+ themis serve
203
+
204
+ # Open http://localhost:8080/dashboard
205
+ # API docs at http://localhost:8080/docs
206
+ ```
207
+
208
+ **Features:**
209
+ - List all experiment runs
210
+ - View detailed results
211
+ - Compare multiple runs
212
+ - REST API + WebSocket support
213
+
214
+ ### 🔌 100+ LLM Providers
215
+
216
+ Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider support:
217
+
218
+ ```python
219
+ # OpenAI
220
+ evaluate(benchmark="gsm8k", model="gpt-4")
221
+
222
+ # Anthropic
223
+ evaluate(benchmark="gsm8k", model="claude-3-opus-20240229")
224
+
225
+ # Azure OpenAI
226
+ evaluate(benchmark="gsm8k", model="azure/gpt-4")
227
+
228
+ # Local models (vLLM, Ollama, etc.)
229
+ evaluate(benchmark="gsm8k", model="ollama/llama3")
230
+
231
+ # AWS Bedrock
232
+ evaluate(benchmark="gsm8k", model="bedrock/anthropic.claude-3")
233
+ ```
234
+
235
+ ### 💾 Smart Caching
236
+
237
+ Themis automatically caches results and resumes failed runs:
238
+
239
+ ```python
240
+ # Run with caching
241
+ result = evaluate(
242
+ benchmark="gsm8k",
243
+ model="gpt-4",
244
+ limit=1000,
245
+ run_id="my-experiment",
246
+ resume=True # Skip already-evaluated samples
247
+ )
248
+ ```
249
+
250
+ Cache invalidation is automatic when you change:
251
+ - Model parameters (temperature, max_tokens, etc.)
252
+ - Prompt template
253
+ - Evaluation metrics
254
+
255
+ ---
256
+
257
+ ## Examples
258
+
259
+ ### Custom Dataset
260
+
261
+ ```python
262
+ from themis import evaluate
263
+
264
+ # Your own data
265
+ dataset = [
266
+ {"prompt": "What is 2+2?", "answer": "4"},
267
+ {"prompt": "What is 3+3?", "answer": "6"},
268
+ ]
269
+
270
+ result = evaluate(
271
+ dataset,
272
+ model="gpt-4",
273
+ prompt="Answer this math question: {prompt}",
274
+ metrics=["exact_match"],
275
+ )
276
+
277
+ print(result.report)
278
+ ```
279
+
280
+ ### Advanced Configuration
281
+
282
+ ```python
283
+ result = evaluate(
284
+ benchmark="gsm8k",
285
+ model="gpt-4",
286
+ temperature=0.7,
287
+ max_tokens=512,
288
+ num_samples=3, # Sample 3 responses per prompt
289
+ workers=8, # Parallel execution
290
+ storage=".cache/my-experiments",
291
+ run_id="experiment-2024-01",
292
+ )
293
+ ```
294
+
295
+ ### Programmatic Comparison
296
+
297
+ ```python
298
+ from themis.comparison.statistics import t_test, bootstrap_confidence_interval
299
+
300
+ # Model A scores
301
+ scores_a = [0.85, 0.87, 0.83, 0.90, 0.82]
302
+ # Model B scores
303
+ scores_b = [0.78, 0.80, 0.79, 0.82, 0.77]
304
+
305
+ # Statistical test
306
+ result = bootstrap_confidence_interval(
307
+ scores_a, scores_b,
308
+ n_bootstrap=10000,
309
+ confidence_level=0.95
310
+ )
311
+
312
+ print(f"Significant: {result.significant}")
313
+ print(f"CI: {result.confidence_interval}")
314
+ ```
315
+
316
+ ---
317
+
318
+ ## Architecture
319
+
320
+ Themis is built on a clean, modular architecture:
321
+
322
+ ```
323
+ ┌─────────────────────────────────────────┐
324
+ │ themis.evaluate() │ ← Simple API
325
+ │ (One-line evaluation interface) │
326
+ └─────────────────┬───────────────────────┘
327
+
328
+ ┌────────┴────────┐
329
+ │ │
330
+ ┌────▼─────┐ ┌────▼─────┐
331
+ │ Presets │ │Generation│
332
+ │ System │ │ Pipeline │
333
+ └────┬─────┘ └────┬─────┘
334
+ │ │
335
+ ┌────▼─────┐ ┌────▼─────┐
336
+ │Benchmarks│ │Evaluation│
337
+ │(6 built- │ │ Pipeline │
338
+ │ in) │ └────┬─────┘
339
+ └──────────┘ │
340
+ ┌────▼─────┐
341
+ │ Storage │
342
+ │ (V2) │
343
+ └──────────┘
344
+ ```
345
+
346
+ **Key Components:**
347
+
348
+ - **Presets**: Pre-configured benchmarks with prompts, metrics, and datasets
349
+ - **Generation**: Model inference with caching and resume
350
+ - **Evaluation**: Metric computation with smart cache invalidation
351
+ - **Storage**: Atomic writes, file locking, SQLite metadata
352
+ - **Comparison**: Statistical tests, win/loss matrices
353
+ - **Server**: REST API and WebSocket for web dashboard
354
+
355
+ ---
356
+
357
+ ## Documentation
358
+
359
+ - **[API Reference](docs/index.md)** - Detailed API documentation
360
+ - **[Examples](examples-simple/)** - Runnable code examples
361
+ - **[Extending Backends](docs/EXTENDING_BACKENDS.md)** - Custom storage and execution
362
+ - **[API Server](docs/API_SERVER.md)** - Web dashboard and REST API
363
+ - **[Comparison Engine](docs/COMPARISON.md)** - Statistical testing guide
364
+
365
+ ---
366
+
367
+ ## Advanced Usage
368
+
369
+ ### Custom Backends
370
+
371
+ Implement custom storage or execution strategies:
372
+
373
+ ```python
374
+ from themis.backends import StorageBackend, ExecutionBackend
375
+
376
+ class S3StorageBackend(StorageBackend):
377
+ """Store results in AWS S3"""
378
+ def save_generation_record(self, run_id, record):
379
+ # Upload to S3
380
+ pass
381
+ # ... implement other methods
382
+
383
+ # Use custom backend
384
+ result = evaluate(
385
+ benchmark="gsm8k",
386
+ model="gpt-4",
387
+ storage_backend=S3StorageBackend(bucket="my-bucket")
388
+ )
389
+ ```
390
+
391
+ See [EXTENDING_BACKENDS.md](docs/EXTENDING_BACKENDS.md) for details.
392
+
393
+ ### Distributed Execution
394
+
395
+ ```python
396
+ from themis.backends import ExecutionBackend
397
+ import ray
398
+
399
+ class RayExecutionBackend(ExecutionBackend):
400
+ """Distributed execution with Ray"""
401
+ # ... implementation
402
+
403
+ result = evaluate(
404
+ benchmark="math500",
405
+ model="gpt-4",
406
+ execution_backend=RayExecutionBackend(num_cpus=32)
407
+ )
408
+ ```
409
+
410
+ ### Monitoring & Observability
411
+
412
+ Connect to the WebSocket endpoint for real-time updates:
413
+
414
+ ```python
415
+ import asyncio
416
+ import websockets
417
+ import json
418
+
419
+ async def monitor():
420
+ async with websockets.connect("ws://localhost:8080/ws") as ws:
421
+ await ws.send(json.dumps({"type": "subscribe", "run_id": "my-run"}))
422
+ async for message in ws:
423
+ print(json.loads(message))
424
+
425
+ asyncio.run(monitor())
426
+ ```
427
+
428
+ ---
429
+
430
+ ## CLI Reference
431
+
432
+ ### Evaluation
433
+
434
+ ```bash
435
+ # Basic evaluation
436
+ themis eval <benchmark> --model <model> [options]
437
+
438
+ # Options:
439
+ # --limit N Evaluate first N samples
440
+ # --temperature FLOAT Sampling temperature (default: 0.0)
441
+ # --max-tokens INT Maximum tokens (default: 512)
442
+ # --workers INT Parallel workers (default: 4)
443
+ # --run-id STR Run identifier
444
+ # --storage PATH Storage directory
445
+ # --resume Resume from cache
446
+ # --output FILE Export results (.json, .csv, .html)
447
+ ```
448
+
449
+ ### Comparison
450
+
451
+ ```bash
452
+ # Compare two or more runs
453
+ themis compare <run-id-1> <run-id-2> [run-id-3...] [options]
454
+
455
+ # Options:
456
+ # --storage PATH Storage directory
457
+ # --test STR Statistical test: t_test, bootstrap, permutation
458
+ # --alpha FLOAT Significance level (default: 0.05)
459
+ # --output FILE Export report (.json, .html, .md)
460
+ ```
461
+
462
+ ### Server
463
+
464
+ ```bash
465
+ # Start API server
466
+ themis serve [options]
467
+
468
+ # Options:
469
+ # --port INT Port (default: 8080)
470
+ # --host STR Host (default: 127.0.0.1)
471
+ # --storage PATH Storage directory
472
+ # --reload Auto-reload (dev mode)
473
+ ```
474
+
475
+ ### List
476
+
477
+ ```bash
478
+ # List available resources
479
+ themis list <what>
480
+
481
+ # Options:
482
+ # runs List all experiment runs
483
+ # benchmarks List available benchmarks
484
+ # metrics List available metrics
485
+ ```
486
+
487
+ ---
488
+
489
+ ## Development
490
+
491
+ ### Setup
492
+
493
+ ```bash
494
+ # Clone repository
495
+ git clone https://github.com/Pittawat2542/themis.git
496
+ cd themis
497
+
498
+ # Install with dev dependencies
499
+ uv pip install -e ".[dev,math,nlp,code,server]"
500
+
501
+ # Run tests
502
+ uv run pytest
503
+
504
+ # Run specific test
505
+ uv run pytest tests/comparison/test_statistics.py -v
506
+ ```
507
+
508
+ ### Project Structure
509
+
510
+ ```
511
+ themis/
512
+ ├── themis/
513
+ │ ├── api.py # Main evaluate() function
514
+ │ ├── presets/ # Benchmark presets
515
+ │ ├── generation/ # Model inference
516
+ │ ├── evaluation/ # Metrics & evaluation
517
+ │ ├── comparison/ # Statistical comparison
518
+ │ ├── backends/ # Pluggable backends
519
+ │ ├── server/ # FastAPI server
520
+ │ └── cli/ # CLI commands
521
+ ├── tests/ # Test suite
522
+ ├── examples-simple/ # Minimal examples
523
+ ├── docs/ # Documentation
524
+ └── pyproject.toml # Package configuration
525
+ ```
526
+
527
+ ### Running Examples
528
+
529
+ ```bash
530
+ # Simple quickstart
531
+ uv run python examples-simple/01_quickstart.py
532
+
533
+ # Custom dataset
534
+ uv run python examples-simple/02_custom_dataset.py
535
+
536
+ # Comparison example
537
+ uv run python examples-simple/04_comparison.py
538
+
539
+ # API server example
540
+ uv run python examples-simple/05_api_server.py
541
+ ```
542
+
543
+ ---
544
+
545
+ ## Contributing
546
+
547
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
548
+
549
+ Areas where we'd love help:
550
+ - Additional benchmark presets
551
+ - New evaluation metrics
552
+ - Backend implementations (Ray, S3, etc.)
553
+ - Documentation improvements
554
+ - Bug reports and feature requests
555
+
556
+ ---
557
+
558
+ ## Citation
559
+
560
+ If you use Themis in your research, please cite:
561
+
562
+ ```bibtex
563
+ @software{themis2024,
564
+ title = {Themis: Modern LLM Evaluation Framework},
565
+ author = {Pittawat Taveekitworachai},
566
+ year = {2024},
567
+ url = {https://github.com/Pittawat2542/themis}
568
+ }
569
+ ```
570
+
571
+ ---
572
+
573
+ ## License
574
+
575
+ MIT License - see [LICENSE](LICENSE) for details.
576
+
577
+ ---
578
+
579
+ ## Acknowledgments
580
+
581
+ - Built on [LiteLLM](https://github.com/BerriAI/litellm) for provider support
582
+ - Inspired by [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)
583
+ - Statistical methods from established research practices
584
+
585
+ ---
586
+
587
+ ## Support
588
+
589
+ - **Documentation**: [docs/index.md](docs/index.md)
590
+ - **Examples**: [examples-simple/](examples-simple/)
591
+ - **Issues**: [GitHub Issues](https://github.com/Pittawat2542/themis/issues)
592
+ - **Discussions**: [GitHub Discussions](https://github.com/Pittawat2542/themis/discussions)
593
+
594
+ ---
595
+
596
+ **Made with ❤️ for the LLM research community**