themis-eval 0.2.2__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {themis_eval-0.2.2/themis_eval.egg-info → themis_eval-1.0.0}/PKG-INFO +47 -34
- {themis_eval-0.2.2 → themis_eval-1.0.0}/README.md +46 -33
- {themis_eval-0.2.2 → themis_eval-1.0.0}/pyproject.toml +1 -1
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/__init__.py +5 -2
- themis_eval-1.0.0/themis/_version.py +30 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/api.py +83 -145
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/backends/storage.py +5 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/info.py +2 -11
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/main.py +231 -40
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/engine.py +7 -13
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/entities.py +4 -0
- themis_eval-1.0.0/themis/evaluation/metric_pipeline.py +12 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipeline.py +22 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipelines/__init__.py +4 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipelines/composable_pipeline.py +55 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipelines/standard_pipeline.py +18 -1
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +5 -2
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/judge_evaluation_strategy.py +6 -1
- themis_eval-1.0.0/themis/experiment/__init__.py +5 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/cache_manager.py +15 -1
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/definitions.py +1 -1
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/orchestrator.py +21 -11
- themis_eval-1.0.0/themis/experiment/share.py +264 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/storage.py +345 -298
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/plan.py +28 -6
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/router.py +22 -4
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/runner.py +16 -1
- themis_eval-1.0.0/themis/presets/benchmarks.py +939 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/server/app.py +38 -26
- themis_eval-1.0.0/themis/session.py +125 -0
- themis_eval-1.0.0/themis/specs/__init__.py +7 -0
- themis_eval-1.0.0/themis/specs/execution.py +26 -0
- themis_eval-1.0.0/themis/specs/experiment.py +33 -0
- themis_eval-1.0.0/themis/specs/storage.py +18 -0
- themis_eval-1.0.0/themis/storage/__init__.py +6 -0
- themis_eval-1.0.0/themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0/themis_eval.egg-info}/PKG-INFO +47 -34
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/SOURCES.txt +9 -2
- themis_eval-0.2.2/themis/_version.py +0 -17
- themis_eval-0.2.2/themis/experiment/__init__.py +0 -5
- themis_eval-0.2.2/themis/experiment/builder.py +0 -151
- themis_eval-0.2.2/themis/experiment/export_csv.py +0 -159
- themis_eval-0.2.2/themis/presets/benchmarks.py +0 -354
- {themis_eval-0.2.2 → themis_eval-1.0.0}/LICENSE +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/setup.cfg +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/tests/test_package_metadata.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/backends/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/backends/execution.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/__main__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/benchmarks.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/comparison.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/config_commands.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/cost.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/demo.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/leaderboard.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/math_benchmarks.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/mcq_benchmarks.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/results.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/sample_run.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/visualize.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/new_project.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/utils.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/reports.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/statistics.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/loader.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/registry.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/runtime.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/schema.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/conversation.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/serialization.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/tools.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/types.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/base.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/commonsense_qa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/competition_math.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/coqa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/gpqa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/gsm8k.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/gsm_symbolic.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/math500.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/med_qa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/medmcqa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/mmlu_pro.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/piqa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/registry.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/schema.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/sciq.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/social_i_qa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/super_gpqa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/conditional.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/error_taxonomy_extractor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/exceptions.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/identity_extractor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/json_field_extractor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/math_verify_extractor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/regex_extractor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/math_verify_utils.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/codebleu.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/execution.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/pass_at_k.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/composite_metric.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/consistency_metric.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/exact_match.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/length_difference_tolerance.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/math_verify_accuracy.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/bertscore.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/bleu.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/meteor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/rouge.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/pairwise_judge_metric.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/response_length.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/rubric_judge_metric.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/reports.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/bootstrap.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/confidence_intervals.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/distributions.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/effect_sizes.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/hypothesis_tests.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/types.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/default_evaluation_strategy.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/evaluation_strategy.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/comparison.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/cost.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/export.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/integration_manager.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/math.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/mcq.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/pricing.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/visualization.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/agentic_runner.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/batching.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/clients.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/conversation_runner.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/providers/litellm_provider.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/providers/vllm_provider.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/strategies.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/templates.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/turn_strategies.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/types.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/integrations/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/integrations/huggingface.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/integrations/wandb.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/interfaces/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/presets/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/presets/models.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/project/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/project/definitions.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/project/patterns.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/providers/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/providers/registry.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/py.typed +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/server/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/api_generator.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/cost_tracking.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/dashboard.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/logging_utils.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/progress.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/tracing.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/dependency_links.txt +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/requires.txt +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/top_level.txt +0 -0
{themis_eval-0.2.2/themis_eval.egg-info → themis_eval-1.0.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.2
+Version: 1.0.0
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -100,13 +100,14 @@ pip install themis-eval[math,nlp,code,server]
 from themis import evaluate
 
 # Evaluate any model on any benchmark
-
-
+report = evaluate(
+    "gsm8k",
     model="gpt-4",
-    limit=100
+    limit=100,
 )
 
-
+accuracy = report.evaluation_report.metrics["ExactMatch"].mean
+print(f"Accuracy: {accuracy:.2%}")
 ```
@@ -122,6 +123,9 @@ themis compare gpt4-run claude-run
 
 # Start web dashboard
 themis serve
+
+# Share a run
+themis share gpt4-run --output-dir share
 ```
 
 ---
@@ -130,20 +134,28 @@ themis serve
 
 ### 🎯 Built-in Benchmarks
 
-Themis includes
+Themis includes 19 built-in benchmarks out-of-the-box:
 
 ```python
 # Math reasoning
-evaluate(
-evaluate(
-evaluate(
+evaluate("gsm8k", model="gpt-4", limit=100)
+evaluate("math500", model="gpt-4", limit=50)
+evaluate("aime24", model="gpt-4")
 
 # General knowledge
-evaluate(
-evaluate(
+evaluate("mmlu-pro", model="gpt-4", limit=1000)
+evaluate("supergpqa", model="gpt-4")
+
+# Science & medical
+evaluate("gpqa", model="gpt-4", limit=200)
+evaluate("medmcqa", model="gpt-4", limit=200)
+
+# Commonsense & conversational
+evaluate("commonsense_qa", model="gpt-4", limit=200)
+evaluate("coqa", model="gpt-4", limit=200)
 
 # Quick testing
-evaluate(
+evaluate("demo", model="fake-math-llm", limit=10)
 ```
 
 **See all available benchmarks:**
@@ -165,8 +177,7 @@ themis list benchmarks
 
 ```python
 # Use specific metrics
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     metrics=["exact_match", "bleu", "rouge1"],
 )
@@ -192,7 +203,7 @@ print(report.summary())
 
 **CLI:**
 ```bash
-themis compare run-1 run-2 --
+themis compare run-1 run-2 --output comparison.html
 ```
 
 ### 🌐 Web Dashboard
@@ -218,19 +229,19 @@ Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider sup
 
 ```python
 # OpenAI
-evaluate(
+evaluate("gsm8k", model="gpt-4")
 
 # Anthropic
-evaluate(
+evaluate("gsm8k", model="claude-3-opus-20240229")
 
 # Azure OpenAI
-evaluate(
+evaluate("gsm8k", model="azure/gpt-4")
 
 # Local models (vLLM, Ollama, etc.)
-evaluate(
+evaluate("gsm8k", model="ollama/llama3")
 
 # AWS Bedrock
-evaluate(
+evaluate("gsm8k", model="bedrock/anthropic.claude-3")
 ```
 
 ### 💾 Smart Caching
@@ -239,8 +250,7 @@ Themis automatically caches results and resumes failed runs:
 
 ```python
 # Run with caching
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     limit=1000,
     run_id="my-experiment",
@@ -275,14 +285,13 @@ result = evaluate(
     metrics=["exact_match"],
 )
 
-print(result.
+print(result.evaluation_report.metrics["ExactMatch"].mean)
 ```
 
 ### Advanced Configuration
 
 ```python
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
    model="gpt-4",
     temperature=0.7,
     max_tokens=512,
@@ -335,7 +344,7 @@ Themis is built on a clean, modular architecture:
      │            │
 ┌────▼─────┐ ┌────▼─────┐
 │Benchmarks│ │Evaluation│
-│(
+│(19 built-│ │ Pipeline │
 │   in)    │ └────┬─────┘
 └──────────┘      │
              ┌────▼─────┐
@@ -359,7 +368,7 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[
+- **[Backends API](docs/api/backends.md)** - Custom storage and execution
 - **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
 - **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
@@ -382,14 +391,13 @@ class S3StorageBackend(StorageBackend):
     # ... implement other methods
 
 # Use custom backend
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     storage_backend=S3StorageBackend(bucket="my-bucket")
 )
 ```
 
-See [docs/
+See [docs/api/backends.md](docs/api/backends.md) for details.
 
 ### Distributed Execution
 
@@ -401,8 +409,7 @@ class RayExecutionBackend(ExecutionBackend):
     """Distributed execution with Ray"""
     # ... implementation
 
-result = evaluate(
-    benchmark="math500",
+result = evaluate("math500",
     model="gpt-4",
     execution_backend=RayExecutionBackend(num_cpus=32)
 )
@@ -454,10 +461,10 @@ themis eval <benchmark> --model <model> [options]
 themis compare <run-id-1> <run-id-2> [run-id-3...] [options]
 
 # Options:
+#   --metric NAME     Restrict to one metric
 #   --storage PATH    Storage directory
-#   --test STR        Statistical test: t_test, bootstrap, permutation
-#   --alpha FLOAT     Significance level (default: 0.05)
 #   --output FILE     Export report (.json, .html, .md)
+#   --show-diff       Include detailed per-sample differences in summary
 ```
 
 ### Server
@@ -539,6 +546,12 @@ uv run python examples-simple/04_comparison.py
 
 # API server example
 uv run python examples-simple/05_api_server.py
+
+# Resume/cache example
+uv run python examples-simple/08_resume_cache.py
+
+# End-to-end research loop example
+uv run python examples-simple/09_research_loop.py
 ```
 
 ---
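The hunks above capture the main API change in 1.0.0: `evaluate()` now takes the benchmark as its first positional argument instead of a `benchmark=` keyword, and reports expose per-metric aggregates under `evaluation_report.metrics`. A minimal before/after sketch (the 0.2.2 form is reconstructed from the truncated removed lines, so treat it as an assumption):

```python
from themis import evaluate

# 0.2.2 style (assumed; the removed lines are truncated in this diff):
# result = evaluate(benchmark="gsm8k", model="gpt-4", limit=100)

# 1.0.0 style, as shown in the updated README:
report = evaluate("gsm8k", model="gpt-4", limit=100)

# Aggregates are keyed by metric class name on the evaluation report.
accuracy = report.evaluation_report.metrics["ExactMatch"].mean
print(f"Accuracy: {accuracy:.2%}")
```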
{themis_eval-0.2.2 → themis_eval-1.0.0}/README.md

@@ -41,13 +41,14 @@ pip install themis-eval[math,nlp,code,server]
 from themis import evaluate
 
 # Evaluate any model on any benchmark
-
-
+report = evaluate(
+    "gsm8k",
     model="gpt-4",
-    limit=100
+    limit=100,
 )
 
-
+accuracy = report.evaluation_report.metrics["ExactMatch"].mean
+print(f"Accuracy: {accuracy:.2%}")
 ```
@@ -63,6 +64,9 @@ themis compare gpt4-run claude-run
 
 # Start web dashboard
 themis serve
+
+# Share a run
+themis share gpt4-run --output-dir share
 ```
 
 ---
@@ -71,20 +75,28 @@ themis serve
 
 ### 🎯 Built-in Benchmarks
 
-Themis includes
+Themis includes 19 built-in benchmarks out-of-the-box:
 
 ```python
 # Math reasoning
-evaluate(
-evaluate(
-evaluate(
+evaluate("gsm8k", model="gpt-4", limit=100)
+evaluate("math500", model="gpt-4", limit=50)
+evaluate("aime24", model="gpt-4")
 
 # General knowledge
-evaluate(
-evaluate(
+evaluate("mmlu-pro", model="gpt-4", limit=1000)
+evaluate("supergpqa", model="gpt-4")
+
+# Science & medical
+evaluate("gpqa", model="gpt-4", limit=200)
+evaluate("medmcqa", model="gpt-4", limit=200)
+
+# Commonsense & conversational
+evaluate("commonsense_qa", model="gpt-4", limit=200)
+evaluate("coqa", model="gpt-4", limit=200)
 
 # Quick testing
-evaluate(
+evaluate("demo", model="fake-math-llm", limit=10)
 ```
 
 **See all available benchmarks:**
@@ -106,8 +118,7 @@ themis list benchmarks
 
 ```python
 # Use specific metrics
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     metrics=["exact_match", "bleu", "rouge1"],
 )
@@ -133,7 +144,7 @@ print(report.summary())
 
 **CLI:**
 ```bash
-themis compare run-1 run-2 --
+themis compare run-1 run-2 --output comparison.html
 ```
 
 ### 🌐 Web Dashboard
@@ -159,19 +170,19 @@ Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider sup
 
 ```python
 # OpenAI
-evaluate(
+evaluate("gsm8k", model="gpt-4")
 
 # Anthropic
-evaluate(
+evaluate("gsm8k", model="claude-3-opus-20240229")
 
 # Azure OpenAI
-evaluate(
+evaluate("gsm8k", model="azure/gpt-4")
 
 # Local models (vLLM, Ollama, etc.)
-evaluate(
+evaluate("gsm8k", model="ollama/llama3")
 
 # AWS Bedrock
-evaluate(
+evaluate("gsm8k", model="bedrock/anthropic.claude-3")
 ```
 
 ### 💾 Smart Caching
@@ -180,8 +191,7 @@ Themis automatically caches results and resumes failed runs:
 
 ```python
 # Run with caching
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     limit=1000,
     run_id="my-experiment",
@@ -216,14 +226,13 @@ result = evaluate(
     metrics=["exact_match"],
 )
 
-print(result.
+print(result.evaluation_report.metrics["ExactMatch"].mean)
 ```
 
 ### Advanced Configuration
 
 ```python
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     temperature=0.7,
     max_tokens=512,
@@ -276,7 +285,7 @@ Themis is built on a clean, modular architecture:
      │            │
 ┌────▼─────┐ ┌────▼─────┐
 │Benchmarks│ │Evaluation│
-│(
+│(19 built-│ │ Pipeline │
 │   in)    │ └────┬─────┘
 └──────────┘      │
              ┌────▼─────┐
@@ -300,7 +309,7 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[
+- **[Backends API](docs/api/backends.md)** - Custom storage and execution
 - **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
 - **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
@@ -323,14 +332,13 @@ class S3StorageBackend(StorageBackend):
     # ... implement other methods
 
 # Use custom backend
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     storage_backend=S3StorageBackend(bucket="my-bucket")
 )
 ```
 
-See [docs/
+See [docs/api/backends.md](docs/api/backends.md) for details.
 
 ### Distributed Execution
 
@@ -342,8 +350,7 @@ class RayExecutionBackend(ExecutionBackend):
     """Distributed execution with Ray"""
     # ... implementation
 
-result = evaluate(
-    benchmark="math500",
+result = evaluate("math500",
     model="gpt-4",
     execution_backend=RayExecutionBackend(num_cpus=32)
 )
@@ -395,10 +402,10 @@ themis eval <benchmark> --model <model> [options]
 themis compare <run-id-1> <run-id-2> [run-id-3...] [options]
 
 # Options:
+#   --metric NAME     Restrict to one metric
 #   --storage PATH    Storage directory
-#   --test STR        Statistical test: t_test, bootstrap, permutation
-#   --alpha FLOAT     Significance level (default: 0.05)
 #   --output FILE     Export report (.json, .html, .md)
+#   --show-diff       Include detailed per-sample differences in summary
 ```
 
 ### Server
@@ -480,6 +487,12 @@ uv run python examples-simple/04_comparison.py
 
 # API server example
 uv run python examples-simple/05_api_server.py
+
+# Resume/cache example
+uv run python examples-simple/08_resume_cache.py
+
+# End-to-end research loop example
+uv run python examples-simple/09_research_loop.py
 ```
 
 ---
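The Smart Caching hunk keeps `run_id`-based runs; a short sketch of the resume flow it describes (exact cache-hit behavior beyond what the README states is an assumption):

```python
from themis import evaluate

# First call generates results and stores them under run_id.
report = evaluate("gsm8k", model="gpt-4", limit=1000, run_id="my-experiment")

# Repeating the call with the same run_id should reuse cached generations
# instead of re-querying the model, per the "Smart Caching" section.
report = evaluate("gsm8k", model="gpt-4", limit=1000, run_id="my-experiment")
```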
{themis_eval-0.2.2 → themis_eval-1.0.0}/themis/__init__.py

@@ -12,9 +12,10 @@ Extension APIs for registering custom components:
 - themis.register_benchmark() - Register custom benchmark presets
 """
 
-from themis import config, core, evaluation,
+from themis import config, core, evaluation, generation, project, session
 from themis._version import __version__
 from themis.api import evaluate, get_registered_metrics, register_metric
+from themis.session import ExperimentSession
 from themis.datasets import register_dataset, list_datasets, is_dataset_registered
 from themis.presets import register_benchmark, list_benchmarks, get_benchmark_preset
 from themis.providers import register_provider
@@ -39,9 +40,11 @@ __all__ = [
     "config",
     "core",
     "evaluation",
-    "experiment",
     "generation",
     "project",
+    "session",
+    # Session API
+    "ExperimentSession",
     # Version
     "__version__",
 ]
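After this change the package root re-exports the new session API alongside the existing registration helpers. A sketch of the resulting import surface; `ExperimentSession`'s constructor is not shown in this diff, so only the import is demonstrated:

```python
import themis
from themis import ExperimentSession  # new in 1.0.0, defined in themis/session.py

print(themis.__version__)                     # "1.0.0" for this release
print("ExperimentSession" in themis.__all__)  # True, per the updated __all__
```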
themis_eval-1.0.0/themis/_version.py (new file)

@@ -0,0 +1,30 @@
+"""Package version helpers."""
+
+from __future__ import annotations
+
+from importlib import metadata
+from pathlib import Path
+import tomllib
+
+
+def _read_local_pyproject_version() -> str:
+    """Return the version declared in pyproject.toml for local development."""
+    pyproject_path = Path(__file__).resolve().parents[1] / "pyproject.toml"
+    try:
+        with pyproject_path.open("rb") as fh:
+            data = tomllib.load(fh)
+    except FileNotFoundError:
+        return "0.0.0"
+    return data.get("project", {}).get("version", "0.0.0")
+
+
+def _detect_version() -> str:
+    try:
+        return metadata.version("themis-eval")
+    except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
+        return _read_local_pyproject_version()
+
+
+__version__ = _detect_version()
+
+__all__ = ["__version__"]
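The new `_version.py` prefers the installed distribution's metadata and falls back to `pyproject.toml` in a source checkout. The same resolution pattern in isolation (a generic sketch, not Themis code):

```python
from importlib import metadata


def package_version(dist_name: str) -> str:
    """Resolve a distribution's version, mirroring themis._version's approach."""
    try:
        # Installed case: read the version from the package metadata.
        return metadata.version(dist_name)
    except metadata.PackageNotFoundError:
        # Source-checkout case: themis parses pyproject.toml here; this
        # sketch returns a sentinel to stay self-contained.
        return "0.0.0"


print(package_version("themis-eval"))
```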