themis-eval 0.2.2.tar.gz → 1.0.0.tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (175)
  1. {themis_eval-0.2.2/themis_eval.egg-info → themis_eval-1.0.0}/PKG-INFO +47 -34
  2. {themis_eval-0.2.2 → themis_eval-1.0.0}/README.md +46 -33
  3. {themis_eval-0.2.2 → themis_eval-1.0.0}/pyproject.toml +1 -1
  4. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/__init__.py +5 -2
  5. themis_eval-1.0.0/themis/_version.py +30 -0
  6. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/api.py +83 -145
  7. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/backends/storage.py +5 -0
  8. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/info.py +2 -11
  9. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/main.py +231 -40
  10. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/engine.py +7 -13
  11. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/entities.py +4 -0
  12. themis_eval-1.0.0/themis/evaluation/metric_pipeline.py +12 -0
  13. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipeline.py +22 -0
  14. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipelines/__init__.py +4 -0
  15. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipelines/composable_pipeline.py +55 -0
  16. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipelines/standard_pipeline.py +18 -1
  17. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +5 -2
  18. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/judge_evaluation_strategy.py +6 -1
  19. themis_eval-1.0.0/themis/experiment/__init__.py +5 -0
  20. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/cache_manager.py +15 -1
  21. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/definitions.py +1 -1
  22. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/orchestrator.py +21 -11
  23. themis_eval-1.0.0/themis/experiment/share.py +264 -0
  24. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/storage.py +345 -298
  25. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/plan.py +28 -6
  26. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/router.py +22 -4
  27. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/runner.py +16 -1
  28. themis_eval-1.0.0/themis/presets/benchmarks.py +939 -0
  29. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/server/app.py +38 -26
  30. themis_eval-1.0.0/themis/session.py +125 -0
  31. themis_eval-1.0.0/themis/specs/__init__.py +7 -0
  32. themis_eval-1.0.0/themis/specs/execution.py +26 -0
  33. themis_eval-1.0.0/themis/specs/experiment.py +33 -0
  34. themis_eval-1.0.0/themis/specs/storage.py +18 -0
  35. themis_eval-1.0.0/themis/storage/__init__.py +6 -0
  36. themis_eval-1.0.0/themis/storage/experiment_storage.py +7 -0
  37. {themis_eval-0.2.2 → themis_eval-1.0.0/themis_eval.egg-info}/PKG-INFO +47 -34
  38. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/SOURCES.txt +9 -2
  39. themis_eval-0.2.2/themis/_version.py +0 -17
  40. themis_eval-0.2.2/themis/experiment/__init__.py +0 -5
  41. themis_eval-0.2.2/themis/experiment/builder.py +0 -151
  42. themis_eval-0.2.2/themis/experiment/export_csv.py +0 -159
  43. themis_eval-0.2.2/themis/presets/benchmarks.py +0 -354
  44. {themis_eval-0.2.2 → themis_eval-1.0.0}/LICENSE +0 -0
  45. {themis_eval-0.2.2 → themis_eval-1.0.0}/setup.cfg +0 -0
  46. {themis_eval-0.2.2 → themis_eval-1.0.0}/tests/test_package_metadata.py +0 -0
  47. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/backends/__init__.py +0 -0
  48. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/backends/execution.py +0 -0
  49. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/__init__.py +0 -0
  50. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/__main__.py +0 -0
  51. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/__init__.py +0 -0
  52. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/benchmarks.py +0 -0
  53. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/comparison.py +0 -0
  54. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/config_commands.py +0 -0
  55. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/cost.py +0 -0
  56. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/demo.py +0 -0
  57. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/leaderboard.py +0 -0
  58. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/math_benchmarks.py +0 -0
  59. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/mcq_benchmarks.py +0 -0
  60. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/results.py +0 -0
  61. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/sample_run.py +0 -0
  62. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/visualize.py +0 -0
  63. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/new_project.py +0 -0
  64. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/utils.py +0 -0
  65. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/__init__.py +0 -0
  66. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/reports.py +0 -0
  67. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/statistics.py +0 -0
  68. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/__init__.py +0 -0
  69. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/loader.py +0 -0
  70. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/registry.py +0 -0
  71. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/runtime.py +0 -0
  72. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/schema.py +0 -0
  73. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/__init__.py +0 -0
  74. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/conversation.py +0 -0
  75. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/serialization.py +0 -0
  76. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/tools.py +0 -0
  77. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/types.py +0 -0
  78. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/__init__.py +0 -0
  79. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/base.py +0 -0
  80. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/commonsense_qa.py +0 -0
  81. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/competition_math.py +0 -0
  82. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/coqa.py +0 -0
  83. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/gpqa.py +0 -0
  84. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/gsm8k.py +0 -0
  85. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/gsm_symbolic.py +0 -0
  86. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/math500.py +0 -0
  87. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/med_qa.py +0 -0
  88. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/medmcqa.py +0 -0
  89. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/mmlu_pro.py +0 -0
  90. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/piqa.py +0 -0
  91. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/registry.py +0 -0
  92. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/schema.py +0 -0
  93. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/sciq.py +0 -0
  94. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/social_i_qa.py +0 -0
  95. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/super_gpqa.py +0 -0
  96. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/__init__.py +0 -0
  97. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/conditional.py +0 -0
  98. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/__init__.py +0 -0
  99. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/error_taxonomy_extractor.py +0 -0
  100. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/exceptions.py +0 -0
  101. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/identity_extractor.py +0 -0
  102. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/json_field_extractor.py +0 -0
  103. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/math_verify_extractor.py +0 -0
  104. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/regex_extractor.py +0 -0
  105. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/math_verify_utils.py +0 -0
  106. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/__init__.py +0 -0
  107. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/__init__.py +0 -0
  108. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/codebleu.py +0 -0
  109. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/execution.py +0 -0
  110. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/pass_at_k.py +0 -0
  111. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/composite_metric.py +0 -0
  112. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/consistency_metric.py +0 -0
  113. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/exact_match.py +0 -0
  114. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/length_difference_tolerance.py +0 -0
  115. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/math_verify_accuracy.py +0 -0
  116. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/__init__.py +0 -0
  117. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/bertscore.py +0 -0
  118. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/bleu.py +0 -0
  119. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/meteor.py +0 -0
  120. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/rouge.py +0 -0
  121. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/pairwise_judge_metric.py +0 -0
  122. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/response_length.py +0 -0
  123. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/rubric_judge_metric.py +0 -0
  124. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/reports.py +0 -0
  125. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/__init__.py +0 -0
  126. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/bootstrap.py +0 -0
  127. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/confidence_intervals.py +0 -0
  128. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/distributions.py +0 -0
  129. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/effect_sizes.py +0 -0
  130. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/hypothesis_tests.py +0 -0
  131. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/types.py +0 -0
  132. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/__init__.py +0 -0
  133. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/default_evaluation_strategy.py +0 -0
  134. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/evaluation_strategy.py +0 -0
  135. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/comparison.py +0 -0
  136. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/cost.py +0 -0
  137. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/export.py +0 -0
  138. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/integration_manager.py +0 -0
  139. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/math.py +0 -0
  140. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/mcq.py +0 -0
  141. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/pricing.py +0 -0
  142. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/visualization.py +0 -0
  143. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/__init__.py +0 -0
  144. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/agentic_runner.py +0 -0
  145. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/batching.py +0 -0
  146. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/clients.py +0 -0
  147. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/conversation_runner.py +0 -0
  148. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/providers/litellm_provider.py +0 -0
  149. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/providers/vllm_provider.py +0 -0
  150. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/strategies.py +0 -0
  151. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/templates.py +0 -0
  152. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/turn_strategies.py +0 -0
  153. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/types.py +0 -0
  154. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/integrations/__init__.py +0 -0
  155. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/integrations/huggingface.py +0 -0
  156. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/integrations/wandb.py +0 -0
  157. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/interfaces/__init__.py +0 -0
  158. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/presets/__init__.py +0 -0
  159. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/presets/models.py +0 -0
  160. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/project/__init__.py +0 -0
  161. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/project/definitions.py +0 -0
  162. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/project/patterns.py +0 -0
  163. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/providers/__init__.py +0 -0
  164. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/providers/registry.py +0 -0
  165. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/py.typed +0 -0
  166. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/server/__init__.py +0 -0
  167. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/api_generator.py +0 -0
  168. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/cost_tracking.py +0 -0
  169. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/dashboard.py +0 -0
  170. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/logging_utils.py +0 -0
  171. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/progress.py +0 -0
  172. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/tracing.py +0 -0
  173. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/dependency_links.txt +0 -0
  174. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/requires.txt +0 -0
  175. {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/top_level.txt +0 -0
--- themis_eval-0.2.2/themis_eval.egg-info/PKG-INFO
+++ themis_eval-1.0.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.2
+Version: 1.0.0
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -100,13 +100,14 @@ pip install themis-eval[math,nlp,code,server]
 from themis import evaluate
 
 # Evaluate any model on any benchmark
-result = evaluate(
-    benchmark="gsm8k",
+report = evaluate(
+    "gsm8k",
     model="gpt-4",
-    limit=100
+    limit=100,
 )
 
-print(f"Accuracy: {result.metrics['exact_match']:.2%}")
+accuracy = report.evaluation_report.metrics["ExactMatch"].mean
+print(f"Accuracy: {accuracy:.2%}")
 ```
 
 ### CLI Usage
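
The quick-start hunk above is the headline API change in 1.0.0: `evaluate` now takes the benchmark name as a positional argument and returns a structured report instead of exposing a flat `metrics` dict. A minimal before/after sketch, using only the names visible in this diff (report fields beyond `evaluation_report.metrics[...].mean` are not assumed to exist):

```python
from themis import evaluate

# 0.2.2 (removed): result = evaluate(benchmark="gsm8k", model="gpt-4", limit=100)
# 1.0.0: the benchmark is positional, and metrics are keyed by class-style
# names ("ExactMatch" rather than "exact_match").
report = evaluate("gsm8k", model="gpt-4", limit=100)

accuracy = report.evaluation_report.metrics["ExactMatch"].mean  # per the README hunk above
print(f"Accuracy: {accuracy:.2%}")
```
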
@@ -122,6 +123,9 @@ themis compare gpt4-run claude-run
 
 # Start web dashboard
 themis serve
+
+# Share a run
+themis share gpt4-run --output-dir share
 ```
 
 ---
@@ -130,20 +134,28 @@ themis serve
 
 ### 🎯 Built-in Benchmarks
 
-Themis includes 6 popular benchmarks out-of-the-box:
+Themis includes 19 built-in benchmarks out-of-the-box:
 
 ```python
 # Math reasoning
-evaluate(benchmark="gsm8k", model="gpt-4", limit=100)
-evaluate(benchmark="math500", model="gpt-4", limit=50)
-evaluate(benchmark="aime24", model="gpt-4")
+evaluate("gsm8k", model="gpt-4", limit=100)
+evaluate("math500", model="gpt-4", limit=50)
+evaluate("aime24", model="gpt-4")
 
 # General knowledge
-evaluate(benchmark="mmlu_pro", model="gpt-4", limit=1000)
-evaluate(benchmark="supergpqa", model="gpt-4")
+evaluate("mmlu-pro", model="gpt-4", limit=1000)
+evaluate("supergpqa", model="gpt-4")
+
+# Science & medical
+evaluate("gpqa", model="gpt-4", limit=200)
+evaluate("medmcqa", model="gpt-4", limit=200)
+
+# Commonsense & conversational
+evaluate("commonsense_qa", model="gpt-4", limit=200)
+evaluate("coqa", model="gpt-4", limit=200)
 
 # Quick testing
-evaluate(benchmark="demo", model="fake-math-llm", limit=10)
+evaluate("demo", model="fake-math-llm", limit=10)
 ```
 
 **See all available benchmarks:**
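
The `themis list benchmarks` CLI shown in the next hunk has a Python-side counterpart: `list_benchmarks` is imported at the top level in `themis/__init__.py` (see the hunk near the end of this diff). A hedged sketch, assuming it returns an iterable of preset names:

```python
import themis

# list_benchmarks is re-exported from themis.presets per the __init__ hunk
# below; the exact return type is an assumption (treated here as an
# iterable of benchmark names).
for name in themis.list_benchmarks():
    print(name)
```
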
@@ -165,8 +177,7 @@ themis list benchmarks
 
 ```python
 # Use specific metrics
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     metrics=["exact_match", "bleu", "rouge1"],
 )
@@ -192,7 +203,7 @@ print(report.summary())
 
 **CLI:**
 ```bash
-themis compare run-1 run-2 --test bootstrap --output comparison.html
+themis compare run-1 run-2 --output comparison.html
 ```
 
 ### 🌐 Web Dashboard
@@ -218,19 +229,19 @@ Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider sup
 
 ```python
 # OpenAI
-evaluate(benchmark="gsm8k", model="gpt-4")
+evaluate("gsm8k", model="gpt-4")
 
 # Anthropic
-evaluate(benchmark="gsm8k", model="claude-3-opus-20240229")
+evaluate("gsm8k", model="claude-3-opus-20240229")
 
 # Azure OpenAI
-evaluate(benchmark="gsm8k", model="azure/gpt-4")
+evaluate("gsm8k", model="azure/gpt-4")
 
 # Local models (vLLM, Ollama, etc.)
-evaluate(benchmark="gsm8k", model="ollama/llama3")
+evaluate("gsm8k", model="ollama/llama3")
 
 # AWS Bedrock
-evaluate(benchmark="gsm8k", model="bedrock/anthropic.claude-3")
+evaluate("gsm8k", model="bedrock/anthropic.claude-3")
 ```
 
 ### 💾 Smart Caching
@@ -239,8 +250,7 @@ Themis automatically caches results and resumes failed runs:
 
 ```python
 # Run with caching
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     limit=1000,
     run_id="my-experiment",
@@ -275,14 +285,13 @@ result = evaluate(
     metrics=["exact_match"],
 )
 
-print(result.report)
+print(result.evaluation_report.metrics["ExactMatch"].mean)
 ```
 
 ### Advanced Configuration
 
 ```python
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     temperature=0.7,
     max_tokens=512,
@@ -335,7 +344,7 @@ Themis is built on a clean, modular architecture:
      │                    │
 ┌────▼─────┐         ┌────▼─────┐
 │Benchmarks│         │Evaluation│
-│(6 built- │         │ Pipeline │
+│(19 built-│         │ Pipeline │
 │   in)    │         └────┬─────┘
 └──────────┘              │
                      ┌────▼─────┐
@@ -359,7 +368,7 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[Backends API](docs/api/backends.md)** - Custom storage and execution
 - **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
 - **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
@@ -382,14 +391,13 @@ class S3StorageBackend(StorageBackend):
     # ... implement other methods
 
 # Use custom backend
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     storage_backend=S3StorageBackend(bucket="my-bucket")
 )
 ```
 
-See [docs/customization/backends.md](docs/customization/backends.md) for details.
+See [docs/api/backends.md](docs/api/backends.md) for details.
 
 ### Distributed Execution
 
@@ -401,8 +409,7 @@ class RayExecutionBackend(ExecutionBackend):
     """Distributed execution with Ray"""
     # ... implementation
 
-result = evaluate(
-    benchmark="math500",
+result = evaluate("math500",
     model="gpt-4",
     execution_backend=RayExecutionBackend(num_cpus=32)
 )
@@ -454,10 +461,10 @@ themis eval <benchmark> --model <model> [options]
 themis compare <run-id-1> <run-id-2> [run-id-3...] [options]
 
 # Options:
+#   --metric NAME     Restrict to one metric
 #   --storage PATH    Storage directory
-#   --test STR        Statistical test: t_test, bootstrap, permutation
-#   --alpha FLOAT     Significance level (default: 0.05)
 #   --output FILE     Export report (.json, .html, .md)
+#   --show-diff       Include detailed per-sample differences in summary
 ```
 
 ### Server
@@ -539,6 +546,12 @@ uv run python examples-simple/04_comparison.py
 
 # API server example
 uv run python examples-simple/05_api_server.py
+
+# Resume/cache example
+uv run python examples-simple/08_resume_cache.py
+
+# End-to-end research loop example
+uv run python examples-simple/09_research_loop.py
 ```
 
 ---
--- themis_eval-0.2.2/README.md
+++ themis_eval-1.0.0/README.md
@@ -41,13 +41,14 @@ pip install themis-eval[math,nlp,code,server]
 from themis import evaluate
 
 # Evaluate any model on any benchmark
-result = evaluate(
-    benchmark="gsm8k",
+report = evaluate(
+    "gsm8k",
     model="gpt-4",
-    limit=100
+    limit=100,
 )
 
-print(f"Accuracy: {result.metrics['exact_match']:.2%}")
+accuracy = report.evaluation_report.metrics["ExactMatch"].mean
+print(f"Accuracy: {accuracy:.2%}")
 ```
 
 ### CLI Usage
@@ -63,6 +64,9 @@ themis compare gpt4-run claude-run
 
 # Start web dashboard
 themis serve
+
+# Share a run
+themis share gpt4-run --output-dir share
 ```
 
 ---
@@ -71,20 +75,28 @@ themis serve
 
 ### 🎯 Built-in Benchmarks
 
-Themis includes 6 popular benchmarks out-of-the-box:
+Themis includes 19 built-in benchmarks out-of-the-box:
 
 ```python
 # Math reasoning
-evaluate(benchmark="gsm8k", model="gpt-4", limit=100)
-evaluate(benchmark="math500", model="gpt-4", limit=50)
-evaluate(benchmark="aime24", model="gpt-4")
+evaluate("gsm8k", model="gpt-4", limit=100)
+evaluate("math500", model="gpt-4", limit=50)
+evaluate("aime24", model="gpt-4")
 
 # General knowledge
-evaluate(benchmark="mmlu_pro", model="gpt-4", limit=1000)
-evaluate(benchmark="supergpqa", model="gpt-4")
+evaluate("mmlu-pro", model="gpt-4", limit=1000)
+evaluate("supergpqa", model="gpt-4")
+
+# Science & medical
+evaluate("gpqa", model="gpt-4", limit=200)
+evaluate("medmcqa", model="gpt-4", limit=200)
+
+# Commonsense & conversational
+evaluate("commonsense_qa", model="gpt-4", limit=200)
+evaluate("coqa", model="gpt-4", limit=200)
 
 # Quick testing
-evaluate(benchmark="demo", model="fake-math-llm", limit=10)
+evaluate("demo", model="fake-math-llm", limit=10)
 ```
 
 **See all available benchmarks:**
@@ -106,8 +118,7 @@ themis list benchmarks
 
 ```python
 # Use specific metrics
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     metrics=["exact_match", "bleu", "rouge1"],
 )
@@ -133,7 +144,7 @@ print(report.summary())
 
 **CLI:**
 ```bash
-themis compare run-1 run-2 --test bootstrap --output comparison.html
+themis compare run-1 run-2 --output comparison.html
 ```
 
 ### 🌐 Web Dashboard
@@ -159,19 +170,19 @@ Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider sup
 
 ```python
 # OpenAI
-evaluate(benchmark="gsm8k", model="gpt-4")
+evaluate("gsm8k", model="gpt-4")
 
 # Anthropic
-evaluate(benchmark="gsm8k", model="claude-3-opus-20240229")
+evaluate("gsm8k", model="claude-3-opus-20240229")
 
 # Azure OpenAI
-evaluate(benchmark="gsm8k", model="azure/gpt-4")
+evaluate("gsm8k", model="azure/gpt-4")
 
 # Local models (vLLM, Ollama, etc.)
-evaluate(benchmark="gsm8k", model="ollama/llama3")
+evaluate("gsm8k", model="ollama/llama3")
 
 # AWS Bedrock
-evaluate(benchmark="gsm8k", model="bedrock/anthropic.claude-3")
+evaluate("gsm8k", model="bedrock/anthropic.claude-3")
 ```
 
 ### 💾 Smart Caching
@@ -180,8 +191,7 @@ Themis automatically caches results and resumes failed runs:
 
 ```python
 # Run with caching
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     limit=1000,
     run_id="my-experiment",
@@ -216,14 +226,13 @@ result = evaluate(
     metrics=["exact_match"],
 )
 
-print(result.report)
+print(result.evaluation_report.metrics["ExactMatch"].mean)
 ```
 
 ### Advanced Configuration
 
 ```python
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     temperature=0.7,
     max_tokens=512,
@@ -276,7 +285,7 @@ Themis is built on a clean, modular architecture:
      │                    │
 ┌────▼─────┐         ┌────▼─────┐
 │Benchmarks│         │Evaluation│
-│(6 built- │         │ Pipeline │
+│(19 built-│         │ Pipeline │
 │   in)    │         └────┬─────┘
 └──────────┘              │
                      ┌────▼─────┐
@@ -300,7 +309,7 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[Backends API](docs/api/backends.md)** - Custom storage and execution
 - **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
 - **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
@@ -323,14 +332,13 @@ class S3StorageBackend(StorageBackend):
     # ... implement other methods
 
 # Use custom backend
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     storage_backend=S3StorageBackend(bucket="my-bucket")
 )
 ```
 
-See [docs/customization/backends.md](docs/customization/backends.md) for details.
+See [docs/api/backends.md](docs/api/backends.md) for details.
 
 ### Distributed Execution
 
@@ -342,8 +350,7 @@ class RayExecutionBackend(ExecutionBackend):
     """Distributed execution with Ray"""
     # ... implementation
 
-result = evaluate(
-    benchmark="math500",
+result = evaluate("math500",
     model="gpt-4",
     execution_backend=RayExecutionBackend(num_cpus=32)
 )
@@ -395,10 +402,10 @@ themis eval <benchmark> --model <model> [options]
 themis compare <run-id-1> <run-id-2> [run-id-3...] [options]
 
 # Options:
+#   --metric NAME     Restrict to one metric
 #   --storage PATH    Storage directory
-#   --test STR        Statistical test: t_test, bootstrap, permutation
-#   --alpha FLOAT     Significance level (default: 0.05)
 #   --output FILE     Export report (.json, .html, .md)
+#   --show-diff       Include detailed per-sample differences in summary
 ```
 
 ### Server
@@ -480,6 +487,12 @@ uv run python examples-simple/04_comparison.py
 
 # API server example
 uv run python examples-simple/05_api_server.py
+
+# Resume/cache example
+uv run python examples-simple/08_resume_cache.py
+
+# End-to-end research loop example
+uv run python examples-simple/09_research_loop.py
 ```
 
 ---
--- themis_eval-0.2.2/pyproject.toml
+++ themis_eval-1.0.0/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "themis-eval"
-version = "0.2.2"
+version = "1.0.0"
 description = "Lightweight evaluation platform for LLM experiments"
 readme = "README.md"
 requires-python = ">=3.12"
--- themis_eval-0.2.2/themis/__init__.py
+++ themis_eval-1.0.0/themis/__init__.py
@@ -12,9 +12,10 @@ Extension APIs for registering custom components:
 - themis.register_benchmark() - Register custom benchmark presets
 """
 
-from themis import config, core, evaluation, experiment, generation, project
+from themis import config, core, evaluation, generation, project, session
 from themis._version import __version__
 from themis.api import evaluate, get_registered_metrics, register_metric
+from themis.session import ExperimentSession
 from themis.datasets import register_dataset, list_datasets, is_dataset_registered
 from themis.presets import register_benchmark, list_benchmarks, get_benchmark_preset
 from themis.providers import register_provider
@@ -39,9 +40,11 @@ __all__ = [
     "config",
     "core",
     "evaluation",
-    "experiment",
     "generation",
     "project",
+    "session",
+    # Session API
+    "ExperimentSession",
     # Version
     "__version__",
 ]
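
Taken together, these two hunks remove the public `experiment` module in favor of a new `session` module and promote `ExperimentSession` to a top-level export. A smoke check of the 1.0.0 import surface, touching only names visible in this diff (`ExperimentSession`'s constructor and methods live in the new `themis/session.py`, which is not shown here, so nothing is instantiated):

```python
import themis

print(themis.__version__)         # resolved by themis/_version.py (next hunk)
print(themis.ExperimentSession)   # new top-level export in 1.0.0
print(callable(themis.evaluate))  # unchanged entry point
```
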
--- /dev/null
+++ themis_eval-1.0.0/themis/_version.py
@@ -0,0 +1,30 @@
+"""Package version helpers."""
+
+from __future__ import annotations
+
+from importlib import metadata
+from pathlib import Path
+import tomllib
+
+
+def _read_local_pyproject_version() -> str:
+    """Return the version declared in pyproject.toml for local development."""
+    pyproject_path = Path(__file__).resolve().parents[1] / "pyproject.toml"
+    try:
+        with pyproject_path.open("rb") as fh:
+            data = tomllib.load(fh)
+    except FileNotFoundError:
+        return "0.0.0"
+    return data.get("project", {}).get("version", "0.0.0")
+
+
+def _detect_version() -> str:
+    try:
+        return metadata.version("themis-eval")
+    except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
+        return _read_local_pyproject_version()
+
+
+__version__ = _detect_version()
+
+__all__ = ["__version__"]
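
The new `_version.py` replaces the static 0.2.2 version file: it resolves the version from installed distribution metadata first, then falls back to reading the adjacent `pyproject.toml` via the stdlib `tomllib` for source checkouts, defaulting to `"0.0.0"` when neither is available. A sketch of the same lookup order using only stdlib calls:

```python
from importlib import metadata

try:
    # Installed package (wheel/sdist): importlib.metadata wins.
    print(metadata.version("themis-eval"))
except metadata.PackageNotFoundError:
    # Source checkout without an install: _version.py reads the version
    # from pyproject.toml instead, defaulting to "0.0.0" if it is missing.
    print("not installed; pyproject.toml fallback applies")
```
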