themis-eval 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. {themis_eval-0.2.0/themis_eval.egg-info → themis_eval-0.2.2}/PKG-INFO +6 -5
  2. {themis_eval-0.2.0 → themis_eval-0.2.2}/README.md +4 -4
  3. {themis_eval-0.2.0 → themis_eval-0.2.2}/pyproject.toml +5 -1
  4. themis_eval-0.2.2/themis/__init__.py +47 -0
  5. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/_version.py +1 -1
  6. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/api.py +156 -17
  7. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/orchestrator.py +61 -5
  8. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/storage.py +163 -19
  9. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/providers/litellm_provider.py +46 -0
  10. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/runner.py +22 -6
  11. themis_eval-0.2.2/themis/presets/__init__.py +21 -0
  12. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/utils/logging_utils.py +8 -3
  13. themis_eval-0.2.2/themis/utils/progress.py +77 -0
  14. {themis_eval-0.2.0 → themis_eval-0.2.2/themis_eval.egg-info}/PKG-INFO +6 -5
  15. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis_eval.egg-info/requires.txt +1 -0
  16. themis_eval-0.2.0/themis/__init__.py +0 -25
  17. themis_eval-0.2.0/themis/presets/__init__.py +0 -10
  18. themis_eval-0.2.0/themis/utils/progress.py +0 -58
  19. {themis_eval-0.2.0 → themis_eval-0.2.2}/LICENSE +0 -0
  20. {themis_eval-0.2.0 → themis_eval-0.2.2}/setup.cfg +0 -0
  21. {themis_eval-0.2.0 → themis_eval-0.2.2}/tests/test_package_metadata.py +0 -0
  22. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/backends/__init__.py +0 -0
  23. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/backends/execution.py +0 -0
  24. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/backends/storage.py +0 -0
  25. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/__init__.py +0 -0
  26. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/__main__.py +0 -0
  27. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/__init__.py +0 -0
  28. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/benchmarks.py +0 -0
  29. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/comparison.py +0 -0
  30. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/config_commands.py +0 -0
  31. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/cost.py +0 -0
  32. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/demo.py +0 -0
  33. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/info.py +0 -0
  34. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/leaderboard.py +0 -0
  35. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/math_benchmarks.py +0 -0
  36. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/mcq_benchmarks.py +0 -0
  37. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/results.py +0 -0
  38. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/sample_run.py +0 -0
  39. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/visualize.py +0 -0
  40. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/main.py +0 -0
  41. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/new_project.py +0 -0
  42. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/utils.py +0 -0
  43. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/comparison/__init__.py +0 -0
  44. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/comparison/engine.py +0 -0
  45. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/comparison/reports.py +0 -0
  46. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/comparison/statistics.py +0 -0
  47. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/config/__init__.py +0 -0
  48. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/config/loader.py +0 -0
  49. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/config/registry.py +0 -0
  50. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/config/runtime.py +0 -0
  51. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/config/schema.py +0 -0
  52. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/__init__.py +0 -0
  53. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/conversation.py +0 -0
  54. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/entities.py +0 -0
  55. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/serialization.py +0 -0
  56. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/tools.py +0 -0
  57. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/types.py +0 -0
  58. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/__init__.py +0 -0
  59. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/base.py +0 -0
  60. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/commonsense_qa.py +0 -0
  61. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/competition_math.py +0 -0
  62. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/coqa.py +0 -0
  63. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/gpqa.py +0 -0
  64. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/gsm8k.py +0 -0
  65. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/gsm_symbolic.py +0 -0
  66. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/math500.py +0 -0
  67. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/med_qa.py +0 -0
  68. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/medmcqa.py +0 -0
  69. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/mmlu_pro.py +0 -0
  70. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/piqa.py +0 -0
  71. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/registry.py +0 -0
  72. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/schema.py +0 -0
  73. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/sciq.py +0 -0
  74. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/social_i_qa.py +0 -0
  75. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/super_gpqa.py +0 -0
  76. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/__init__.py +0 -0
  77. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/conditional.py +0 -0
  78. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/__init__.py +0 -0
  79. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/error_taxonomy_extractor.py +0 -0
  80. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/exceptions.py +0 -0
  81. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/identity_extractor.py +0 -0
  82. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/json_field_extractor.py +0 -0
  83. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/math_verify_extractor.py +0 -0
  84. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/regex_extractor.py +0 -0
  85. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/math_verify_utils.py +0 -0
  86. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/__init__.py +0 -0
  87. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/code/__init__.py +0 -0
  88. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/code/codebleu.py +0 -0
  89. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/code/execution.py +0 -0
  90. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/code/pass_at_k.py +0 -0
  91. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/composite_metric.py +0 -0
  92. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/consistency_metric.py +0 -0
  93. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/exact_match.py +0 -0
  94. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/length_difference_tolerance.py +0 -0
  95. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/math_verify_accuracy.py +0 -0
  96. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/__init__.py +0 -0
  97. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/bertscore.py +0 -0
  98. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/bleu.py +0 -0
  99. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/meteor.py +0 -0
  100. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/rouge.py +0 -0
  101. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/pairwise_judge_metric.py +0 -0
  102. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/response_length.py +0 -0
  103. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/rubric_judge_metric.py +0 -0
  104. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/pipeline.py +0 -0
  105. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/pipelines/__init__.py +0 -0
  106. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/pipelines/composable_pipeline.py +0 -0
  107. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/pipelines/standard_pipeline.py +0 -0
  108. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/reports.py +0 -0
  109. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/__init__.py +0 -0
  110. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/bootstrap.py +0 -0
  111. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/confidence_intervals.py +0 -0
  112. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/distributions.py +0 -0
  113. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/effect_sizes.py +0 -0
  114. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/hypothesis_tests.py +0 -0
  115. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/types.py +0 -0
  116. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/strategies/__init__.py +0 -0
  117. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +0 -0
  118. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/strategies/default_evaluation_strategy.py +0 -0
  119. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/strategies/evaluation_strategy.py +0 -0
  120. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/strategies/judge_evaluation_strategy.py +0 -0
  121. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/__init__.py +0 -0
  122. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/builder.py +0 -0
  123. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/cache_manager.py +0 -0
  124. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/comparison.py +0 -0
  125. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/cost.py +0 -0
  126. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/definitions.py +0 -0
  127. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/export.py +0 -0
  128. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/export_csv.py +0 -0
  129. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/integration_manager.py +0 -0
  130. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/math.py +0 -0
  131. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/mcq.py +0 -0
  132. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/pricing.py +0 -0
  133. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/visualization.py +0 -0
  134. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/__init__.py +0 -0
  135. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/agentic_runner.py +0 -0
  136. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/batching.py +0 -0
  137. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/clients.py +0 -0
  138. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/conversation_runner.py +0 -0
  139. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/plan.py +0 -0
  140. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/providers/vllm_provider.py +0 -0
  141. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/router.py +0 -0
  142. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/strategies.py +0 -0
  143. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/templates.py +0 -0
  144. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/turn_strategies.py +0 -0
  145. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/types.py +0 -0
  146. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/integrations/__init__.py +0 -0
  147. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/integrations/huggingface.py +0 -0
  148. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/integrations/wandb.py +0 -0
  149. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/interfaces/__init__.py +0 -0
  150. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/presets/benchmarks.py +0 -0
  151. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/presets/models.py +0 -0
  152. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/project/__init__.py +0 -0
  153. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/project/definitions.py +0 -0
  154. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/project/patterns.py +0 -0
  155. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/providers/__init__.py +0 -0
  156. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/providers/registry.py +0 -0
  157. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/py.typed +0 -0
  158. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/server/__init__.py +0 -0
  159. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/server/app.py +0 -0
  160. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/utils/api_generator.py +0 -0
  161. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/utils/cost_tracking.py +0 -0
  162. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/utils/dashboard.py +0 -0
  163. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/utils/tracing.py +0 -0
  164. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis_eval.egg-info/SOURCES.txt +0 -0
  165. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis_eval.egg-info/dependency_links.txt +0 -0
  166. {themis_eval-0.2.0 → themis_eval-0.2.2}/themis_eval.egg-info/top_level.txt +0 -0
{themis_eval-0.2.0/themis_eval.egg-info → themis_eval-0.2.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.0
+Version: 0.2.2
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -25,6 +25,7 @@ Requires-Dist: tabulate>=0.9.0
 Requires-Dist: tenacity>=9.1.2
 Requires-Dist: plotly>=6.5.0
 Requires-Dist: math-verify>=0.8.0
+Requires-Dist: rich>=14.2.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
@@ -358,9 +359,9 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/EXTENDING_BACKENDS.md)** - Custom storage and execution
-- **[API Server](docs/API_SERVER.md)** - Web dashboard and REST API
-- **[Comparison Engine](docs/COMPARISON.md)** - Statistical testing guide
+- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
+- **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
 ---
 
@@ -388,7 +389,7 @@ result = evaluate(
 )
 ```
 
-See [EXTENDING_BACKENDS.md](docs/EXTENDING_BACKENDS.md) for details.
+See [docs/customization/backends.md](docs/customization/backends.md) for details.
 
 ### Distributed Execution
 
{themis_eval-0.2.0 → themis_eval-0.2.2}/README.md

@@ -300,9 +300,9 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/EXTENDING_BACKENDS.md)** - Custom storage and execution
-- **[API Server](docs/API_SERVER.md)** - Web dashboard and REST API
-- **[Comparison Engine](docs/COMPARISON.md)** - Statistical testing guide
+- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
+- **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
 ---
 
@@ -330,7 +330,7 @@ result = evaluate(
 )
 ```
 
-See [EXTENDING_BACKENDS.md](docs/EXTENDING_BACKENDS.md) for details.
+See [docs/customization/backends.md](docs/customization/backends.md) for details.
 
 ### Distributed Execution
 
{themis_eval-0.2.0 → themis_eval-0.2.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "themis-eval"
-version = "0.2.0"
+version = "0.2.2"
 description = "Lightweight evaluation platform for LLM experiments"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -32,6 +32,7 @@ dependencies = [
     "tenacity>=9.1.2",
     "plotly>=6.5.0",
     "math-verify>=0.8.0",
+    "rich>=14.2.0",
 ]
 
 [tool.setuptools.packages.find]
@@ -85,3 +86,6 @@ all = [
 [tool.pytest.ini_options]
 addopts = "-q"
 pythonpath = ["."]
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+]
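The new `slow` marker only has an effect once tests opt into it. A minimal sketch of how a test suite might use it — the test name and the sleep are illustrative, not taken from this package:

```python
import time

import pytest


@pytest.mark.slow  # marker registered in [tool.pytest.ini_options] above
def test_full_benchmark_run():
    # Stand-in for an expensive end-to-end evaluation.
    time.sleep(5)
```

Marked tests can then be skipped with the expression quoted in the marker description: `pytest -m "not slow"`.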
themis_eval-0.2.2/themis/__init__.py (new file)

@@ -0,0 +1,47 @@
+"""Themis experiment platform - Dead simple LLM evaluation.
+
+The primary interface is the `evaluate()` function:
+
+    import themis
+    report = themis.evaluate("math500", model="gpt-4", limit=100)
+
+Extension APIs for registering custom components:
+- themis.register_metric() - Register custom metrics
+- themis.register_dataset() - Register custom datasets
+- themis.register_provider() - Register custom model providers
+- themis.register_benchmark() - Register custom benchmark presets
+"""
+
+from themis import config, core, evaluation, experiment, generation, project
+from themis._version import __version__
+from themis.api import evaluate, get_registered_metrics, register_metric
+from themis.datasets import register_dataset, list_datasets, is_dataset_registered
+from themis.presets import register_benchmark, list_benchmarks, get_benchmark_preset
+from themis.providers import register_provider
+
+__all__ = [
+    # Main API
+    "evaluate",
+    # Metrics
+    "register_metric",
+    "get_registered_metrics",
+    # Datasets
+    "register_dataset",
+    "list_datasets",
+    "is_dataset_registered",
+    # Benchmarks
+    "register_benchmark",
+    "list_benchmarks",
+    "get_benchmark_preset",
+    # Providers
+    "register_provider",
+    # Submodules
+    "config",
+    "core",
+    "evaluation",
+    "experiment",
+    "generation",
+    "project",
+    # Version
+    "__version__",
+]
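With the package flattened into a single namespace, everyday use needs only the top-level import. A hedged sketch of both call styles the docstring advertises — the custom-dataset row keys (`id`, `question`, `answer`) are illustrative, since the exact row schema is defined by `evaluate()` itself:

```python
import themis

# Benchmark preset by name, exactly as in the module docstring.
report = themis.evaluate("math500", model="gpt-4", limit=100)

# evaluate() also accepts an in-memory sequence of dicts (see its
# benchmark_or_dataset parameter below); these row keys are illustrative.
rows = [{"id": "ex-1", "question": "What is 1 + 1?", "answer": "2"}]
report = themis.evaluate(rows, model="gpt-4", metrics=["exact_match"])
```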
{themis_eval-0.2.0 → themis_eval-0.2.2}/themis/_version.py

@@ -9,7 +9,7 @@ def _detect_version() -> str:
     try:
         return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return "0.2.0"  # Fallback for development
+        return "0.2.2"  # Fallback for development
 
 
 __version__ = _detect_version()
{themis_eval-0.2.0 → themis_eval-0.2.2}/themis/api.py

@@ -33,6 +33,7 @@ Example:
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Callable, Sequence
@@ -52,6 +53,67 @@ from themis.generation.runner import GenerationRunner
 from themis.generation.templates import PromptTemplate
 from themis.providers import create_provider
 
+# Import provider modules to ensure they register themselves
+try:
+    from themis.generation import clients  # noqa: F401 - registers fake provider
+    from themis.generation.providers import (
+        litellm_provider,  # noqa: F401
+        vllm_provider,  # noqa: F401
+    )
+except ImportError:
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+# Module-level metrics registry for custom metrics
+_METRICS_REGISTRY: dict[str, type] = {}
+
+
+def register_metric(name: str, metric_cls: type) -> None:
+    """Register a custom metric for use in evaluate().
+
+    This allows users to add their own metrics to Themis without modifying
+    the source code. Registered metrics can be used by passing their names
+    to the `metrics` parameter in evaluate().
+
+    Args:
+        name: Metric name (used in evaluate(metrics=[name]))
+        metric_cls: Metric class implementing the Metric interface.
+            Must have a compute() method that takes prediction, references,
+            and metadata parameters.
+
+    Raises:
+        TypeError: If metric_cls is not a class
+        ValueError: If metric_cls doesn't implement the required interface
+
+    Example:
+        >>> from themis.evaluation.metrics import MyCustomMetric
+        >>> themis.register_metric("my_metric", MyCustomMetric)
+        >>> report = themis.evaluate("math500", model="gpt-4", metrics=["my_metric"])
+    """
+    if not isinstance(metric_cls, type):
+        raise TypeError(f"metric_cls must be a class, got {type(metric_cls)}")
+
+    # Validate that it implements the Metric interface
+    if not hasattr(metric_cls, "compute"):
+        raise ValueError(
+            f"{metric_cls.__name__} must implement compute() method. "
+            f"See themis.evaluation.metrics for examples."
+        )
+
+    _METRICS_REGISTRY[name] = metric_cls
+    logger.info(f"Registered custom metric: {name} -> {metric_cls.__name__}")
+
+
+def get_registered_metrics() -> dict[str, type]:
+    """Get all currently registered custom metrics.
+
+    Returns:
+        Dictionary mapping metric names to their classes
+    """
+    return _METRICS_REGISTRY.copy()
+
 
 def evaluate(
     benchmark_or_dataset: str | Sequence[dict[str, Any]],
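Tying the new hook to the entry point: a toy metric that satisfies the documented contract — a class whose `compute()` takes prediction, references, and metadata. The class body is a sketch; the real `Metric` interface lives in `themis.evaluation.metrics`, and the `name` attribute is assumed from the `m.name` logging later in this diff:

```python
import themis


class ResponseWordCount:
    """Toy metric: counts words in the extracted prediction."""

    name = "word_count"  # assumed attribute; built-ins are logged via m.name

    def compute(self, prediction, references, metadata=None):
        # Signature follows the register_metric() docstring above.
        return len(str(prediction).split())


themis.register_metric("word_count", ResponseWordCount)
report = themis.evaluate("math500", model="gpt-4", metrics=["word_count"])
```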
@@ -123,6 +185,19 @@
         >>> print(f"Accuracy: {report.evaluation_report.metrics['accuracy']:.2%}")
         Accuracy: 85.00%
     """
+    logger.info("=" * 60)
+    logger.info("Starting Themis evaluation")
+    logger.info(f"Model: {model}")
+    logger.info(f"Workers: {workers}")
+    logger.info(f"Temperature: {temperature}, Max tokens: {max_tokens}")
+    if "api_base" in kwargs:
+        logger.info(f"Custom API base: {kwargs['api_base']}")
+    if "api_key" in kwargs:
+        logger.info("API key: <provided>")
+    else:
+        logger.warning("⚠️ No api_key provided - may fail for custom API endpoints")
+    logger.info("=" * 60)
+
     # Import presets system (lazy import to avoid circular dependencies)
     from themis.presets import get_benchmark_preset, parse_model_name
 
@@ -131,11 +206,23 @@
 
     if is_benchmark:
         benchmark_name = benchmark_or_dataset
+        logger.info(f"Loading benchmark: {benchmark_name}")
+
         # Get preset configuration
-        preset = get_benchmark_preset(benchmark_name)
+        try:
+            preset = get_benchmark_preset(benchmark_name)
+        except Exception as e:
+            logger.error(f"❌ Failed to get benchmark preset '{benchmark_name}': {e}")
+            raise
 
         # Load dataset using preset loader
-        dataset = preset.load_dataset(limit=limit)
+        logger.info(f"Loading dataset (limit={limit})...")
+        try:
+            dataset = preset.load_dataset(limit=limit)
+            logger.info(f"✅ Loaded {len(dataset)} samples from {benchmark_name}")
+        except Exception as e:
+            logger.error(f"❌ Failed to load dataset: {e}")
+            raise
 
         # Use preset prompt if not overridden
         if prompt is None:
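All of the diagnostics added throughout evaluate() — the run banner, dataset loading, provider setup, and progress lines — go through the stdlib `logging` module rather than print, so they are silent by default. One way to surface them, using only standard-library calls:

```python
import logging

import themis

# INFO shows the banner and progress messages added in this release;
# DEBUG additionally shows the per-record messages.
logging.basicConfig(level=logging.INFO)

report = themis.evaluate("math500", model="gpt-4", limit=10)
```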
@@ -158,11 +245,14 @@
         dataset_id_field = preset.dataset_id_field
     else:
         # Custom dataset
+        logger.info("Using custom dataset")
         dataset = list(benchmark_or_dataset)
+        logger.info(f"Custom dataset has {len(dataset)} samples")
 
         # Limit dataset if requested
         if limit is not None:
             dataset = dataset[:limit]
+            logger.info(f"Limited to {len(dataset)} samples")
 
         # Use provided prompt or default
         if prompt is None:
@@ -188,7 +278,15 @@
         dataset_id_field = "id"
 
     # Parse model name to get provider and options
-    provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+    logger.info(f"Parsing model configuration...")
+    try:
+        provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+        logger.info(f"Provider: {provider_name}")
+        logger.info(f"Model ID: {model_id}")
+        logger.debug(f"Provider options: {provider_options}")
+    except Exception as e:
+        logger.error(f"❌ Failed to parse model name '{model}': {e}")
+        raise
 
     # Create model spec
     model_spec = ModelSpec(
@@ -214,17 +312,31 @@
     )
 
     # Create provider and router
-    provider = create_provider(provider_name, **provider_options)
+    logger.info(f"Creating provider '{provider_name}'...")
+    try:
+        provider = create_provider(provider_name, **provider_options)
+        logger.info(f"✅ Provider created successfully")
+    except KeyError as e:
+        logger.error(f"❌ Provider '{provider_name}' not registered. Available providers: fake, litellm, openai, anthropic, azure, bedrock, gemini, cohere, vllm")
+        logger.error(f"   This usually means the provider module wasn't imported.")
+        raise
+    except Exception as e:
+        logger.error(f"❌ Failed to create provider: {e}")
+        raise
+
     router = ProviderRouter({model_id: provider})
+    logger.debug(f"Router configured for model: {model_id}")
 
     # Create runner
-    runner = GenerationRunner(provider=router)
+    runner = GenerationRunner(provider=router, max_parallel=workers)
+    logger.info(f"Runner configured with {workers} parallel workers")
 
     # Create evaluation pipeline
     pipeline = EvaluationPipeline(
         extractor=extractor,
         metrics=metrics_list,
     )
+    logger.info(f"Evaluation metrics: {[m.name for m in metrics_list]}")
 
     # Determine storage location
     if storage is None:
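The broadened error handling around create_provider() matters most for custom endpoints, which is also where the banner logging above warns when `api_key` is missing. A hedged sketch of passing those options through `evaluate()` — the endpoint URL and key are placeholders, and how they reach the provider is decided by `parse_model_name()`:

```python
import themis

# api_base/api_key travel via **kwargs into parse_model_name() and then
# create_provider(); both values below are placeholders.
report = themis.evaluate(
    "math500",
    model="gpt-4",
    api_base="https://llm-proxy.example.com/v1",
    api_key="sk-example",
)
```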
@@ -235,11 +347,15 @@
     # Generate run ID if not provided
     if run_id is None:
         run_id = f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+    logger.info(f"Run ID: {run_id}")
+    logger.info(f"Storage: {storage_dir}")
+    logger.info(f"Resume: {resume}")
 
     # Create storage backend
     if isinstance(storage_dir, Path):
         from themis.experiment.storage import ExperimentStorage
         storage_backend = ExperimentStorage(storage_dir)
+        logger.debug(f"Storage backend created at {storage_dir}")
     else:
         # Cloud storage (to be implemented in Phase 3)
         raise NotImplementedError(
@@ -264,15 +380,34 @@
     )
 
     # Run locally
-    report = orchestrator.run(
-        dataset=dataset,
-        max_samples=limit,
-        run_id=run_id,
-        resume=resume,
-        on_result=on_result,
-    )
+    logger.info("=" * 60)
+    logger.info("🚀 Starting experiment execution...")
+    logger.info("=" * 60)
 
-    return report
+    try:
+        report = orchestrator.run(
+            dataset=dataset,
+            max_samples=limit,
+            run_id=run_id,
+            resume=resume,
+            on_result=on_result,
+        )
+
+        logger.info("=" * 60)
+        logger.info("✅ Evaluation completed successfully!")
+        logger.info(f"   Total samples: {len(report.generation_results)}")
+        logger.info(f"   Successful: {report.metadata.get('successful_generations', 0)}")
+        logger.info(f"   Failed: {report.metadata.get('failed_generations', 0)}")
+        if report.evaluation_report.metrics:
+            logger.info(f"   Metrics: {list(report.evaluation_report.metrics.keys())}")
+        logger.info("=" * 60)
+
+        return report
+    except Exception as e:
+        logger.error("=" * 60)
+        logger.error(f"❌ Evaluation failed: {e}")
+        logger.error("=" * 60)
+        raise
 
 
 def _resolve_metrics(metric_names: list[str]) -> list:
@@ -298,8 +433,8 @@ def _resolve_metrics(metric_names: list[str]) -> list:
     except ImportError:
         nlp_available = False
 
-    # Metric registry
-    METRICS_REGISTRY = {
+    # Built-in metrics registry
+    BUILTIN_METRICS = {
         # Core metrics
         "exact_match": ExactMatch,
         "math_verify": MathVerifyAccuracy,
@@ -308,7 +443,7 @@
 
     # Add NLP metrics if available
     if nlp_available:
-        METRICS_REGISTRY.update({
+        BUILTIN_METRICS.update({
            "bleu": BLEU,
            "rouge1": lambda: ROUGE(variant=ROUGEVariant.ROUGE_1),
            "rouge2": lambda: ROUGE(variant=ROUGEVariant.ROUGE_2),
@@ -321,6 +456,10 @@
     # "pass_at_k": PassAtK,
     # "codebleu": CodeBLEU,
 
+    # Merge built-in and custom metrics
+    # Custom metrics can override built-in metrics
+    METRICS_REGISTRY = {**BUILTIN_METRICS, **_METRICS_REGISTRY}
+
     metrics = []
     for name in metric_names:
         if name not in METRICS_REGISTRY:
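The merge order in `{**BUILTIN_METRICS, **_METRICS_REGISTRY}` is what lets a registered metric shadow a built-in of the same name: in a dict literal, later keys win. A self-contained illustration of that semantics (the string values are stand-ins, not the package's classes):

```python
builtin = {"exact_match": "BuiltinExactMatch"}
custom = {"exact_match": "MyExactMatch", "word_count": "WordCount"}

merged = {**builtin, **custom}
assert merged["exact_match"] == "MyExactMatch"  # custom entry overrides
assert set(merged) == {"exact_match", "word_count"}
```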
@@ -340,4 +479,4 @@
     return metrics
 
 
-__all__ = ["evaluate"]
+__all__ = ["evaluate", "register_metric", "get_registered_metrics"]
{themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/orchestrator.py

@@ -2,10 +2,13 @@
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime, timezone
 from typing import Callable, Sequence
 
 from themis.config.schema import IntegrationsConfig
+
+logger = logging.getLogger(__name__)
 from themis.core.entities import (
     EvaluationRecord,
     ExperimentFailure,
@@ -102,6 +105,8 @@ class ExperimentOrchestrator:
         Returns:
             ExperimentReport with generation results, evaluation, and metadata
         """
+        logger.info("Orchestrator: Initializing experiment run")
+
         # Initialize integrations
         self._integrations.initialize_run(
             {
@@ -112,13 +117,23 @@
         )
 
         # Prepare dataset
-        dataset_list = self._resolve_dataset(
-            dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
-        )
+        logger.info("Orchestrator: Loading dataset...")
+        try:
+            dataset_list = self._resolve_dataset(
+                dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+            )
+            logger.info(f"Orchestrator: Dataset loaded ({len(dataset_list)} total samples)")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to load dataset: {e}")
+            raise
+
         selected_dataset = (
             dataset_list[:max_samples] if max_samples is not None else dataset_list
         )
         run_identifier = run_id or self._default_run_id()
+
+        logger.info(f"Orchestrator: Processing {len(selected_dataset)} samples")
+        logger.info(f"Orchestrator: Run ID = {run_identifier}")
 
         # Initialize run in storage (if storage exists and run doesn't exist)
         if self._cache.has_storage:
@@ -130,18 +145,30 @@
             self._cache.cache_dataset(run_identifier, dataset_list)
 
         # Expand dataset into generation tasks
-        tasks = list(self._plan.expand(selected_dataset))
+        logger.info("Orchestrator: Expanding dataset into generation tasks...")
+        try:
+            tasks = list(self._plan.expand(selected_dataset))
+            logger.info(f"Orchestrator: Created {len(tasks)} generation tasks")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to expand dataset: {e}")
+            raise
 
         # Build evaluation configuration for cache invalidation
         evaluation_config = self._build_evaluation_config()
 
         # Load cached results if resuming
+        if resume:
+            logger.info("Orchestrator: Loading cached results...")
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
             self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
+        if resume and cached_records:
+            logger.info(f"Orchestrator: Found {len(cached_records)} cached generation records")
+        if resume and cached_evaluations:
+            logger.info(f"Orchestrator: Found {len(cached_evaluations)} cached evaluation records")
 
         # Process tasks: use cached or run new generations
         generation_results: list[GenerationRecord] = []
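Because cached generation and evaluation records are now detected and reported, an interrupted run can be picked up by calling `evaluate()` again with the same `run_id` and `resume=True`, which it forwards to `orchestrator.run()` as shown earlier in this diff. A sketch — the run ID below is an example matching the default `run-%Y%m%d-%H%M%S` pattern:

```python
import themis

# Replays cached generation/evaluation records instead of re-querying
# the model for samples that already completed.
report = themis.evaluate(
    "math500",
    model="gpt-4",
    run_id="run-20250101-120000",  # example ID in the default format
    resume=True,
)
```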
@@ -178,9 +205,18 @@
 
         # Run pending generation tasks
         if pending_tasks:
+            logger.info(f"Orchestrator: Running {len(pending_tasks)} generation tasks...")
+            completed = 0
             for record in self._runner.run(pending_tasks):
+                logger.debug(f"Orchestrator: Received generation record")
                 generation_results.append(record)
+                completed += 1
+
+                # Log progress every 10 samples or at key milestones
+                if completed % 10 == 0 or completed == len(pending_tasks):
+                    logger.info(f"Orchestrator: Generation progress: {completed}/{len(pending_tasks)} ({100*completed//len(pending_tasks)}%)")
 
+                logger.debug(f"Orchestrator: Processing record (cost tracking...)")
                 # Track cost for successful generations
                 if record.output and record.output.usage:
                     usage = record.output.usage
@@ -197,6 +233,7 @@
                     cost=cost,
                 )
 
+                logger.debug(f"Orchestrator: Processing record (error handling...)")
                 if record.error:
                     failures.append(
                         ExperimentFailure(
@@ -204,20 +241,35 @@
                             message=record.error.message,
                         )
                     )
+
+                logger.debug(f"Orchestrator: Processing record (caching...)")
                 cache_key = experiment_storage.task_cache_key(record.task)
                 if cache_results:
                     self._cache.save_generation_record(
                         run_identifier, record, cache_key
                     )
+
+                logger.debug(f"Orchestrator: Processing record (adding to pending...)")
                 pending_records.append(record)
                 pending_keys.append(cache_key)
+
+                logger.debug(f"Orchestrator: Processing record (callback...)")
                 if on_result:
                     on_result(record)
+                logger.debug(f"Orchestrator: Record processing complete")
 
         # Evaluate pending records
+        logger.info(f"Orchestrator: Preparing to evaluate {len(pending_records)} pending records...")
         if pending_records:
-            new_evaluation_report = self._evaluation.evaluate(pending_records)
+            logger.info(f"Orchestrator: Starting evaluation of {len(pending_records)} records...")
+            try:
+                new_evaluation_report = self._evaluation.evaluate(pending_records)
+                logger.info(f"Orchestrator: ✅ Evaluation complete - got {len(new_evaluation_report.records)} results")
+            except Exception as e:
+                logger.error(f"Orchestrator: ❌ Evaluation failed: {e}")
+                raise
         else:
+            logger.info("Orchestrator: No new records to evaluate (all cached)")
             new_evaluation_report = evaluation_pipeline.EvaluationReport(
                 metrics={}, failures=[], records=[]
             )
@@ -229,12 +281,16 @@
             )
 
         # Combine cached and new evaluations
+        logger.info("Orchestrator: Combining cached and new evaluations...")
         evaluation_report = self._combine_evaluations(
             cached_eval_records, new_evaluation_report
         )
+        logger.info(f"Orchestrator: Total evaluation records: {len(evaluation_report.records)}")
 
         # Get cost breakdown
         cost_breakdown = self._cost_tracker.get_breakdown()
+        if cost_breakdown.total_cost > 0:
+            logger.info(f"Orchestrator: Total cost: ${cost_breakdown.total_cost:.4f}")
 
         # Build metadata
         metadata = {