themis-eval 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. {themis_eval-0.2.1/themis_eval.egg-info → themis_eval-0.2.2}/PKG-INFO +6 -5
  2. {themis_eval-0.2.1 → themis_eval-0.2.2}/README.md +4 -4
  3. {themis_eval-0.2.1 → themis_eval-0.2.2}/pyproject.toml +2 -1
  4. themis_eval-0.2.2/themis/__init__.py +47 -0
  5. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/_version.py +1 -1
  6. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/api.py +57 -4
  7. themis_eval-0.2.2/themis/presets/__init__.py +21 -0
  8. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/utils/logging_utils.py +8 -3
  9. themis_eval-0.2.2/themis/utils/progress.py +77 -0
  10. {themis_eval-0.2.1 → themis_eval-0.2.2/themis_eval.egg-info}/PKG-INFO +6 -5
  11. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis_eval.egg-info/requires.txt +1 -0
  12. themis_eval-0.2.1/themis/__init__.py +0 -25
  13. themis_eval-0.2.1/themis/presets/__init__.py +0 -10
  14. themis_eval-0.2.1/themis/utils/progress.py +0 -58
  15. {themis_eval-0.2.1 → themis_eval-0.2.2}/LICENSE +0 -0
  16. {themis_eval-0.2.1 → themis_eval-0.2.2}/setup.cfg +0 -0
  17. {themis_eval-0.2.1 → themis_eval-0.2.2}/tests/test_package_metadata.py +0 -0
  18. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/backends/__init__.py +0 -0
  19. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/backends/execution.py +0 -0
  20. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/backends/storage.py +0 -0
  21. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/__init__.py +0 -0
  22. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/__main__.py +0 -0
  23. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/__init__.py +0 -0
  24. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/benchmarks.py +0 -0
  25. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/comparison.py +0 -0
  26. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/config_commands.py +0 -0
  27. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/cost.py +0 -0
  28. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/demo.py +0 -0
  29. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/info.py +0 -0
  30. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/leaderboard.py +0 -0
  31. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/math_benchmarks.py +0 -0
  32. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/mcq_benchmarks.py +0 -0
  33. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/results.py +0 -0
  34. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/sample_run.py +0 -0
  35. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/commands/visualize.py +0 -0
  36. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/main.py +0 -0
  37. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/new_project.py +0 -0
  38. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/cli/utils.py +0 -0
  39. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/comparison/__init__.py +0 -0
  40. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/comparison/engine.py +0 -0
  41. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/comparison/reports.py +0 -0
  42. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/comparison/statistics.py +0 -0
  43. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/config/__init__.py +0 -0
  44. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/config/loader.py +0 -0
  45. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/config/registry.py +0 -0
  46. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/config/runtime.py +0 -0
  47. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/config/schema.py +0 -0
  48. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/core/__init__.py +0 -0
  49. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/core/conversation.py +0 -0
  50. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/core/entities.py +0 -0
  51. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/core/serialization.py +0 -0
  52. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/core/tools.py +0 -0
  53. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/core/types.py +0 -0
  54. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/__init__.py +0 -0
  55. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/base.py +0 -0
  56. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/commonsense_qa.py +0 -0
  57. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/competition_math.py +0 -0
  58. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/coqa.py +0 -0
  59. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/gpqa.py +0 -0
  60. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/gsm8k.py +0 -0
  61. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/gsm_symbolic.py +0 -0
  62. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/math500.py +0 -0
  63. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/med_qa.py +0 -0
  64. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/medmcqa.py +0 -0
  65. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/mmlu_pro.py +0 -0
  66. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/piqa.py +0 -0
  67. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/registry.py +0 -0
  68. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/schema.py +0 -0
  69. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/sciq.py +0 -0
  70. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/social_i_qa.py +0 -0
  71. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/datasets/super_gpqa.py +0 -0
  72. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/__init__.py +0 -0
  73. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/conditional.py +0 -0
  74. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/extractors/__init__.py +0 -0
  75. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/extractors/error_taxonomy_extractor.py +0 -0
  76. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/extractors/exceptions.py +0 -0
  77. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/extractors/identity_extractor.py +0 -0
  78. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/extractors/json_field_extractor.py +0 -0
  79. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/extractors/math_verify_extractor.py +0 -0
  80. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/extractors/regex_extractor.py +0 -0
  81. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/math_verify_utils.py +0 -0
  82. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/__init__.py +0 -0
  83. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/code/__init__.py +0 -0
  84. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/code/codebleu.py +0 -0
  85. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/code/execution.py +0 -0
  86. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/code/pass_at_k.py +0 -0
  87. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/composite_metric.py +0 -0
  88. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/consistency_metric.py +0 -0
  89. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/exact_match.py +0 -0
  90. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/length_difference_tolerance.py +0 -0
  91. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/math_verify_accuracy.py +0 -0
  92. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/__init__.py +0 -0
  93. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/bertscore.py +0 -0
  94. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/bleu.py +0 -0
  95. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/meteor.py +0 -0
  96. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/rouge.py +0 -0
  97. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/pairwise_judge_metric.py +0 -0
  98. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/response_length.py +0 -0
  99. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/metrics/rubric_judge_metric.py +0 -0
  100. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/pipeline.py +0 -0
  101. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/pipelines/__init__.py +0 -0
  102. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/pipelines/composable_pipeline.py +0 -0
  103. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/pipelines/standard_pipeline.py +0 -0
  104. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/reports.py +0 -0
  105. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/statistics/__init__.py +0 -0
  106. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/statistics/bootstrap.py +0 -0
  107. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/statistics/confidence_intervals.py +0 -0
  108. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/statistics/distributions.py +0 -0
  109. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/statistics/effect_sizes.py +0 -0
  110. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/statistics/hypothesis_tests.py +0 -0
  111. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/statistics/types.py +0 -0
  112. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/strategies/__init__.py +0 -0
  113. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +0 -0
  114. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/strategies/default_evaluation_strategy.py +0 -0
  115. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/strategies/evaluation_strategy.py +0 -0
  116. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/evaluation/strategies/judge_evaluation_strategy.py +0 -0
  117. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/__init__.py +0 -0
  118. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/builder.py +0 -0
  119. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/cache_manager.py +0 -0
  120. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/comparison.py +0 -0
  121. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/cost.py +0 -0
  122. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/definitions.py +0 -0
  123. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/export.py +0 -0
  124. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/export_csv.py +0 -0
  125. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/integration_manager.py +0 -0
  126. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/math.py +0 -0
  127. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/mcq.py +0 -0
  128. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/orchestrator.py +0 -0
  129. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/pricing.py +0 -0
  130. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/storage.py +0 -0
  131. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/experiment/visualization.py +0 -0
  132. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/__init__.py +0 -0
  133. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/agentic_runner.py +0 -0
  134. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/batching.py +0 -0
  135. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/clients.py +0 -0
  136. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/conversation_runner.py +0 -0
  137. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/plan.py +0 -0
  138. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/providers/litellm_provider.py +0 -0
  139. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/providers/vllm_provider.py +0 -0
  140. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/router.py +0 -0
  141. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/runner.py +0 -0
  142. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/strategies.py +0 -0
  143. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/templates.py +0 -0
  144. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/turn_strategies.py +0 -0
  145. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/generation/types.py +0 -0
  146. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/integrations/__init__.py +0 -0
  147. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/integrations/huggingface.py +0 -0
  148. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/integrations/wandb.py +0 -0
  149. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/interfaces/__init__.py +0 -0
  150. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/presets/benchmarks.py +0 -0
  151. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/presets/models.py +0 -0
  152. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/project/__init__.py +0 -0
  153. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/project/definitions.py +0 -0
  154. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/project/patterns.py +0 -0
  155. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/providers/__init__.py +0 -0
  156. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/providers/registry.py +0 -0
  157. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/py.typed +0 -0
  158. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/server/__init__.py +0 -0
  159. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/server/app.py +0 -0
  160. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/utils/api_generator.py +0 -0
  161. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/utils/cost_tracking.py +0 -0
  162. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/utils/dashboard.py +0 -0
  163. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis/utils/tracing.py +0 -0
  164. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis_eval.egg-info/SOURCES.txt +0 -0
  165. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis_eval.egg-info/dependency_links.txt +0 -0
  166. {themis_eval-0.2.1 → themis_eval-0.2.2}/themis_eval.egg-info/top_level.txt +0 -0
{themis_eval-0.2.1/themis_eval.egg-info → themis_eval-0.2.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.1
+Version: 0.2.2
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -25,6 +25,7 @@ Requires-Dist: tabulate>=0.9.0
 Requires-Dist: tenacity>=9.1.2
 Requires-Dist: plotly>=6.5.0
 Requires-Dist: math-verify>=0.8.0
+Requires-Dist: rich>=14.2.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
@@ -358,9 +359,9 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/EXTENDING_BACKENDS.md)** - Custom storage and execution
-- **[API Server](docs/API_SERVER.md)** - Web dashboard and REST API
-- **[Comparison Engine](docs/COMPARISON.md)** - Statistical testing guide
+- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
+- **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
 ---
 
@@ -388,7 +389,7 @@ result = evaluate(
 )
 ```
 
-See [EXTENDING_BACKENDS.md](docs/EXTENDING_BACKENDS.md) for details.
+See [docs/customization/backends.md](docs/customization/backends.md) for details.
 
 ### Distributed Execution
 
{themis_eval-0.2.1 → themis_eval-0.2.2}/README.md
@@ -300,9 +300,9 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/EXTENDING_BACKENDS.md)** - Custom storage and execution
-- **[API Server](docs/API_SERVER.md)** - Web dashboard and REST API
-- **[Comparison Engine](docs/COMPARISON.md)** - Statistical testing guide
+- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
+- **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
 ---
 
@@ -330,7 +330,7 @@ result = evaluate(
 )
 ```
 
-See [EXTENDING_BACKENDS.md](docs/EXTENDING_BACKENDS.md) for details.
+See [docs/customization/backends.md](docs/customization/backends.md) for details.
 
 ### Distributed Execution
 
{themis_eval-0.2.1 → themis_eval-0.2.2}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "themis-eval"
-version = "0.2.1"
+version = "0.2.2"
 description = "Lightweight evaluation platform for LLM experiments"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -32,6 +32,7 @@ dependencies = [
     "tenacity>=9.1.2",
     "plotly>=6.5.0",
     "math-verify>=0.8.0",
+    "rich>=14.2.0",
 ]
 
 [tool.setuptools.packages.find]
themis_eval-0.2.2/themis/__init__.py (new file)
@@ -0,0 +1,47 @@
+"""Themis experiment platform - Dead simple LLM evaluation.
+
+The primary interface is the `evaluate()` function:
+
+    import themis
+    report = themis.evaluate("math500", model="gpt-4", limit=100)
+
+Extension APIs for registering custom components:
+- themis.register_metric() - Register custom metrics
+- themis.register_dataset() - Register custom datasets
+- themis.register_provider() - Register custom model providers
+- themis.register_benchmark() - Register custom benchmark presets
+"""
+
+from themis import config, core, evaluation, experiment, generation, project
+from themis._version import __version__
+from themis.api import evaluate, get_registered_metrics, register_metric
+from themis.datasets import register_dataset, list_datasets, is_dataset_registered
+from themis.presets import register_benchmark, list_benchmarks, get_benchmark_preset
+from themis.providers import register_provider
+
+__all__ = [
+    # Main API
+    "evaluate",
+    # Metrics
+    "register_metric",
+    "get_registered_metrics",
+    # Datasets
+    "register_dataset",
+    "list_datasets",
+    "is_dataset_registered",
+    # Benchmarks
+    "register_benchmark",
+    "list_benchmarks",
+    "get_benchmark_preset",
+    # Providers
+    "register_provider",
+    # Submodules
+    "config",
+    "core",
+    "evaluation",
+    "experiment",
+    "generation",
+    "project",
+    # Version
+    "__version__",
+]
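
The enlarged top-level namespace means the registration and listing helpers are now reachable straight from `import themis`. A minimal usage sketch follows; the availability of the "math500" preset and a "gpt-4" style model name, and the no-argument form of the `list_*` helpers, are assumptions rather than facts shown in this diff.

```python
# Illustrative sketch only; benchmark/model availability and the no-argument
# form of the list_* helpers are assumptions, not taken from this diff.
import themis

print(themis.__version__)        # e.g. "0.2.2"
print(themis.list_benchmarks())  # names of registered benchmark presets
print(themis.list_datasets())    # names of registered datasets

# The primary entry point documented in the module docstring above.
report = themis.evaluate("math500", model="gpt-4", limit=100)
```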
{themis_eval-0.2.1 → themis_eval-0.2.2}/themis/_version.py
@@ -9,7 +9,7 @@ def _detect_version() -> str:
     try:
         return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return "0.2.1"  # Fallback for development
+        return "0.2.2"  # Fallback for development
 
 
 __version__ = _detect_version()
{themis_eval-0.2.1 → themis_eval-0.2.2}/themis/api.py
@@ -66,6 +66,55 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
+# Module-level metrics registry for custom metrics
+_METRICS_REGISTRY: dict[str, type] = {}
+
+
+def register_metric(name: str, metric_cls: type) -> None:
+    """Register a custom metric for use in evaluate().
+
+    This allows users to add their own metrics to Themis without modifying
+    the source code. Registered metrics can be used by passing their names
+    to the `metrics` parameter in evaluate().
+
+    Args:
+        name: Metric name (used in evaluate(metrics=[name]))
+        metric_cls: Metric class implementing the Metric interface.
+            Must have a compute() method that takes prediction, references,
+            and metadata parameters.
+
+    Raises:
+        TypeError: If metric_cls is not a class
+        ValueError: If metric_cls doesn't implement the required interface
+
+    Example:
+        >>> from themis.evaluation.metrics import MyCustomMetric
+        >>> themis.register_metric("my_metric", MyCustomMetric)
+        >>> report = themis.evaluate("math500", model="gpt-4", metrics=["my_metric"])
+    """
+    if not isinstance(metric_cls, type):
+        raise TypeError(f"metric_cls must be a class, got {type(metric_cls)}")
+
+    # Validate that it implements the Metric interface
+    if not hasattr(metric_cls, "compute"):
+        raise ValueError(
+            f"{metric_cls.__name__} must implement compute() method. "
+            f"See themis.evaluation.metrics for examples."
+        )
+
+    _METRICS_REGISTRY[name] = metric_cls
+    logger.info(f"Registered custom metric: {name} -> {metric_cls.__name__}")
+
+
+def get_registered_metrics() -> dict[str, type]:
+    """Get all currently registered custom metrics.
+
+    Returns:
+        Dictionary mapping metric names to their classes
+    """
+    return _METRICS_REGISTRY.copy()
+
+
 def evaluate(
     benchmark_or_dataset: str | Sequence[dict[str, Any]],
     *,
@@ -384,8 +433,8 @@ def _resolve_metrics(metric_names: list[str]) -> list:
     except ImportError:
         nlp_available = False
 
-    # Metric registry
-    METRICS_REGISTRY = {
+    # Built-in metrics registry
+    BUILTIN_METRICS = {
         # Core metrics
         "exact_match": ExactMatch,
         "math_verify": MathVerifyAccuracy,
@@ -394,7 +443,7 @@
 
     # Add NLP metrics if available
    if nlp_available:
-        METRICS_REGISTRY.update({
+        BUILTIN_METRICS.update({
             "bleu": BLEU,
             "rouge1": lambda: ROUGE(variant=ROUGEVariant.ROUGE_1),
             "rouge2": lambda: ROUGE(variant=ROUGEVariant.ROUGE_2),
@@ -407,6 +456,10 @@
     # "pass_at_k": PassAtK,
     # "codebleu": CodeBLEU,
 
+    # Merge built-in and custom metrics
+    # Custom metrics can override built-in metrics
+    METRICS_REGISTRY = {**BUILTIN_METRICS, **_METRICS_REGISTRY}
+
     metrics = []
     for name in metric_names:
         if name not in METRICS_REGISTRY:
@@ -426,4 +479,4 @@
     return metrics
 
 
-__all__ = ["evaluate"]
+__all__ = ["evaluate", "register_metric", "get_registered_metrics"]
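
The hunks above only add the registry plumbing, so the shape of a user-defined metric is worth sketching. This is a hedged illustration: the `compute(prediction, references, metadata)` signature is inferred from the `register_metric()` docstring, and `ContainsAnswer` is a hypothetical class, not part of Themis.

```python
import themis


class ContainsAnswer:
    """Hypothetical metric: 1.0 if any reference string occurs in the prediction."""

    def compute(self, prediction, references, metadata=None):
        # Signature assumed from the register_metric() docstring above.
        hit = any(ref in str(prediction) for ref in references)
        return {"contains_answer": 1.0 if hit else 0.0}


themis.register_metric("contains_answer", ContainsAnswer)
print(themis.get_registered_metrics())  # {'contains_answer': <class 'ContainsAnswer'>}

# Because _resolve_metrics() merges {**BUILTIN_METRICS, **_METRICS_REGISTRY},
# a custom entry named "exact_match" would shadow the built-in of the same name.
# report = themis.evaluate("math500", model="gpt-4",
#                          metrics=["exact_match", "contains_answer"])
```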
themis_eval-0.2.2/themis/presets/__init__.py (new file)
@@ -0,0 +1,21 @@
+"""Preset configurations for common benchmarks and models.
+
+This module provides automatic configuration for popular benchmarks,
+eliminating the need for manual setup of prompts, metrics, and extractors.
+"""
+
+from themis.presets.benchmarks import (
+    BenchmarkPreset,
+    get_benchmark_preset,
+    list_benchmarks,
+    register_benchmark,
+)
+from themis.presets.models import parse_model_name
+
+__all__ = [
+    "BenchmarkPreset",
+    "register_benchmark",
+    "get_benchmark_preset",
+    "list_benchmarks",
+    "parse_model_name",
+]
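
A lookup-only sketch of the re-exported preset API; the argument shapes are assumptions, since this diff shows only which names are exported, not their signatures.

```python
# Assumes list_benchmarks() takes no arguments and get_benchmark_preset()
# accepts a benchmark name such as "math500"; neither signature is shown here.
from themis.presets import get_benchmark_preset, list_benchmarks

print(list_benchmarks())                  # available preset names
preset = get_benchmark_preset("math500")  # expected to be a BenchmarkPreset
print(type(preset).__name__)
```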
{themis_eval-0.2.1 → themis_eval-0.2.2}/themis/utils/logging_utils.py
@@ -5,6 +5,9 @@ from __future__ import annotations
 import logging
 from typing import Mapping
 
+from rich.logging import RichHandler
+from rich.traceback import install as install_rich_traceback
+
 TRACE_LEVEL = 5
 logging.addLevelName(TRACE_LEVEL, "TRACE")
 
@@ -28,12 +31,14 @@ _LEVELS: Mapping[str, int] = {
 
 def configure_logging(level: str = "info") -> None:
     """Configure root logging with human-friendly formatting."""
-
+    install_rich_traceback()
     numeric_level = _LEVELS.get(level.lower(), logging.INFO)
+
     logging.basicConfig(
         level=numeric_level,
-        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
-        datefmt="%H:%M:%S",
+        format="%(message)s",
+        datefmt="[%X]",
+        handlers=[RichHandler(rich_tracebacks=True, markup=True)],
         force=True,
     )
 
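
The practical effect is that everything routed through the root logger is now rendered by Rich (colour, console markup, pretty tracebacks) instead of the old pipe-separated format string. A small sketch of calling the real helper; the logger name and message are illustrative only.

```python
import logging

from themis.utils.logging_utils import configure_logging

# Installs rich tracebacks and a RichHandler with markup=True, per the diff above.
configure_logging("debug")
logging.getLogger("themis.demo").info("running [bold]math500[/bold] with markup rendering")
```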
themis_eval-0.2.2/themis/utils/progress.py (new file)
@@ -0,0 +1,77 @@
+"""Simple CLI-friendly progress reporter."""
+
+from __future__ import annotations
+
+from contextlib import AbstractContextManager
+from typing import Any, Callable
+
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    SpinnerColumn,
+    TaskProgressColumn,
+    TextColumn,
+    TimeElapsedColumn,
+    TimeRemainingColumn,
+)
+
+
+class ProgressReporter(AbstractContextManager["ProgressReporter"]):
+    def __init__(
+        self,
+        *,
+        total: int | None,
+        description: str = "Processing",
+        unit: str = "sample",
+        leave: bool = False,
+    ) -> None:
+        self._total = total
+        self._description = description
+        self._unit = unit
+        self._leave = leave
+        self._progress: Progress | None = None
+        self._task_id = None
+
+    def __enter__(self) -> "ProgressReporter":
+        self.start()
+        return self
+
+    def __exit__(self, *_exc) -> None:
+        self.close()
+
+    def start(self) -> None:
+        if self._progress is None:
+            self._progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                MofNCompleteColumn(),
+                TimeElapsedColumn(),
+                TimeRemainingColumn(),
+                transient=not self._leave,
+            )
+            self._progress.start()
+            self._task_id = self._progress.add_task(
+                self._description, total=self._total
+            )
+
+    def close(self) -> None:
+        if self._progress is not None:
+            self._progress.stop()
+            self._progress = None
+            self._task_id = None
+
+    def increment(self, step: int = 1) -> None:
+        if self._progress is not None and self._task_id is not None:
+            self._progress.update(self._task_id, advance=step)
+
+    def on_result(self, _record: Any) -> None:
+        self.increment()
+
+    def as_callback(self) -> Callable[[Any], None]:
+        return self.on_result
+
+
+__all__ = ["ProgressReporter"]
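
The new Rich-backed reporter keeps the same public surface as the tqdm version it replaces (see the deleted file further below), so existing call sites should not need changes. A usage sketch with a plain loop standing in for whatever runner supplies the per-result callback:

```python
from themis.utils.progress import ProgressReporter

samples = [{"id": i} for i in range(100)]  # placeholder work items

# The bar starts on __enter__ and is stopped on __exit__; with the default
# leave=False the transient Progress display is cleared when finished.
with ProgressReporter(total=len(samples), description="Evaluating") as reporter:
    on_result = reporter.as_callback()  # Callable[[Any], None]
    for sample in samples:
        ...                             # generate / score one sample here
        on_result(sample)               # advances the bar by one
```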
{themis_eval-0.2.1 → themis_eval-0.2.2/themis_eval.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.1
+Version: 0.2.2
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -25,6 +25,7 @@ Requires-Dist: tabulate>=0.9.0
 Requires-Dist: tenacity>=9.1.2
 Requires-Dist: plotly>=6.5.0
 Requires-Dist: math-verify>=0.8.0
+Requires-Dist: rich>=14.2.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
@@ -358,9 +359,9 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/EXTENDING_BACKENDS.md)** - Custom storage and execution
-- **[API Server](docs/API_SERVER.md)** - Web dashboard and REST API
-- **[Comparison Engine](docs/COMPARISON.md)** - Statistical testing guide
+- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
+- **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
 ---
 
@@ -388,7 +389,7 @@ result = evaluate(
 )
 ```
 
-See [EXTENDING_BACKENDS.md](docs/EXTENDING_BACKENDS.md) for details.
+See [docs/customization/backends.md](docs/customization/backends.md) for details.
 
 ### Distributed Execution
 
{themis_eval-0.2.1 → themis_eval-0.2.2}/themis_eval.egg-info/requires.txt
@@ -8,6 +8,7 @@ tabulate>=0.9.0
 tenacity>=9.1.2
 plotly>=6.5.0
 math-verify>=0.8.0
+rich>=14.2.0
 
 [all]
 themis-eval[code,docs,math,nlp,server,viz]
themis_eval-0.2.1/themis/__init__.py (deleted)
@@ -1,25 +0,0 @@
-"""Themis experiment platform - Dead simple LLM evaluation.
-
-The primary interface is the `evaluate()` function:
-
-    import themis
-    report = themis.evaluate("math500", model="gpt-4", limit=100)
-"""
-
-from themis import config, core, evaluation, experiment, generation, project
-from themis._version import __version__
-from themis.api import evaluate
-
-__all__ = [
-    # Main API
-    "evaluate",
-    # Submodules
-    "config",
-    "core",
-    "evaluation",
-    "experiment",
-    "generation",
-    "project",
-    # Version
-    "__version__",
-]
themis_eval-0.2.1/themis/presets/__init__.py (deleted)
@@ -1,10 +0,0 @@
-"""Preset configurations for common benchmarks and models.
-
-This module provides automatic configuration for popular benchmarks,
-eliminating the need for manual setup of prompts, metrics, and extractors.
-"""
-
-from themis.presets.benchmarks import get_benchmark_preset, list_benchmarks
-from themis.presets.models import parse_model_name
-
-__all__ = ["get_benchmark_preset", "list_benchmarks", "parse_model_name"]
themis_eval-0.2.1/themis/utils/progress.py (deleted)
@@ -1,58 +0,0 @@
-"""Simple CLI-friendly progress reporter."""
-
-from __future__ import annotations
-
-from contextlib import AbstractContextManager
-from typing import Any, Callable
-
-from tqdm import tqdm
-
-
-class ProgressReporter(AbstractContextManager["ProgressReporter"]):
-    def __init__(
-        self,
-        *,
-        total: int | None,
-        description: str = "Processing",
-        unit: str = "sample",
-        leave: bool = False,
-    ) -> None:
-        self._total = total
-        self._description = description
-        self._unit = unit
-        self._leave = leave
-        self._pbar: tqdm | None = None
-
-    def __enter__(self) -> "ProgressReporter":
-        self.start()
-        return self
-
-    def __exit__(self, *_exc) -> None:
-        self.close()
-
-    def start(self) -> None:
-        if self._pbar is None:
-            self._pbar = tqdm(
-                total=self._total,
-                desc=self._description,
-                unit=self._unit,
-                leave=self._leave,
-            )
-
-    def close(self) -> None:
-        if self._pbar is not None:
-            self._pbar.close()
-            self._pbar = None
-
-    def increment(self, step: int = 1) -> None:
-        if self._pbar is not None:
-            self._pbar.update(step)
-
-    def on_result(self, _record: Any) -> None:
-        self.increment()
-
-    def as_callback(self) -> Callable[[Any], None]:
-        return self.on_result
-
-
-__all__ = ["ProgressReporter"]