themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,151 @@
1
+ """Utilities for assembling experiments from reusable components."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any, Callable, Mapping, Sequence, Type
7
+
8
+ from themis.config import schema as config
9
+ from themis.core import entities as core_entities
10
+ from themis.evaluation import pipeline as evaluation_pipeline
11
+ from themis.evaluation import strategies as evaluation_strategies
12
+ from themis.experiment import orchestrator
13
+ from themis.experiment import storage as experiment_storage
14
+ from themis.experiment.cache_manager import CacheManager
15
+ from themis.experiment.definitions import (
16
+ BuiltExperiment,
17
+ ExperimentDefinition,
18
+ ModelBinding,
19
+ )
20
+ from themis.experiment.integration_manager import IntegrationManager
21
+ from themis.generation import plan as generation_plan
22
+ from themis.generation import router as generation_router
23
+ from themis.generation import runner as generation_runner
24
+ from themis.generation import strategies as generation_strategies
25
+ from themis.interfaces import ModelProvider
26
+ from themis.providers import create_provider
27
+
28
+
29
class ExperimentBuilder:
    """Assemble a runnable experiment from pluggable component classes.

    The builder is configured once with an extractor, a collection of
    metrics and the classes (plus extra keyword arguments) used for the
    generation runner, the evaluation pipeline and the provider router.
    Each call to :meth:`build` then wires fresh component instances together
    for one ``ExperimentDefinition``.
    """

    def __init__(
        self,
        *,
        extractor,
        metrics,
        runner_cls: Type[
            generation_runner.GenerationRunner
        ] = generation_runner.GenerationRunner,
        runner_kwargs: Mapping[str, Any] | None = None,
        pipeline_cls: Type[
            evaluation_pipeline.EvaluationPipeline
        ] = evaluation_pipeline.EvaluationPipeline,
        pipeline_kwargs: Mapping[str, Any] | None = None,
        router_cls: Type[ModelProvider] = generation_router.ProviderRouter,
        router_kwargs: Mapping[str, Any] | None = None,
        strategy_resolver: Callable[
            [core_entities.GenerationTask], generation_strategies.GenerationStrategy
        ]
        | None = None,
        evaluation_strategy_resolver: Callable[
            [core_entities.GenerationRecord], evaluation_strategies.EvaluationStrategy
        ]
        | None = None,
    ) -> None:
        """Store the building blocks used by every subsequent ``build`` call.

        Args:
            extractor: Passed to the pipeline class as ``extractor``.
            metrics: Iterable of metrics; copied into a list and passed to
                the pipeline class as ``metrics``.
            runner_cls: Class instantiated with ``provider=<router>``.
            runner_kwargs: Extra keyword arguments for ``runner_cls``.
            pipeline_cls: Class instantiated with ``extractor`` and ``metrics``.
            pipeline_kwargs: Extra keyword arguments for ``pipeline_cls``.
            router_cls: Class instantiated with the provider mapping.
            router_kwargs: Extra keyword arguments for ``router_cls``.
            strategy_resolver: Optional default for the runner's
                ``strategy_resolver`` keyword argument.
            evaluation_strategy_resolver: Optional default for the pipeline's
                ``strategy_resolver`` keyword argument.
        """
        self._extractor = extractor
        self._metrics = list(metrics)

        # Component classes.
        self._runner_cls = runner_cls
        self._pipeline_cls = pipeline_cls
        self._router_cls = router_cls

        # Private copies of the keyword-argument mappings (empty when omitted).
        self._runner_kwargs = dict(runner_kwargs or {})
        self._pipeline_kwargs = dict(pipeline_kwargs or {})
        self._router_kwargs = dict(router_kwargs or {})

        # Optional resolvers injected at build time.
        self._strategy_resolver = strategy_resolver
        self._evaluation_strategy_resolver = evaluation_strategy_resolver

    @staticmethod
    def _kwargs_with_resolver(base_kwargs, resolver) -> dict:
        """Copy ``base_kwargs``, defaulting ``strategy_resolver`` to ``resolver``.

        An explicit ``strategy_resolver`` entry in ``base_kwargs`` wins; a
        ``None`` resolver leaves the copy untouched.
        """
        merged = dict(base_kwargs)
        if resolver is not None:
            merged.setdefault("strategy_resolver", resolver)
        return merged

    def build(
        self,
        definition: ExperimentDefinition,
        *,
        storage_dir: str | Path | None = None,
    ) -> BuiltExperiment:
        """Instantiate and wire all components for ``definition``.

        Args:
            definition: Supplies templates, model bindings, sampling
                parameters and field names for the generation plan.
            storage_dir: When not ``None``, an ``ExperimentStorage`` is
                created for this directory; otherwise no storage is used.

        Returns:
            A ``BuiltExperiment`` holding the orchestrator, plan, runner,
            pipeline, storage (possibly ``None``) and router.
        """
        plan_obj = self._build_plan(definition)
        router = self._build_router(definition.model_bindings)

        runner = self._runner_cls(
            provider=router,
            **self._kwargs_with_resolver(
                self._runner_kwargs, self._strategy_resolver
            ),
        )
        pipeline = self._pipeline_cls(
            extractor=self._extractor,
            metrics=self._metrics,
            **self._kwargs_with_resolver(
                self._pipeline_kwargs, self._evaluation_strategy_resolver
            ),
        )

        # Storage backend exists only when a directory was supplied.
        storage = None
        if storage_dir is not None:
            storage = experiment_storage.ExperimentStorage(storage_dir)

        # Managers handed to the orchestrator; resume and cache are both on.
        cache_manager = CacheManager(
            storage=storage,
            enable_resume=True,
            enable_cache=True,
        )
        integration_manager = IntegrationManager(config=config.IntegrationsConfig())

        orchestrator_obj = orchestrator.ExperimentOrchestrator(
            generation_plan=plan_obj,
            generation_runner=runner,
            evaluation_pipeline=pipeline,
            cache_manager=cache_manager,
            integration_manager=integration_manager,
        )

        return BuiltExperiment(
            orchestrator=orchestrator_obj,
            plan=plan_obj,
            runner=runner,
            pipeline=pipeline,
            storage=storage,
            router=router,
        )

    def _build_plan(
        self, definition: ExperimentDefinition
    ) -> generation_plan.GenerationPlan:
        """Translate ``definition`` into a ``GenerationPlan``."""
        plan_fields = {
            "templates": list(definition.templates),
            "models": [bound.spec for bound in definition.model_bindings],
            "sampling_parameters": list(definition.sampling_parameters),
            "dataset_id_field": definition.dataset_id_field,
            "reference_field": definition.reference_field,
            "metadata_fields": tuple(definition.metadata_fields),
            "context_builder": definition.context_builder,
        }
        return generation_plan.GenerationPlan(**plan_fields)

    def _build_router(self, bindings: Sequence[ModelBinding]) -> ModelProvider:
        """Create one provider per binding and wrap them in the router class.

        Providers are keyed by ``binding.spec.identifier``; a later binding
        with the same identifier replaces the earlier entry.
        """
        providers: dict[str, ModelProvider] = {}
        for bound in bindings:
            provider = create_provider(
                bound.provider_name,
                **bound.provider_options,
            )
            providers[bound.spec.identifier] = provider
        return self._router_cls(providers, **self._router_kwargs)
144
+
145
+
146
+ __all__ = [
147
+ "ExperimentBuilder",
148
+ "ExperimentDefinition",
149
+ "ModelBinding",
150
+ "BuiltExperiment",
151
+ ]
@@ -0,0 +1,134 @@
1
+ """Cache management for experiment resumability and storage."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Sequence
6
+
7
+ from themis.core.entities import EvaluationRecord, GenerationRecord
8
+ from themis.experiment import storage as experiment_storage
9
+
10
+
11
class CacheManager:
    """Thin gatekeeper between an experiment run and its storage backend.

    Every method forwards to the wrapped storage object only when storage is
    present and the relevant switch is on:

    - ``enable_resume`` gates the ``load_cached_*`` readers
    - ``enable_cache`` gates the ``cache_dataset`` / ``save_*`` writers

    When a call is gated off, readers return an empty dict and writers do
    nothing.
    """

    def __init__(
        self,
        storage: experiment_storage.ExperimentStorage | None,
        enable_resume: bool = True,
        enable_cache: bool = True,
    ) -> None:
        """Remember the backend and the two feature switches.

        Args:
            storage: Storage backend, or None to run without any storage
            enable_resume: Allow reading previously stored results
            enable_cache: Allow writing new results
        """
        self._storage = storage
        self._enable_resume = enable_resume
        self._enable_cache = enable_cache

    @property
    def has_storage(self) -> bool:
        """True when a storage backend was supplied."""
        return self._storage is not None

    def cache_dataset(self, run_id: str, dataset: Sequence[dict[str, object]]) -> None:
        """Store the dataset samples of a run.

        Args:
            run_id: Unique run identifier
            dataset: Dataset samples; handed to storage as a new list
        """
        if self._storage is None or not self._enable_cache:
            return
        self._storage.cache_dataset(run_id, list(dataset))

    def load_cached_records(self, run_id: str) -> dict[str, GenerationRecord]:
        """Fetch stored generation records for a run.

        Args:
            run_id: Unique run identifier

        Returns:
            Whatever the storage returns for the run, or an empty dict when
            resume is disabled or there is no storage
        """
        if self._enable_resume and self._storage is not None:
            return self._storage.load_cached_records(run_id)
        return {}

    def load_cached_evaluations(
        self, run_id: str, evaluation_config: dict | None = None
    ) -> dict[str, EvaluationRecord]:
        """Fetch stored evaluation records for a run.

        Args:
            run_id: Unique run identifier
            evaluation_config: Forwarded to storage as ``evaluation_config``

        Returns:
            Whatever the storage returns for the run, or an empty dict when
            resume is disabled or there is no storage
        """
        if self._enable_resume and self._storage is not None:
            return self._storage.load_cached_evaluations(
                run_id, evaluation_config=evaluation_config
            )
        return {}

    def save_generation_record(
        self,
        run_id: str,
        record: GenerationRecord,
        cache_key: str,
    ) -> None:
        """Append one generation record to storage.

        Args:
            run_id: Unique run identifier
            record: Generation record to store
            cache_key: Forwarded to storage as ``cache_key``
        """
        if self._storage is None or not self._enable_cache:
            return
        self._storage.append_record(run_id, record, cache_key=cache_key)

    def save_evaluation_record(
        self,
        run_id: str,
        generation_record: GenerationRecord,
        evaluation_record: EvaluationRecord,
        evaluation_config: dict | None = None,
    ) -> None:
        """Append one evaluation record to storage.

        Args:
            run_id: Unique run identifier
            generation_record: Generation record the evaluation belongs to
            evaluation_record: Evaluation record to store
            evaluation_config: Forwarded to storage as ``evaluation_config``
        """
        if self._storage is None or not self._enable_cache:
            return
        self._storage.append_evaluation(
            run_id,
            generation_record,
            evaluation_record,
            evaluation_config=evaluation_config,
        )

    def get_run_path(self, run_id: str) -> str | None:
        """Return the storage's path for a run as a string.

        Args:
            run_id: Unique run identifier

        Returns:
            ``str()`` of the storage's run path, or None if no storage
        """
        storage = self._storage
        return None if storage is None else str(storage.get_run_path(run_id))
133
+
134
+ __all__ = ["CacheManager"]