themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,49 @@
1
+ """Evaluation pipeline orchestration.
2
+
3
+ This module provides two complementary pipeline styles:
4
+
5
+ 1. EvaluationPipeline: Traditional batch evaluation with extractors, metrics, and strategies
6
+ 2. ComposableEvaluationPipeline: Chainable builder pattern for composing evaluation steps
7
+
8
+ Example (Traditional):
9
+ >>> pipeline = EvaluationPipeline(
10
+ ... extractor=JsonFieldExtractor("answer"),
11
+ ... metrics=[ExactMatch()]
12
+ ... )
13
+ >>> report = pipeline.evaluate(records)
14
+
15
+ Example (Composable):
16
+ >>> pipeline = (
17
+ ... ComposableEvaluationPipeline()
18
+ ... .extract(JsonFieldExtractor("answer"))
19
+ ... .validate(lambda x: isinstance(x, str), "Must be string")
20
+ ... .transform(lambda x: x.strip().lower(), name="normalize")
21
+ ... .compute_metrics([ExactMatch()], references=["42"])
22
+ ... )
23
+ >>> result = pipeline.evaluate(record)
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ # Re-export pipeline implementations for backward compatibility
29
+ from themis.evaluation.pipelines.composable_pipeline import (
30
+ ComposableEvaluationPipeline,
31
+ EvaluationResult,
32
+ EvaluationStep,
33
+ )
34
+ from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
35
+ from themis.evaluation.reports import (
36
+ EvaluationFailure,
37
+ EvaluationReport,
38
+ MetricAggregate,
39
+ )
40
+
41
+ __all__ = [
42
+ "EvaluationPipeline",
43
+ "ComposableEvaluationPipeline",
44
+ "EvaluationStep",
45
+ "EvaluationResult",
46
+ "MetricAggregate",
47
+ "EvaluationReport",
48
+ "EvaluationFailure",
49
+ ]
@@ -0,0 +1,15 @@
1
+ """Evaluation pipeline implementations."""
2
+
3
+ from themis.evaluation.pipelines.composable_pipeline import (
4
+ ComposableEvaluationPipeline,
5
+ EvaluationResult,
6
+ EvaluationStep,
7
+ )
8
+ from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
9
+
10
+ __all__ = [
11
+ "EvaluationPipeline",
12
+ "ComposableEvaluationPipeline",
13
+ "EvaluationStep",
14
+ "EvaluationResult",
15
+ ]
@@ -0,0 +1,357 @@
1
+ """Composable evaluation pipeline with chainable steps."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Callable, Generic, Sequence, TypeVar
7
+
8
+ from themis.core import entities as core_entities
9
+ from themis.interfaces import Metric as MetricInterface
10
+ from themis.utils import tracing
11
+
12
+ # Type variables for composable pipeline
13
+ T = TypeVar("T")
14
+ U = TypeVar("U")
15
+
16
+
17
@dataclass
class EvaluationStep(Generic[T, U]):
    """One named stage of an evaluation pipeline.

    Wraps a processor callable mapping an input of type T to an output
    of type U, plus an optional error handler that is consulted when
    the processor raises.

    Attributes:
        name: Human-readable identifier for the step.
        processor: Callable that transforms the input value.
        error_handler: Optional callable given the raised exception; a
            non-None return value is used as the step's recovered result.
    """

    name: str
    processor: Callable[[T], U]
    error_handler: Callable[[Exception], U | None] | None = None

    def execute(self, value: T) -> tuple[U | None, str | None]:
        """Run the processor on *value*.

        Args:
            value: Input passed to the processor.

        Returns:
            ``(result, None)`` on success (including recovery via the
            error handler), or ``(None, message)`` on unhandled failure.
        """
        try:
            return self.processor(value), None
        except Exception as exc:
            handler = self.error_handler
            if handler is not None:
                recovered = handler(exc)
                if recovered is not None:
                    return recovered, None
            return None, str(exc)
52
+
53
+
54
@dataclass
class EvaluationResult:
    """Outcome of running one generation record through the pipeline.

    Attributes:
        record: The generation record that was evaluated.
        scores: Metric scores produced by the final step.
        errors: Error messages collected during execution.
        intermediate_values: Per-step outputs keyed by step name.
    """

    record: core_entities.GenerationRecord
    scores: list[core_entities.MetricScore]
    errors: list[str]
    intermediate_values: dict[str, Any] = field(default_factory=dict)

    def is_success(self) -> bool:
        """Report whether the evaluation completed usefully.

        Returns:
            True when no errors were recorded and at least one score exists.
        """
        return not self.errors and len(self.scores) > 0
77
+
78
+
79
class ComposableEvaluationPipeline:
    """Builder-style pipeline assembled from chainable evaluation steps.

    Steps are appended through the fluent methods below and executed in
    order by :meth:`evaluate`:

    1. Extraction (get answer from raw output)
    2. Validation (check format/constraints)
    3. Transformation (normalize, clean, convert)
    4. Metric computation (compare against references)

    Each step may carry its own error handling; intermediate values are
    captured per step, and the first failure stops the run.

    Example:
        >>> pipeline = (
        ...     ComposableEvaluationPipeline()
        ...     .extract(RegexExtractor(r"(\\d+)"))
        ...     .validate(lambda x: x.isdigit(), "Must be numeric")
        ...     .transform(int, name="parse_int")
        ...     .compute_metrics([NumericMatch()], references=[42])
        ... )
    """

    def __init__(self):
        """Create a pipeline with no steps configured."""
        self._steps: list[EvaluationStep] = []

    def add_step(self, step: EvaluationStep) -> ComposableEvaluationPipeline:
        """Append *step* to the pipeline (builder pattern).

        Args:
            step: Evaluation step to append.

        Returns:
            This pipeline, enabling call chaining.
        """
        self._steps.append(step)
        return self

    def extract(
        self,
        extractor: Any,
        error_handler: Callable[[Exception], Any | None] | None = None,
    ) -> ComposableEvaluationPipeline:
        """Append an extraction step backed by *extractor*.

        Args:
            extractor: Object exposing an ``extract`` method.
            error_handler: Optional recovery callable for extraction errors.

        Returns:
            This pipeline, enabling call chaining.
        """
        step = EvaluationStep(
            name=f"extract_{extractor.__class__.__name__}",
            processor=extractor.extract,
            error_handler=error_handler,
        )
        return self.add_step(step)

    def validate(
        self, validator: Callable[[Any], bool], error_message: str = "Validation failed"
    ) -> ComposableEvaluationPipeline:
        """Append a validation step.

        Args:
            validator: Predicate returning True for acceptable values.
            error_message: Message of the ValueError raised on rejection.

        Returns:
            This pipeline, enabling call chaining.
        """

        def _check(value):
            # Pass the value through unchanged when the predicate accepts it.
            if validator(value):
                return value
            raise ValueError(error_message)

        return self.add_step(EvaluationStep(name="validate", processor=_check))

    def transform(
        self,
        transformer: Callable[[Any], Any],
        name: str = "transform",
        error_handler: Callable | None = None,
    ) -> ComposableEvaluationPipeline:
        """Append a transformation step.

        Args:
            transformer: Callable applied to the current value.
            name: Name recorded for this step.
            error_handler: Optional recovery callable.

        Returns:
            This pipeline, enabling call chaining.
        """
        step = EvaluationStep(
            name=name,
            processor=transformer,
            error_handler=error_handler,
        )
        return self.add_step(step)

    def conditional_step(
        self,
        condition: Callable[[Any], bool],
        step_if_true: EvaluationStep,
        step_if_false: EvaluationStep | None = None,
    ) -> ComposableEvaluationPipeline:
        """Append a branching step selected by *condition* at run time.

        Args:
            condition: Predicate deciding which branch runs.
            step_if_true: Step executed when the condition holds.
            step_if_false: Step executed otherwise; when omitted the value
                passes through unchanged.

        Returns:
            This pipeline, enabling call chaining.
        """

        def _branch(value):
            if condition(value):
                result, error = step_if_true.execute(value)
                if error:
                    raise ValueError(f"True branch failed: {error}")
                return result
            if step_if_false:
                result, error = step_if_false.execute(value)
                if error:
                    raise ValueError(f"False branch failed: {error}")
                return result
            # No false branch configured: identity passthrough.
            return value

        return self.add_step(
            EvaluationStep(
                name=f"conditional_{step_if_true.name}",
                processor=_branch,
            )
        )

    def compute_metrics(
        self,
        metrics: Sequence[MetricInterface],
        references: Sequence[Any],
        metadata: dict[str, Any] | None = None,
    ) -> ComposableEvaluationPipeline:
        """Append a metric-computation step (typically the final step).

        Args:
            metrics: Metrics to compute over the current value.
            references: Reference values the metrics compare against.
            metadata: Optional metadata forwarded to each metric.

        Returns:
            This pipeline, enabling call chaining.
        """

        def _score(prediction):
            # NOTE: `metadata or {}` is evaluated per metric call so each
            # metric sees a fresh dict when no metadata was supplied.
            return [
                metric.compute(
                    prediction=prediction,
                    references=references,
                    metadata=metadata or {},
                )
                for metric in metrics
            ]

        return self.add_step(EvaluationStep(name="compute_metrics", processor=_score))

    def evaluate(self, record: core_entities.GenerationRecord) -> EvaluationResult:
        """Execute every configured step against *record*'s output text.

        Args:
            record: Generation record to evaluate.

        Returns:
            EvaluationResult with scores, errors, and per-step values.
        """
        if record.output is None:
            return EvaluationResult(
                record=record,
                scores=[],
                errors=["Missing model output"],
                intermediate_values={},
            )

        current = record.output.text
        captured = {"raw_output": current}
        problems = []

        with tracing.span("composable_pipeline_evaluate", num_steps=len(self._steps)):
            for step in self._steps:
                try:
                    with tracing.span(f"eval_step_{step.name}"):
                        outcome, failure = step.execute(current)

                    if failure:
                        # First failure aborts the run with state so far.
                        problems.append(f"{step.name}: {failure}")
                        return EvaluationResult(
                            record=record,
                            scores=[],
                            errors=problems,
                            intermediate_values=captured,
                        )

                    if outcome is not None:
                        current = outcome
                        captured[step.name] = current

                except Exception as exc:
                    problems.append(f"{step.name}: {str(exc)}")
                    return EvaluationResult(
                        record=record,
                        scores=[],
                        errors=problems,
                        intermediate_values=captured,
                    )

        # If compute_metrics ran last, `current` is a list of scores;
        # keep only genuine MetricScore objects.
        candidates = current if isinstance(current, list) else []
        metric_scores = [s for s in candidates if isinstance(s, core_entities.MetricScore)]

        return EvaluationResult(
            record=record,
            scores=metric_scores,
            errors=problems,
            intermediate_values=captured,
        )

    def evaluate_batch(
        self, records: Sequence[core_entities.GenerationRecord]
    ) -> list[EvaluationResult]:
        """Evaluate each record in *records*, in order.

        Args:
            records: Generation records to evaluate.

        Returns:
            One EvaluationResult per input record.
        """
        with tracing.span("composable_pipeline_batch", num_records=len(records)):
            return [self.evaluate(record) for record in records]

    def get_step_names(self) -> list[str]:
        """List the names of the configured steps, in pipeline order.

        Returns:
            Step names as a list of strings.
        """
        names = []
        for step in self._steps:
            names.append(step.name)
        return names

    def clear(self) -> ComposableEvaluationPipeline:
        """Drop every configured step.

        Returns:
            This pipeline, enabling call chaining.
        """
        del self._steps[:]
        return self