themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,310 @@
1
+ """Cost tracking and estimation for LLM experiments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+
9
@dataclass
class CostBreakdown:
    """Detailed cost breakdown for an experiment.

    Attributes:
        total_cost: Total cost in USD
        generation_cost: Cost of generation API calls
        evaluation_cost: Cost of LLM-based evaluation (if applicable)
        per_sample_costs: List of costs per sample
        per_model_costs: Cost breakdown by model
        token_counts: Token usage statistics
        api_calls: Total number of API calls
        currency: Currency code (default: USD)
    """

    total_cost: float
    generation_cost: float
    evaluation_cost: float = 0.0
    per_sample_costs: list[float] = field(default_factory=list)
    per_model_costs: dict[str, float] = field(default_factory=dict)
    token_counts: dict[str, int] = field(default_factory=dict)
    api_calls: int = 0
    currency: str = "USD"

    def __post_init__(self):
        """Reject negative cost figures (total first, then components)."""
        # Same three checks, same order, same messages — expressed as a loop.
        for label, amount in (
            ("Total", self.total_cost),
            ("Generation", self.generation_cost),
            ("Evaluation", self.evaluation_cost),
        ):
            if amount < 0:
                raise ValueError(f"{label} cost cannot be negative")
41
+
42
+
43
@dataclass
class CostEstimate:
    """Cost estimate for an experiment.

    Attributes:
        estimated_cost: Expected cost in USD
        lower_bound: Lower bound of 95% confidence interval
        upper_bound: Upper bound of 95% confidence interval
        breakdown_by_phase: Cost breakdown by experiment phase
        assumptions: Assumptions used for estimation
        currency: Currency code (default: USD)
    """

    estimated_cost: float
    lower_bound: float
    upper_bound: float
    breakdown_by_phase: dict[str, float] = field(default_factory=dict)
    assumptions: dict[str, Any] = field(default_factory=dict)
    currency: str = "USD"

    def __post_init__(self):
        """Validate the estimate, mirroring CostBreakdown's validation style.

        Raises:
            ValueError: If estimated_cost is negative, or the confidence
                bounds are inverted (lower_bound > upper_bound).
        """
        if self.estimated_cost < 0:
            raise ValueError("Estimated cost cannot be negative")
        if self.lower_bound > self.upper_bound:
            raise ValueError("Lower bound cannot exceed upper bound")
62
+
63
+
64
class CostTracker:
    """Accumulates costs while an experiment runs.

    Generation and evaluation calls are recorded one by one; at any point the
    accumulated state can be summarized as a CostBreakdown with per-model,
    per-sample, and token-level detail.

    Example:
        >>> tracker = CostTracker()
        >>> tracker.record_generation("gpt-4", 100, 50, 0.0045)
        >>> tracker.record_generation("gpt-4", 120, 60, 0.0054)
        >>> breakdown = tracker.get_breakdown()
        >>> print(f"Total cost: ${breakdown.total_cost:.4f}")
    """

    def __init__(self):
        """Start with all accumulators empty."""
        self._generation_costs: list[tuple[str, float]] = []
        self._evaluation_costs: list[tuple[str, float]] = []
        self._token_counts: dict[str, int] = dict.fromkeys(
            ("prompt_tokens", "completion_tokens", "total_tokens"), 0
        )
        self._per_model_costs: dict[str, float] = {}
        self._per_sample_costs: list[float] = []
        self._api_calls: int = 0

    def record_generation(
        self,
        model: str,
        prompt_tokens: int,
        completion_tokens: int,
        cost: float,
    ) -> None:
        """Record one generation API call.

        Args:
            model: Model identifier
            prompt_tokens: Number of prompt tokens
            completion_tokens: Number of completion tokens
            cost: Cost in USD
        """
        self._generation_costs.append((model, cost))
        counts = self._token_counts
        counts["prompt_tokens"] += prompt_tokens
        counts["completion_tokens"] += completion_tokens
        counts["total_tokens"] += prompt_tokens + completion_tokens
        running = self._per_model_costs.get(model, 0.0)
        self._per_model_costs[model] = running + cost
        self._per_sample_costs.append(cost)
        self._api_calls += 1

    def record_evaluation(self, metric: str, cost: float) -> None:
        """Record one LLM-based evaluation call.

        Args:
            metric: Metric name that incurred the cost
            cost: Cost in USD
        """
        self._evaluation_costs.append((metric, cost))
        # Evaluation calls count toward the API-call total as well.
        self._api_calls += 1

    def get_breakdown(self) -> CostBreakdown:
        """Summarize everything recorded so far.

        Returns:
            CostBreakdown with all accumulated costs. Containers are copied,
            so mutating the returned object does not affect the tracker.
        """
        generation_total = sum(amount for _, amount in self._generation_costs)
        evaluation_total = sum(amount for _, amount in self._evaluation_costs)
        return CostBreakdown(
            total_cost=generation_total + evaluation_total,
            generation_cost=generation_total,
            evaluation_cost=evaluation_total,
            per_sample_costs=list(self._per_sample_costs),
            per_model_costs=dict(self._per_model_costs),
            token_counts=dict(self._token_counts),
            api_calls=self._api_calls,
        )

    def reset(self) -> None:
        """Forget all recorded costs, returning to the freshly-built state."""
        self._generation_costs.clear()
        self._evaluation_costs.clear()
        self._token_counts = dict.fromkeys(
            ("prompt_tokens", "completion_tokens", "total_tokens"), 0
        )
        self._per_model_costs.clear()
        self._per_sample_costs.clear()
        self._api_calls = 0
157
+
158
+
159
class BudgetMonitor:
    """Monitor and enforce budget limits during experiments.

    Example:
        >>> monitor = BudgetMonitor(max_cost=10.0, alert_threshold=0.8)
        >>> monitor.add_cost(8.5)
        >>> within_budget, message = monitor.check_budget()
        >>> print(message)  # "Warning: 85% of budget used ($8.50 / $10.00)"
        >>> monitor.add_cost(4.0)  # Exceeds budget
        >>> within_budget, message = monitor.check_budget()
        >>> print(message)  # "Budget exceeded: $12.50 >= $10.00"
    """

    def __init__(self, max_cost: float, alert_threshold: float = 0.8):
        """Initialize budget monitor.

        Args:
            max_cost: Maximum allowed cost in USD
            alert_threshold: Threshold (0.0-1.0) for warning alerts

        Raises:
            ValueError: If max_cost is negative or alert_threshold is invalid
        """
        if max_cost < 0:
            raise ValueError("Max cost cannot be negative")
        if not 0.0 <= alert_threshold <= 1.0:
            raise ValueError("Alert threshold must be between 0.0 and 1.0")

        self.max_cost = max_cost
        self.alert_threshold = alert_threshold
        self.current_cost = 0.0

    def add_cost(self, cost: float) -> None:
        """Add cost to current total.

        Args:
            cost: Cost to add in USD
        """
        self.current_cost += cost

    def check_budget(self) -> tuple[bool, str]:
        """Check if budget is within limits.

        Returns:
            Tuple of (within_budget, message)
            - within_budget: True if under max_cost
            - message: Status message or warning
        """
        # Hitting the cap exactly counts as exceeded (>=, not >).
        if self.current_cost >= self.max_cost:
            return (
                False,
                f"Budget exceeded: ${self.current_cost:.2f} >= ${self.max_cost:.2f}",
            )

        # Warn once usage crosses alert_threshold of the cap.
        if self.current_cost >= self.max_cost * self.alert_threshold:
            percentage = (self.current_cost / self.max_cost) * 100
            return (
                True,
                f"Warning: {percentage:.0f}% of budget used "
                f"(${self.current_cost:.2f} / ${self.max_cost:.2f})",
            )

        return True, "Budget OK"

    def remaining_budget(self) -> float:
        """Get remaining budget.

        Returns:
            Remaining budget in USD (may be negative if exceeded)
        """
        return self.max_cost - self.current_cost

    def percentage_used(self) -> float:
        """Get percentage of budget used.

        Returns:
            Percentage (0.0-100.0+) of budget used
        """
        # Guard the zero-budget edge case to avoid ZeroDivisionError.
        if self.max_cost == 0:
            return 100.0 if self.current_cost > 0 else 0.0
        return (self.current_cost / self.max_cost) * 100
240
+
241
+
242
def estimate_experiment_cost(
    model: str,
    dataset_size: int,
    avg_prompt_tokens: int = 500,
    avg_completion_tokens: int = 300,
    confidence_level: float = 0.95,
) -> CostEstimate:
    """Estimate total cost for an experiment.

    Args:
        model: Model identifier
        dataset_size: Number of samples in dataset
        avg_prompt_tokens: Average prompt tokens per sample
        avg_completion_tokens: Average completion tokens per sample
        confidence_level: Confidence level for bounds (default: 0.95)

    Returns:
        CostEstimate with expected cost and confidence bounds

    Example:
        >>> estimate = estimate_experiment_cost("gpt-4", 100, 500, 300)
        >>> print(f"Estimated cost: ${estimate.estimated_cost:.2f}")
        >>> print(f"Range: ${estimate.lower_bound:.2f} - ${estimate.upper_bound:.2f}")
    """
    # Local import — presumably to avoid an import-time cycle with the
    # pricing module; confirm before hoisting to module level.
    from themis.experiment.pricing import calculate_cost

    # Expected cost scales linearly with the number of samples.
    cost_per_sample = calculate_cost(model, avg_prompt_tokens, avg_completion_tokens)
    estimated_cost = cost_per_sample * dataset_size

    # Heuristic bounds: assume ~20% variance in realized token usage and
    # scale the margin by the confidence level. The original expression
    # `variance_factor * (1 - (1 - confidence_level))` reduces algebraically
    # to `variance_factor * confidence_level`; written directly for clarity
    # (same values for the usual confidence levels >= 0.5).
    variance_factor = 0.2
    margin = estimated_cost * variance_factor * confidence_level

    # Costs cannot go below zero, so clamp the lower bound.
    lower_bound = max(0.0, estimated_cost - margin)
    upper_bound = estimated_cost + margin

    breakdown = {
        "generation": estimated_cost,
        "evaluation": 0.0,  # no LLM-based evaluation assumed
    }

    # Record the inputs so downstream reports can show how the estimate
    # was derived.
    assumptions = {
        "model": model,
        "dataset_size": dataset_size,
        "avg_prompt_tokens": avg_prompt_tokens,
        "avg_completion_tokens": avg_completion_tokens,
        "cost_per_sample": cost_per_sample,
        "confidence_level": confidence_level,
    }

    return CostEstimate(
        estimated_cost=estimated_cost,
        lower_bound=lower_bound,
        upper_bound=upper_bound,
        breakdown_by_phase=breakdown,
        assumptions=assumptions,
    )
302
+
303
+
304
# Public API of this module, in definition order.
__all__ = [
    "CostBreakdown",
    "CostEstimate",
    "CostTracker",
    "BudgetMonitor",
    "estimate_experiment_cost",
]
@@ -0,0 +1,62 @@
1
+ """Shared experiment definitions used by the builder."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Any, Callable, Sequence
7
+
8
+ from themis.core import entities as core_entities
9
+
10
+ if TYPE_CHECKING:
11
+ from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
12
+ from themis.experiment.orchestrator import ExperimentOrchestrator
13
+ from themis.experiment.storage import ExperimentStorage
14
+ from themis.generation.plan import GenerationPlan
15
+ from themis.generation.runner import GenerationRunner
16
+ from themis.interfaces import ModelProvider
17
+
18
+
19
@dataclass
class ModelBinding:
    """Binding of a model spec to a named provider and its options."""

    # Model identity/configuration from themis.core.entities.
    spec: core_entities.ModelSpec
    # Name of the provider to use — presumably a key into the provider
    # registry; confirm against themis.providers.
    provider_name: str
    # Extra keyword options passed to the provider (empty by default).
    provider_options: dict[str, Any] = field(default_factory=dict)
24
+
25
+
26
@dataclass
class ExperimentDefinition:
    """Declarative description of an experiment, consumed by the builder."""

    # Prompt templates to run; element type is not pinned here — presumably
    # generation templates, confirm against themis.generation.templates.
    templates: Sequence
    # Sampling configurations to run for each template.
    sampling_parameters: Sequence[core_entities.SamplingConfig]
    # Models (with their providers) the experiment targets.
    model_bindings: Sequence[ModelBinding]
    # Name of the dataset field holding each sample's id.
    dataset_id_field: str = "id"
    # Name of the dataset field holding the reference answer; None to disable.
    reference_field: str | None = "expected"
    # Additional dataset fields to carry through as metadata.
    metadata_fields: Sequence[str] = field(default_factory=tuple)
    # Optional hook mapping a raw sample dict to template context variables.
    context_builder: Callable[[dict[str, Any]], dict[str, Any]] | None = None
35
+
36
+
37
@dataclass
class BuiltExperiment:
    """Built experiment with all components assembled.

    Attributes:
        plan: Generation plan for expanding tasks from dataset samples
        runner: Generation runner for executing tasks via providers
        pipeline: Evaluation pipeline for scoring outputs
        storage: Optional experiment storage for caching and resumability
        router: Provider router for dispatching to correct LLM provider
        orchestrator: Main orchestrator coordinating generation and evaluation
    """

    # `from __future__ import annotations` is in effect for this module, so
    # the TYPE_CHECKING-only names below can be referenced directly — the
    # string quoting previously used here was redundant under PEP 563.
    plan: GenerationPlan
    runner: GenerationRunner
    pipeline: EvaluationPipeline
    storage: ExperimentStorage | None
    router: ModelProvider
    orchestrator: ExperimentOrchestrator
56
+
57
+
58
# Public API of this module, in definition order.
__all__ = [
    "ModelBinding",
    "ExperimentDefinition",
    "BuiltExperiment",
]