themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,376 @@
1
+ """Cost tracking utilities for monitoring LLM API usage and costs.
2
+
3
+ This module provides tools to track token usage, API costs, and generate
4
+ cost reports across experiments and providers.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Dict, List
13
+
14
+ from themis.core import entities as core_entities
15
+
16
# Built-in price table (as of 2024), expressed in USD per 1M tokens.
# Each key is a model name; each value is the pair
# (input_cost_per_1m, output_cost_per_1m).
DEFAULT_PRICING: Dict[str, tuple[float, float]] = {
    # --- OpenAI: GPT-4 family ---
    "gpt-4": (30.0, 60.0),
    "gpt-4-turbo": (10.0, 30.0),
    "gpt-4o": (2.5, 10.0),
    "gpt-4o-mini": (0.15, 0.60),
    # --- OpenAI: GPT-3.5 ---
    "gpt-3.5-turbo": (0.5, 1.5),
    # --- Anthropic: Claude ---
    "claude-3-opus-20240229": (15.0, 75.0),
    "claude-3-sonnet-20240229": (3.0, 15.0),
    "claude-3-haiku-20240307": (0.25, 1.25),
    "claude-3-5-sonnet-20241022": (3.0, 15.0),
    # --- Google: Gemini ---
    "gemini-1.5-pro": (1.25, 5.0),
    "gemini-1.5-flash": (0.075, 0.30),
    # --- Meta: Llama (served by cloud providers; approximate figures) ---
    "llama-3-70b": (0.9, 0.9),
    "llama-3-8b": (0.2, 0.2),
    # --- Fake/local models: free ---
    "fake": (0.0, 0.0),
}
40
+
41
+
42
@dataclass
class TokenUsage:
    """Prompt and completion token counts for one API call.

    Attributes:
        input_tokens: Number of input/prompt tokens.
        output_tokens: Number of output/completion tokens.
    """

    input_tokens: int
    output_tokens: int

    @property
    def total_tokens(self) -> int:
        """Return the combined count: ``input_tokens + output_tokens``."""
        prompt_count, completion_count = self.input_tokens, self.output_tokens
        return prompt_count + completion_count
58
+
59
+
60
@dataclass
class CostRecord:
    """The priced outcome of a single generation.

    Attributes:
        model_identifier: Name/identifier of the model that was called.
        provider: Name of the provider.
        usage: Token usage statistics for the call.
        input_cost: USD charged for the input tokens.
        output_cost: USD charged for the output tokens.
        total_cost: USD charged in total.
        metadata: Free-form extra data (e.g., timestamp, run_id); each
            instance gets its own empty dict when none is supplied.
    """

    model_identifier: str
    provider: str
    usage: TokenUsage
    input_cost: float
    output_cost: float
    total_cost: float
    # default_factory gives every record a fresh dict rather than a shared one.
    metadata: Dict[str, object] = field(default_factory=dict)
81
+
82
+
83
@dataclass
class CostSummary:
    """Totals rolled up over a set of generations.

    Attributes:
        total_cost: Overall spend in USD.
        total_tokens: Overall token count.
        total_input_tokens: Overall input-token count.
        total_output_tokens: Overall output-token count.
        num_requests: How many API requests were counted.
        cost_by_model: USD spend keyed by model.
        cost_by_provider: USD spend keyed by provider.
    """

    total_cost: float
    total_tokens: int
    total_input_tokens: int
    total_output_tokens: int
    num_requests: int
    cost_by_model: Dict[str, float]
    cost_by_provider: Dict[str, float]
104
+
105
+
106
class CostTracker:
    """Track and compute costs for LLM API usage.

    Each tracked generation is appended to ``records`` as a ``CostRecord``.
    Prices are looked up in ``pricing``, a mapping from model name to an
    ``(input_cost_per_1m, output_cost_per_1m)`` tuple in USD.
    """

    def __init__(
        self,
        pricing: Dict[str, tuple[float, float]] | None = None,
    ) -> None:
        """Initialize cost tracker.

        Args:
            pricing: Custom pricing dictionary mapping model names to
                (input_cost_per_1m, output_cost_per_1m) tuples. When it is
                ``None`` or empty, a copy of DEFAULT_PRICING is used. A
                non-empty dict is stored as-is (not copied), so
                ``add_pricing`` will mutate it.
        """
        self.pricing = pricing or DEFAULT_PRICING.copy()
        self.records: List[CostRecord] = []

    def add_pricing(
        self,
        model: str,
        input_cost_per_1m: float,
        output_cost_per_1m: float,
    ) -> None:
        """Add or update pricing for a model.

        Args:
            model: Model identifier
            input_cost_per_1m: Cost per 1M input tokens in USD
            output_cost_per_1m: Cost per 1M output tokens in USD
        """
        self.pricing[model] = (input_cost_per_1m, output_cost_per_1m)

    def track_generation(
        self,
        record: core_entities.GenerationRecord,
        input_tokens: int | None = None,
        output_tokens: int | None = None,
    ) -> CostRecord:
        """Price one generation record and append it to ``records``.

        Args:
            record: Generation record to track. Read here:
                ``record.task.model.identifier``, ``record.task.model.provider``,
                ``record.task.prompt.text``, ``record.task.metadata`` and
                ``record.output`` (with ``.text`` when truthy).
            input_tokens: Number of input tokens (if None, estimated from
                the prompt text).
            output_tokens: Number of output tokens (if None, estimated from
                the output text, or 0 when the record has no output).

        Returns:
            The newly stored CostRecord. Its metadata carries the
            ``sample_id`` and ``run_id`` entries of the task metadata
            (``None`` when absent).
        """
        model_id = record.task.model.identifier
        provider = record.task.model.provider

        # Explicit counts win; otherwise fall back to the length heuristic.
        if input_tokens is None:
            input_tokens = self._estimate_tokens(record.task.prompt.text)
        if output_tokens is None:
            output_tokens = (
                self._estimate_tokens(record.output.text) if record.output else 0
            )

        usage = TokenUsage(input_tokens=input_tokens, output_tokens=output_tokens)
        input_cost, output_cost = self._compute_cost(model_id, usage)

        cost_record = CostRecord(
            model_identifier=model_id,
            provider=provider,
            usage=usage,
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=input_cost + output_cost,
            metadata={
                "sample_id": record.task.metadata.get("sample_id"),
                "run_id": record.task.metadata.get("run_id"),
            },
        )

        self.records.append(cost_record)
        return cost_record

    def get_summary(self) -> CostSummary:
        """Compute aggregated cost summary across all tracked records.

        Returns:
            CostSummary with total cost, token totals, request count and
            per-model / per-provider cost breakdowns. All totals are zero
            and both breakdowns are empty when nothing has been tracked.
        """
        if not self.records:
            return CostSummary(
                total_cost=0.0,
                total_tokens=0,
                total_input_tokens=0,
                total_output_tokens=0,
                num_requests=0,
                cost_by_model={},
                cost_by_provider={},
            )

        total_cost = sum(r.total_cost for r in self.records)
        total_input_tokens = sum(r.usage.input_tokens for r in self.records)
        total_output_tokens = sum(r.usage.output_tokens for r in self.records)

        # Build both breakdowns in a single pass over the records.
        cost_by_model: Dict[str, float] = {}
        cost_by_provider: Dict[str, float] = {}
        for r in self.records:
            cost_by_model[r.model_identifier] = (
                cost_by_model.get(r.model_identifier, 0.0) + r.total_cost
            )
            cost_by_provider[r.provider] = (
                cost_by_provider.get(r.provider, 0.0) + r.total_cost
            )

        return CostSummary(
            total_cost=total_cost,
            total_tokens=total_input_tokens + total_output_tokens,
            total_input_tokens=total_input_tokens,
            total_output_tokens=total_output_tokens,
            num_requests=len(self.records),
            cost_by_model=cost_by_model,
            cost_by_provider=cost_by_provider,
        )

    def export_records(self, path: str | Path) -> None:
        """Export cost records to JSON file.

        Missing parent directories are created. The file holds a
        ``"records"`` list (one object per tracked record) and a
        ``"summary"`` object with ``total_cost``, ``total_tokens`` and
        ``num_requests``. Record metadata is written verbatim, so its
        values must be JSON-serializable.

        Args:
            path: Output file path
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        # Aggregate once instead of once per summary field.
        summary = self.get_summary()

        data = {
            "records": [
                {
                    "model": r.model_identifier,
                    "provider": r.provider,
                    "input_tokens": r.usage.input_tokens,
                    "output_tokens": r.usage.output_tokens,
                    "total_tokens": r.usage.total_tokens,
                    "input_cost": r.input_cost,
                    "output_cost": r.output_cost,
                    "total_cost": r.total_cost,
                    "metadata": r.metadata,
                }
                for r in self.records
            ],
            "summary": {
                "total_cost": summary.total_cost,
                "total_tokens": summary.total_tokens,
                "num_requests": len(self.records),
            },
        }

        # Explicit encoding keeps the output independent of the locale.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def _compute_cost(
        self,
        model: str,
        usage: TokenUsage,
    ) -> tuple[float, float]:
        """Compute input and output costs for a model.

        Lookup order: exact model name, then the longest pricing key that
        is a prefix of the model name, then a generic $1 per 1M tokens.

        Args:
            model: Model identifier
            usage: Token usage statistics

        Returns:
            Tuple of (input_cost, output_cost) in USD
        """
        pricing = self.pricing.get(model)

        if pricing is None:
            # Prefer the most specific prefix: "gpt-4o-2024-08-06" must be
            # priced as "gpt-4o", not as the shorter "gpt-4" that merely
            # happens to come first in the table.
            best_key = max(
                (key for key in self.pricing if model.startswith(key)),
                key=len,
                default=None,
            )
            if best_key is not None:
                pricing = self.pricing[best_key]

        if pricing is None:
            # Unknown model: use a reasonable default ($1 per 1M tokens).
            pricing = (1.0, 1.0)

        input_cost_per_1m, output_cost_per_1m = pricing

        input_cost = (usage.input_tokens / 1_000_000) * input_cost_per_1m
        output_cost = (usage.output_tokens / 1_000_000) * output_cost_per_1m

        return input_cost, output_cost

    @staticmethod
    def _estimate_tokens(text: str) -> int:
        """Rough estimate of token count from text.

        Uses a simple heuristic: ~4 characters per token on average, with
        a floor of 1 (so even empty text counts as one token).
        For accurate counts, use provider-specific tokenizers.

        Args:
            text: Input text

        Returns:
            Estimated token count (always >= 1)
        """
        return max(1, len(text) // 4)
321
+
322
+
323
def format_cost_summary(summary: CostSummary) -> str:
    """Render a cost summary as a multi-line, human-readable report.

    The report opens with the overall totals. A per-model and a
    per-provider breakdown follow, each only when its mapping is
    non-empty, with entries ordered from most to least expensive and
    annotated with their share of the total cost.

    Args:
        summary: Cost summary to format

    Returns:
        The report as a single newline-joined string
    """
    grand_total = summary.total_cost

    def breakdown(title, costs):
        # One titled section: header, rule, then rows by descending cost.
        rows = [title, "-" * 50]
        ranked = sorted(costs.items(), key=lambda item: item[1], reverse=True)
        for name, amount in ranked:
            # Guard the division so a zero total reports 0% instead of failing.
            share = (amount / grand_total * 100) if grand_total > 0 else 0
            rows.append(f" {name:30s} ${amount:8.4f} ({share:5.1f}%)")
        return rows

    report = [
        "Cost Summary",
        "=" * 50,
        f"Total Cost: ${summary.total_cost:.4f}",
        f"Total Tokens: {summary.total_tokens:,}",
        f" Input Tokens: {summary.total_input_tokens:,}",
        f" Output Tokens: {summary.total_output_tokens:,}",
        f"API Requests: {summary.num_requests:,}",
        "",
    ]

    if summary.cost_by_model:
        report.extend(breakdown("Cost by Model:", summary.cost_by_model))
        report.append("")

    if summary.cost_by_provider:
        report.extend(breakdown("Cost by Provider:", summary.cost_by_provider))

    return "\n".join(report)
367
+
368
+
369
# Names exported by ``from <module> import *``: the data classes, the
# tracker, the built-in price table and the report formatter.
__all__ = [
    "TokenUsage",
    "CostRecord",
    "CostSummary",
    "CostTracker",
    "DEFAULT_PRICING",
    "format_cost_summary",
]