themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,317 @@
1
+ """Provider pricing database and cost calculation utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
# Pricing table for common LLM providers (prices per token in USD).
# Updated as of November 2024.
# Keys are canonical model names; values map token kind
# ("prompt_tokens" / "completion_tokens") to the USD price per single token.
# The "default" entry is the fallback used for unrecognised models.
PRICING_TABLE: dict[str, dict[str, float]] = {
    # OpenAI models
    "gpt-4": {
        "prompt_tokens": 0.00003,  # $30 per 1M tokens
        "completion_tokens": 0.00006,  # $60 per 1M tokens
    },
    "gpt-4-32k": {
        "prompt_tokens": 0.00006,
        "completion_tokens": 0.00012,
    },
    "gpt-4-turbo": {
        "prompt_tokens": 0.00001,  # $10 per 1M tokens
        "completion_tokens": 0.00003,  # $30 per 1M tokens
    },
    "gpt-4-turbo-preview": {
        "prompt_tokens": 0.00001,
        "completion_tokens": 0.00003,
    },
    "gpt-3.5-turbo": {
        "prompt_tokens": 0.0000005,  # $0.50 per 1M tokens
        "completion_tokens": 0.0000015,  # $1.50 per 1M tokens
    },
    "gpt-3.5-turbo-16k": {
        "prompt_tokens": 0.000003,
        "completion_tokens": 0.000004,
    },
    # Anthropic Claude models
    "claude-3-5-sonnet-20241022": {
        "prompt_tokens": 0.000003,  # $3 per 1M tokens
        "completion_tokens": 0.000015,  # $15 per 1M tokens
    },
    "claude-3-opus-20240229": {
        "prompt_tokens": 0.000015,  # $15 per 1M tokens
        "completion_tokens": 0.000075,  # $75 per 1M tokens
    },
    "claude-3-sonnet-20240229": {
        "prompt_tokens": 0.000003,
        "completion_tokens": 0.000015,
    },
    "claude-3-haiku-20240307": {
        "prompt_tokens": 0.00000025,  # $0.25 per 1M tokens
        "completion_tokens": 0.00000125,  # $1.25 per 1M tokens
    },
    # Google models
    "gemini-pro": {
        "prompt_tokens": 0.00000025,
        "completion_tokens": 0.0000005,
    },
    "gemini-1.5-pro": {
        "prompt_tokens": 0.00000125,  # $1.25 per 1M tokens
        "completion_tokens": 0.000005,  # $5 per 1M tokens
    },
    "gemini-1.5-flash": {
        "prompt_tokens": 0.000000075,  # $0.075 per 1M tokens
        "completion_tokens": 0.0000003,  # $0.30 per 1M tokens
    },
    # Mistral models
    "mistral-large-latest": {
        "prompt_tokens": 0.000002,  # $2 per 1M tokens
        "completion_tokens": 0.000006,  # $6 per 1M tokens
    },
    "mistral-medium-latest": {
        "prompt_tokens": 0.0000027,
        "completion_tokens": 0.0000081,
    },
    "mistral-small-latest": {
        "prompt_tokens": 0.000001,
        "completion_tokens": 0.000003,
    },
    # Cohere models
    "command-r-plus": {
        "prompt_tokens": 0.000003,
        "completion_tokens": 0.000015,
    },
    "command-r": {
        "prompt_tokens": 0.0000005,
        "completion_tokens": 0.0000015,
    },
    # Meta Llama (via various providers - using typical cloud pricing)
    "llama-3.1-70b": {
        "prompt_tokens": 0.00000088,
        "completion_tokens": 0.00000088,
    },
    "llama-3.1-8b": {
        "prompt_tokens": 0.0000002,
        "completion_tokens": 0.0000002,
    },
    # Default fallback for unknown models
    "default": {
        "prompt_tokens": 0.000001,
        "completion_tokens": 0.000002,
    },
}
102
+
103
# Model aliases and variations.
# Maps dated snapshots and shorthand identifiers to the canonical names used
# as keys in PRICING_TABLE, so every variant resolves to one pricing entry.
MODEL_ALIASES: dict[str, str] = {
    # OpenAI aliases
    "gpt-4-0613": "gpt-4",
    "gpt-4-0314": "gpt-4",
    "gpt-4-1106-preview": "gpt-4-turbo-preview",
    "gpt-4-0125-preview": "gpt-4-turbo-preview",
    "gpt-3.5-turbo-0613": "gpt-3.5-turbo",
    "gpt-3.5-turbo-0301": "gpt-3.5-turbo",
    "gpt-3.5-turbo-1106": "gpt-3.5-turbo",
    # Anthropic aliases
    "claude-3-opus": "claude-3-opus-20240229",
    "claude-3-sonnet": "claude-3-sonnet-20240229",
    "claude-3-haiku": "claude-3-haiku-20240307",
    "claude-3.5-sonnet": "claude-3-5-sonnet-20241022",
    # Google aliases
    "gemini-pro-1.0": "gemini-pro",
    "gemini-1.5-pro-latest": "gemini-1.5-pro",
    "gemini-1.5-flash-latest": "gemini-1.5-flash",
}
123
+
124
+
125
def normalize_model_name(model: str) -> str:
    """Normalize model name to canonical form.

    Args:
        model: Model identifier (may include provider prefix)

    Returns:
        Normalized model name

    Example:
        >>> normalize_model_name("openai/gpt-4-0613")
        'gpt-4'
        >>> normalize_model_name("claude-3-opus")
        'claude-3-opus-20240229'
    """
    # Drop a provider prefix if present (e.g. "openai/gpt-4" -> "gpt-4").
    # partition splits on the first "/", keeping everything after it.
    _, slash, remainder = model.partition("/")
    candidate = remainder if slash else model

    # Resolve known aliases to their canonical name; unknown names pass through.
    return MODEL_ALIASES.get(candidate, candidate)
148
+
149
+
150
def get_provider_pricing(model: str) -> dict[str, float]:
    """Get pricing for a model.

    Args:
        model: Model identifier

    Returns:
        Dict with 'prompt_tokens' and 'completion_tokens' prices per token
        (a copy, safe for the caller to mutate)

    Example:
        >>> pricing = get_provider_pricing("gpt-4")
        >>> print(f"Prompt: ${pricing['prompt_tokens'] * 1_000_000:.2f}/1M tokens")
        Prompt: $30.00/1M tokens
    """
    normalized = normalize_model_name(model)

    # Exact match against the canonical pricing table.
    if normalized in PRICING_TABLE:
        return PRICING_TABLE[normalized].copy()

    # Partial match (e.g. "gpt-4-turbo-2024-04-09" should resolve to
    # "gpt-4-turbo").  Try longer, more specific names first: plain
    # insertion-order iteration would let "gpt-4" capture every
    # "gpt-4-turbo-*" variant and return the wrong (3x higher) price.
    for known_model in sorted(PRICING_TABLE, key=len, reverse=True):
        if known_model == "default":
            continue  # the fallback sentinel must never partial-match
        if known_model in normalized or normalized.startswith(known_model):
            return PRICING_TABLE[known_model].copy()

    # Fallback to default pricing for unknown models.
    return PRICING_TABLE["default"].copy()
177
+
178
+
179
def calculate_cost(
    model: str,
    prompt_tokens: int,
    completion_tokens: int,
    pricing: dict[str, float] | None = None,
) -> float:
    """Calculate cost for a model completion.

    Args:
        model: Model identifier
        prompt_tokens: Number of prompt tokens
        completion_tokens: Number of completion tokens
        pricing: Optional custom pricing (if None, uses default pricing table)

    Returns:
        Total cost in USD

    Example:
        >>> cost = calculate_cost("gpt-4", 1000, 500)
        >>> print(f"Cost: ${cost:.4f}")
        Cost: $0.0600
    """
    # Resolve rates from the pricing table only when no override is supplied.
    rates = get_provider_pricing(model) if pricing is None else pricing

    return (
        prompt_tokens * rates["prompt_tokens"]
        + completion_tokens * rates["completion_tokens"]
    )
208
+
209
+
210
def compare_provider_costs(
    prompt_tokens: int,
    completion_tokens: int,
    models: list[str],
) -> dict[str, float]:
    """Compare costs across multiple providers for same workload.

    Args:
        prompt_tokens: Number of prompt tokens
        completion_tokens: Number of completion tokens
        models: List of model identifiers to compare

    Returns:
        Dict mapping model names to costs

    Example:
        >>> costs = compare_provider_costs(
        ...     1000, 500, ["gpt-4", "gpt-3.5-turbo", "claude-3-haiku"]
        ... )
        >>> for model, cost in sorted(costs.items(), key=lambda x: x[1]):
        ...     print(f"{model}: ${cost:.4f}")
        claude-3-haiku: $0.0009
        gpt-3.5-turbo: $0.0013
        gpt-4: $0.0600
    """
    # One entry per requested model; duplicates collapse to a single key.
    return {
        name: calculate_cost(name, prompt_tokens, completion_tokens)
        for name in models
    }
239
+
240
+
241
def estimate_tokens(text: str, chars_per_token: float = 4.0) -> int:
    """Estimate number of tokens from text.

    This is a rough approximation. For accurate token counts,
    use the model's tokenizer.

    Args:
        text: Input text
        chars_per_token: Average characters per token (default: 4.0)

    Returns:
        Estimated token count (0 for empty text, otherwise at least 1)

    Example:
        >>> text = "This is a sample text for token estimation."
        >>> tokens = estimate_tokens(text)
        >>> print(f"Estimated tokens: {tokens}")
        Estimated tokens: 10
    """
    # Empty input estimates to zero tokens, bypassing the minimum-of-1 floor.
    if not text:
        return 0
    # Truncating division; clamp to 1 so any non-empty text counts as a token.
    # NOTE: the previous docstring example claimed 11 for the 43-char sample,
    # but int(43 / 4.0) == 10 — the example output was wrong, not the code.
    return max(1, int(len(text) / chars_per_token))
263
+
264
+
265
def get_all_models() -> list[str]:
    """Get list of all models with known pricing.

    Returns:
        List of model identifiers (the "default" fallback entry is excluded)
    """
    # Preserve the pricing table's insertion order; skip the sentinel key.
    return [name for name in PRICING_TABLE if name != "default"]
272
+
273
+
274
def get_pricing_summary() -> dict[str, Any]:
    """Get summary of pricing for all models.

    Returns:
        Dict with model pricing information

    Example:
        >>> summary = get_pricing_summary()
        >>> print(f"Total models: {summary['total_models']}")
        >>> print(f"Cheapest: {summary['cheapest_model']}")
    """
    models = get_all_models()

    def _avg_cost(name: str) -> float:
        # Mean of prompt and completion per-token prices for one model.
        rates = PRICING_TABLE[name]
        return (rates["prompt_tokens"] + rates["completion_tokens"]) / 2

    # Ties resolve to the model listed first, matching table order.
    cheapest_name = min(models, key=_avg_cost)
    priciest_name = max(models, key=_avg_cost)

    return {
        "total_models": len(models),
        "cheapest_model": cheapest_name,
        "cheapest_avg_cost_per_token": _avg_cost(cheapest_name),
        "most_expensive_model": priciest_name,
        "most_expensive_avg_cost_per_token": _avg_cost(priciest_name),
        "models": models,
    }
305
+
306
+
307
# Explicit public API: the names exported by `from ... import *` and the
# contract other themis modules should rely on.
__all__ = [
    "PRICING_TABLE",
    "MODEL_ALIASES",
    "normalize_model_name",
    "get_provider_pricing",
    "calculate_cost",
    "compare_provider_costs",
    "estimate_tokens",
    "get_all_models",
    "get_pricing_summary",
]