themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,214 @@
1
+ """Cost estimation and tracking commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Annotated
6
+
7
+ from cyclopts import Parameter
8
+
9
+ from themis.experiment.cost import estimate_experiment_cost
10
+ from themis.experiment.pricing import (
11
+ compare_provider_costs,
12
+ get_all_models,
13
+ get_provider_pricing,
14
+ )
15
+
16
+
17
def estimate_cost_command(
    *,
    model: Annotated[
        str, Parameter(help="Model identifier (e.g., gpt-4, claude-3-5-sonnet)")
    ],
    dataset_size: Annotated[int, Parameter(help="Number of samples in dataset")],
    avg_prompt_tokens: Annotated[
        int, Parameter(help="Average prompt tokens per sample")
    ] = 500,
    avg_completion_tokens: Annotated[
        int, Parameter(help="Average completion tokens per sample")
    ] = 300,
) -> int:
    """Estimate cost for an experiment before running.

    Calls ``estimate_experiment_cost`` with the given workload and prints the
    total, the per-sample cost, the lower/upper bounds and the per-phase
    breakdown. A warning is printed when the estimate exceeds $10.

    Returns:
        0 when the estimate was computed and printed, 1 when anything in the
        estimation or reporting raised (the error message is printed).

    Examples:
        # Estimate cost for 100 samples with GPT-4
        uv run python -m themis.cli estimate-cost \\
            --model gpt-4 \\
            --dataset-size 100

        # Custom token estimates
        uv run python -m themis.cli estimate-cost \\
            --model claude-3-5-sonnet-20241022 \\
            --dataset-size 1000 \\
            --avg-prompt-tokens 800 \\
            --avg-completion-tokens 400
    """
    rule = "=" * 80

    # The whole report stays inside the try block on purpose: a missing
    # assumptions key or breakdown is reported as an error exit code rather
    # than a traceback, since this is the CLI boundary.
    try:
        estimate = estimate_experiment_cost(
            model=model,
            dataset_size=dataset_size,
            avg_prompt_tokens=avg_prompt_tokens,
            avg_completion_tokens=avg_completion_tokens,
        )

        print(rule)
        print("Cost Estimate")
        print(rule)
        print(f"\nModel: {model}")
        print(f"Dataset size: {dataset_size} samples")
        print(
            f"Avg tokens per sample: {avg_prompt_tokens} prompt + {avg_completion_tokens} completion"
        )

        print("\n💰 Estimated Cost")
        print(f" Total: ${estimate.estimated_cost:.4f}")
        print(f" Per sample: ${estimate.assumptions['cost_per_sample']:.6f}")
        print(f" 95% CI: ${estimate.lower_bound:.4f} - ${estimate.upper_bound:.4f}")

        print("\n📊 Breakdown")
        for phase, cost in estimate.breakdown_by_phase.items():
            print(f" {phase.capitalize()}: ${cost:.4f}")

        print("\n" + rule)

        # Nudge the user towards a smaller trial run when the bill looks large.
        if estimate.estimated_cost > 10.0:
            print(
                f"\n⚠️ Warning: Estimated cost is ${estimate.estimated_cost:.2f}. "
                "Consider using --limit for initial testing."
            )

        return 0

    except Exception as e:
        print(f"Error estimating cost: {e}")
        return 1
85
+
86
+
87
def _per_million_prices(model_name: str):
    """Return the (prompt, completion) prices of *model_name* per 1M tokens."""
    pricing = get_provider_pricing(model_name)
    return (
        pricing["prompt_tokens"] * 1_000_000,
        pricing["completion_tokens"] * 1_000_000,
    )


def _print_all_models() -> None:
    """Print every model from ``get_all_models`` with its per-1M-token prices."""
    models = get_all_models()
    print("=" * 80)
    print(f"Available Models ({len(models)} total)")
    print("=" * 80)
    print("\nModel pricing (per 1M tokens):\n")

    for model_name in sorted(models):
        prompt_price, completion_price = _per_million_prices(model_name)
        print(
            f" {model_name:40s} | "
            f"Prompt: ${prompt_price:6.2f} | "
            f"Completion: ${completion_price:6.2f}"
        )

    print("\n" + "=" * 80)


def _print_cost_comparison(model_names: list[str]) -> None:
    """Print the cost of a fixed 1000+500 token workload for each model, cheapest first."""
    prompt_tokens = 1000
    completion_tokens = 500

    costs = compare_provider_costs(prompt_tokens, completion_tokens, model_names)

    print("=" * 80)
    print(
        f"Cost Comparison ({prompt_tokens} prompt + {completion_tokens} completion tokens)"
    )
    print("=" * 80)
    print()

    sorted_costs = sorted(costs.items(), key=lambda item: item[1])

    for model_name, cost in sorted_costs:
        prompt_price, completion_price = _per_million_prices(model_name)
        print(f" {model_name:40s} | ${cost:.6f}")
        print(
            f" {'':40s} | (${prompt_price:.2f} / ${completion_price:.2f} per 1M)"
        )

    # A relative comparison needs a baseline plus at least one other model.
    if len(sorted_costs) > 1:
        baseline_name, baseline_cost = sorted_costs[0]
        print(f"\nRelative costs (vs {baseline_name}):")
        for model_name, cost in sorted_costs[1:]:
            if baseline_cost > 0:
                multiplier = cost / baseline_cost
                print(f" {model_name:40s} | {multiplier:.1f}x more expensive")
            else:
                # A ratio against a zero-cost baseline is undefined; printing
                # "0.0x more expensive" would be misleading.
                print(f" {model_name:40s} | n/a (baseline cost is zero)")

    print("\n" + "=" * 80)


def _print_model_pricing(model: str) -> None:
    """Print the per-1M-token prices of *model* and three example query costs."""
    prompt_price, completion_price = _per_million_prices(model)

    print("=" * 80)
    print(f"Pricing for {model}")
    print("=" * 80)
    print(f"\nPrompt tokens: ${prompt_price:.2f} per 1M tokens")
    print(f"Completion tokens: ${completion_price:.2f} per 1M tokens")

    print("\nExample costs:")
    examples = [
        (100, 50, "Short query"),
        (500, 300, "Medium query"),
        (1000, 500, "Long query"),
    ]

    # Imported once here rather than on every loop iteration.
    from themis.experiment.pricing import calculate_cost

    for prompt_tok, completion_tok, label in examples:
        cost = calculate_cost(model, prompt_tok, completion_tok)
        print(
            f" {label:15s} ({prompt_tok:4d} + {completion_tok:4d} tokens): ${cost:.6f}"
        )

    print("\n" + "=" * 80)


def show_pricing_command(
    *,
    model: Annotated[
        str | None, Parameter(help="Show pricing for specific model")
    ] = None,
    list_all: Annotated[bool, Parameter(help="List all available models")] = False,
    compare_models: Annotated[
        list[str] | None, Parameter(help="Compare costs for multiple models")
    ] = None,
) -> int:
    """Show pricing information for LLM models.

    Exactly one mode runs per invocation, checked in this order: --list-all,
    then --compare-models, then --model.

    Returns:
        0 when a mode ran successfully; 1 when no mode was selected or when
        the selected mode raised (the error message is printed).

    Examples:
        # Show pricing for a specific model
        uv run python -m themis.cli show-pricing --model gpt-4

        # List all models with pricing
        uv run python -m themis.cli show-pricing --list-all

        # Compare pricing across models (use repeated --compare-models flags)
        uv run python -m themis.cli show-pricing \\
            --compare-models gpt-4 \\
            --compare-models gpt-3.5-turbo \\
            --compare-models claude-3-haiku-20240307
    """
    try:
        if list_all:
            _print_all_models()
            return 0

        if compare_models:
            _print_cost_comparison(compare_models)
            return 0

        if model:
            _print_model_pricing(model)
            return 0

    except Exception as e:
        print(f"Error: {e}")
        return 1

    # No options provided
    print("Error: Must specify --model, --list-all, or --compare-models")
    print("Use --help for usage information")
    return 1
212
+
213
+
214
# Public API of this module: the two CLI command functions it defines.
+ __all__ = ["estimate_cost_command", "show_pricing_command"]
@@ -0,0 +1,68 @@
1
+ """Demo command implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ from cyclopts import Parameter
9
+
10
+ from themis.cli.utils import effective_total, export_outputs
11
+ from themis.experiment import math as math_experiment
12
+ from themis.utils.logging_utils import configure_logging
13
+ from themis.utils.progress import ProgressReporter
14
+
15
+
16
def demo_command(
    *,
    max_samples: Annotated[
        int | None, Parameter(help="Limit number of demo samples")
    ] = None,
    log_level: Annotated[
        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
    ] = "info",
    csv_output: Annotated[
        Path | None, Parameter(help="Write CSV export to this path")
    ] = None,
    html_output: Annotated[
        Path | None, Parameter(help="Write HTML summary to this path")
    ] = None,
    json_output: Annotated[
        Path | None, Parameter(help="Write JSON export to this path")
    ] = None,
) -> int:
    """Run the built-in demo dataset.

    Configures logging, runs the zero-shot math experiment over two
    hard-coded sample problems while reporting progress, prints the report
    summary and hands the report to ``export_outputs`` together with the
    requested output paths. Always returns 0.
    """
    configure_logging(log_level)

    # The two inline records that make up the demo dataset.
    polar_sample = {
        "unique_id": "demo-1",
        "problem": "Convert the point (0,3) in rectangular coordinates to polar coordinates.",
        "answer": "\\left( 3, \\frac{\\pi}{2} \\right)",
        "subject": "precalculus",
        "level": 2,
    }
    arithmetic_sample = {
        "unique_id": "demo-2",
        "problem": "What is 7 + 5?",
        "answer": "12",
        "subject": "arithmetic",
        "level": 1,
    }
    samples = [polar_sample, arithmetic_sample]

    runner = math_experiment.build_math500_zero_shot_experiment()
    expected_count = effective_total(len(samples), max_samples)

    with ProgressReporter(total=expected_count, description="Generating") as reporter:
        outcome = runner.run(
            samples,
            max_samples=max_samples,
            on_result=reporter.on_result,
        )

    summary_text = math_experiment.summarize_report(outcome)
    print(summary_text)

    export_outputs(
        outcome,
        csv_output=csv_output,
        html_output=html_output,
        json_output=json_output,
        title="Demo experiment",
    )
    return 0
@@ -0,0 +1,90 @@
1
+ """System information and listing commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Annotated
8
+
9
+ from cyclopts import Parameter
10
+
11
+ from themis.providers.registry import _REGISTRY
12
+
13
+
14
def show_info() -> int:
    """Show system information and installed components.

    Prints the package version, Python version and platform, the provider
    names registered in the provider registry, a fixed list of benchmark
    names, the example directories found next to the installed package (if
    any), and pointers to documentation and quick-start commands.

    Returns:
        Always 0.
    """
    import themis
    from themis import _version

    print("Themis Information")
    print("=" * 60)
    print(f"Version: {getattr(_version, '__version__', 'unknown')}")
    print(f"Python: {sys.version.split()[0]}")
    print(f"Platform: {sys.platform}")

    print("\n📦 Installed Providers:")
    # NOTE(review): this reads the registry's private ``_factories`` mapping;
    # switch to a public accessor if the registry offers one.
    for provider in sorted(_REGISTRY._factories):
        print(f" ✓ {provider}")

    print("\n📊 Available Benchmarks:")
    benchmarks = [
        "demo",
        "math500",
        "aime24",
        "aime25",
        "amc23",
        "olympiadbench",
        "beyondaime",
        "supergpqa",
        "mmlu-pro",
        "inline (via config)",
    ]
    for bench in benchmarks:
        print(f" ✓ {bench}")

    print("\n📁 Example Locations:")
    # Examples are looked up in an "examples" directory beside the package
    # directory; nothing is listed when that directory does not exist.
    examples_dir = Path(themis.__file__).parent.parent / "examples"
    if examples_dir.exists():
        print(f" {examples_dir}")
        example_dirs = sorted(
            d.name
            for d in examples_dir.iterdir()
            if d.is_dir() and not d.name.startswith("_")
        )
        for ex in example_dirs:
            print(f" • {ex}/")

    print("\n📚 Documentation:")
    print(" examples/README.md - Comprehensive tutorial cookbook")
    print(" COOKBOOK.md - Quick reference guide")
    print(" docs/ - Detailed documentation")

    print("\n🚀 Quick Start:")
    print(" uv run python -m themis.cli demo")
    print(" uv run python -m themis.cli list-providers")
    print(" uv run python -m themis.cli list-benchmarks")

    return 0
71
+
72
+
73
def new_project(
    *,
    project_name: Annotated[str, Parameter(help="The name of the new project")],
    project_path: Annotated[
        Path,
        Parameter(help="The path where the new project will be created"),
    ] = Path("."),
) -> int:
    """Create a new Themis project.

    Delegates to ``create_project`` and reports the outcome on stdout.
    Returns 0 on success and 1 when ``create_project`` raises
    ``FileExistsError``; any other exception propagates to the caller.
    """
    from themis.cli.new_project import create_project

    try:
        create_project(project_name, project_path)
    except FileExistsError as exc:
        print(f"Error: {exc}")
        return 1
    else:
        print(f"Successfully created new project '{project_name}' in {project_path}")
        return 0