themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +93 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +164 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +288 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +129 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +690 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +373 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +255 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +61 -0
- themis/integrations/wandb.py +65 -0
- themis/interfaces/__init__.py +83 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
- themis_eval-0.1.1.dist-info/RECORD +134 -0
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0

themis/cli/commands/visualize.py
ADDED
@@ -0,0 +1,299 @@
"""Visualization commands for interactive charts."""

from __future__ import annotations

from pathlib import Path
from typing import Annotated

from cyclopts import Parameter

from themis.experiment.comparison import compare_experiments
from themis.experiment.visualization import (
    PLOTLY_AVAILABLE,
    InteractiveVisualizer,
    export_interactive_html,
)


def visualize_comparison_command(
    *,
    run_ids: Annotated[list[str], Parameter(help="Run IDs to visualize")],
    storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
    metric: Annotated[str | None, Parameter(help="Metric to visualize")] = None,
    output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
        "visualization.html"
    ),
    chart_type: Annotated[
        str,
        Parameter(help="Chart type: comparison, evolution, dashboard, pareto"),
    ] = "comparison",
) -> int:
    """Generate interactive visualization for experiments.

    Examples:
        # Bar chart comparing accuracy across runs
        uv run python -m themis.cli visualize \\
            --run-ids run-1 run-2 run-3 \\
            --metric accuracy \\
            --output accuracy_comparison.html

        # Evolution chart showing metric over time
        uv run python -m themis.cli visualize \\
            --run-ids run-1 run-2 run-3 run-4 \\
            --metric accuracy \\
            --chart-type evolution \\
            --output accuracy_evolution.html

        # Dashboard with multiple metrics
        uv run python -m themis.cli visualize \\
            --run-ids run-1 run-2 run-3 \\
            --chart-type dashboard \\
            --output dashboard.html

        # Pareto frontier (requires --pareto-metrics and --maximize)
        uv run python -m themis.cli visualize-pareto \\
            --run-ids run-1 run-2 run-3 \\
            --metric1 accuracy \\
            --metric2 cost \\
            --output pareto.html
    """
    if not PLOTLY_AVAILABLE:
        print("Error: Plotly is not installed.")
        print("Install with: pip install plotly")
        return 1

    try:
        # Load experiments
        print(f"Loading experiments from {storage}...")
        comparison = compare_experiments(
            run_ids=run_ids,
            storage_dir=storage,
            include_metadata=True,
        )

        print(f"✓ Loaded {len(comparison.experiments)} experiments")

        # Create visualizer
        visualizer = InteractiveVisualizer()

        # Generate chart based on type
        if chart_type == "comparison":
            if not metric:
                metric = comparison.metrics[0] if comparison.metrics else "accuracy"
                print(f"Using default metric: {metric}")

            print(f"Creating comparison chart for '{metric}'...")
            fig = visualizer.plot_metric_comparison(comparison, metric)

        elif chart_type == "evolution":
            if not metric:
                metric = comparison.metrics[0] if comparison.metrics else "accuracy"
                print(f"Using default metric: {metric}")

            print(f"Creating evolution chart for '{metric}'...")
            fig = visualizer.plot_metric_evolution(comparison, metric)

        elif chart_type == "dashboard":
            print("Creating dashboard with multiple metrics...")
            fig = visualizer.create_dashboard(comparison)

        else:
            print(f"Error: Unknown chart type '{chart_type}'")
            print("Available: comparison, evolution, dashboard")
            return 1

        # Export to HTML
        export_interactive_html(fig, output)
        print(f"\n✓ Visualization saved to {output}")
        print(" Open in browser to interact with chart")

        return 0

    except Exception as e:
        print(f"Error: {e}")
        return 1


def visualize_pareto_command(
    *,
    run_ids: Annotated[list[str], Parameter(help="Run IDs to visualize")],
    storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
    metric1: Annotated[str, Parameter(help="First metric (x-axis)")],
    metric2: Annotated[str, Parameter(help="Second metric (y-axis)")],
    maximize1: Annotated[bool, Parameter(help="Maximize metric1")] = True,
    maximize2: Annotated[bool, Parameter(help="Maximize metric2")] = True,
    output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
        "pareto.html"
    ),
) -> int:
    """Generate Pareto frontier visualization.

    Examples:
        # Maximize accuracy, minimize cost
        uv run python -m themis.cli visualize-pareto \\
            --run-ids run-1 run-2 run-3 run-4 \\
            --metric1 accuracy \\
            --metric2 cost \\
            --maximize1 true \\
            --maximize2 false \\
            --output pareto.html
    """
    if not PLOTLY_AVAILABLE:
        print("Error: Plotly is not installed.")
        print("Install with: pip install plotly")
        return 1

    try:
        # Load experiments
        print(f"Loading experiments from {storage}...")
        comparison = compare_experiments(
            run_ids=run_ids,
            storage_dir=storage,
            include_metadata=True,
        )

        print(f"✓ Loaded {len(comparison.experiments)} experiments")

        # Compute Pareto frontier
        print(f"Computing Pareto frontier for {metric1} and {metric2}...")
        pareto_ids = comparison.pareto_frontier(
            objectives=[metric1, metric2],
            maximize=[maximize1, maximize2],
        )

        print(f"✓ Found {len(pareto_ids)} Pareto-optimal experiments:")
        for run_id in pareto_ids:
            print(f" - {run_id}")

        # Create visualization
        visualizer = InteractiveVisualizer()
        fig = visualizer.plot_pareto_frontier(
            comparison, metric1, metric2, pareto_ids, maximize1, maximize2
        )

        # Export to HTML
        export_interactive_html(fig, output)
        print(f"\n✓ Visualization saved to {output}")
        print(" Red points are Pareto-optimal")
        print(" Blue points are dominated")

        return 0

    except Exception as e:
        print(f"Error: {e}")
        return 1


def visualize_distribution_command(
    *,
    run_id: Annotated[str, Parameter(help="Run ID to visualize")],
    storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
    metric: Annotated[str, Parameter(help="Metric to visualize")],
    plot_type: Annotated[
        str, Parameter(help="Plot type: histogram, box, violin")
    ] = "histogram",
    output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
        "distribution.html"
    ),
) -> int:
    """Generate metric distribution visualization.

    Shows the distribution of a metric across all samples in an experiment.

    Examples:
        # Histogram of accuracy scores
        uv run python -m themis.cli visualize-distribution \\
            --run-id my-run \\
            --metric accuracy \\
            --output accuracy_dist.html

        # Violin plot
        uv run python -m themis.cli visualize-distribution \\
            --run-id my-run \\
            --metric accuracy \\
            --plot-type violin \\
            --output accuracy_violin.html
    """
    if not PLOTLY_AVAILABLE:
        print("Error: Plotly is not installed.")
        print("Install with: pip install plotly")
        return 1

    try:
        import json

        # Load report
        print(f"Loading report from {storage / run_id}...")
        report_path = storage / run_id / "report.json"

        if not report_path.exists():
            print(f"Error: Report not found at {report_path}")
            return 1

        with report_path.open("r", encoding="utf-8") as f:
            report_data = json.load(f)

        # Extract evaluation report
        # Note: This is simplified - in production you'd deserialize properly
        from themis.core.entities import EvaluationRecord, MetricScore
        from themis.evaluation.reports import EvaluationReport, MetricAggregate

        # Build evaluation report from JSON
        records = []
        for sample_data in report_data.get("samples", []):
            scores = [
                MetricScore(
                    metric_name=score["metric"],
                    value=score["value"],
                    details=score.get("details"),
                    metadata=score.get("metadata", {}),
                )
                for score in sample_data["scores"]
            ]
            records.append(
                EvaluationRecord(
                    sample_id=sample_data["sample_id"],
                    scores=scores,
                    failures=[],
                )
            )

        # Build metric aggregates
        metrics = {}
        for metric_data in report_data.get("metrics", []):
            metrics[metric_data["name"]] = MetricAggregate(
                count=metric_data["count"],
                mean=metric_data["mean"],
            )

        eval_report = EvaluationReport(
            records=records,
            metrics=metrics,
            failures=[],
        )

        print(f"✓ Loaded report with {len(records)} samples")

        # Create visualization
        visualizer = InteractiveVisualizer()
        fig = visualizer.plot_metric_distribution(eval_report, metric, plot_type)

        # Export to HTML
        export_interactive_html(fig, output)
        print(f"\n✓ Visualization saved to {output}")

        return 0

    except Exception as e:
        print(f"Error: {e}")
        import traceback

        traceback.print_exc()
        return 1


__all__ = [
    "visualize_comparison_command",
    "visualize_pareto_command",
    "visualize_distribution_command",
]
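
The three commands above are plain functions with keyword-only parameters, so they can also be driven directly from Python (for example in tests) instead of through the CLI. A minimal sketch, assuming plotly is installed and the listed runs already exist under .cache/runs (the run IDs and output path are hypothetical):

from pathlib import Path

from themis.cli.commands.visualize import visualize_comparison_command

# Hypothetical runs; any completed experiment IDs stored under .cache/runs would do.
exit_code = visualize_comparison_command(
    run_ids=["run-1", "run-2"],
    storage=Path(".cache/runs"),
    metric="accuracy",
    output=Path("accuracy_comparison.html"),
    chart_type="comparison",
)
print("wrote chart" if exit_code == 0 else "visualization failed")
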
themis/cli/main.py
ADDED
@@ -0,0 +1,93 @@
"""Cyclopts-powered CLI entrypoints for Themis."""

from __future__ import annotations

from typing import Sequence

from cyclopts import App

# Import command modules
from themis.cli.commands import (
    benchmarks,
    comparison,
    config_commands,
    cost,
    demo,
    info,
    leaderboard,
    sample_run,
    visualize,
)
from themis.cli.commands import math_benchmarks as math_cmds
from themis.cli.commands import mcq_benchmarks as mcq_cmds

# Import provider modules to ensure they register themselves
try:
    from themis.generation import clients  # noqa: F401 - registers fake provider
    from themis.generation.providers import (
        litellm_provider,  # noqa: F401
        vllm_provider,  # noqa: F401
    )
except ImportError:
    pass  # Some providers may not be available

app = App(help="Run Themis experiments from the command line")

# Register demo command
app.command(name="demo")(demo.demo_command)

# Register math benchmark commands
app.command(name="math500")(math_cmds.math500_command)
app.command(name="aime24")(math_cmds.aime24_command)
app.command(name="aime25")(math_cmds.aime25_command)
app.command(name="amc23")(math_cmds.amc23_command)
app.command(name="olympiadbench")(math_cmds.olympiadbench_command)
app.command(name="beyondaime")(math_cmds.beyond_aime_command)

# Register MCQ benchmark commands
app.command(name="supergpqa")(mcq_cmds.supergpqa_command)
app.command(name="mmlu-pro")(mcq_cmds.mmlu_pro_command)

# Register config commands
app.command(name="run-config")(config_commands.run_configured_experiment)
app.command(name="validate-config")(config_commands.validate_config)
app.command(name="init")(config_commands.init_config)

# Register info and listing commands
app.command(name="list-providers")(benchmarks.list_providers)
app.command(name="list-benchmarks")(benchmarks.list_benchmarks)
app.command(name="info")(info.show_info)
app.command(name="new-project")(info.new_project)

# Register comparison commands
app.command(name="compare")(comparison.compare_command)
app.command(name="diff")(comparison.diff_command)
app.command(name="pareto")(comparison.pareto_command)

# Register cost commands
app.command(name="estimate-cost")(cost.estimate_cost_command)
app.command(name="show-pricing")(cost.show_pricing_command)

# Register visualization commands
app.command(name="visualize")(visualize.visualize_comparison_command)
app.command(name="visualize-pareto")(visualize.visualize_pareto_command)
app.command(name="visualize-distribution")(visualize.visualize_distribution_command)

# Register leaderboard command
app.command(name="leaderboard")(leaderboard.leaderboard_command)

# Register sample-run command
app.command(name="sample-run")(sample_run.sample_run_command)


def main(argv: Sequence[str] | None = None) -> int:
    parsed_argv = list(argv) if argv is not None else None
    try:
        result = app(parsed_argv)
    except SystemExit as exc:  # pragma: no cover - CLI integration path
        return int(exc.code or 0)
    return int(result) if isinstance(result, int) else 0


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
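
Because main() takes an explicit argument sequence and returns an int, the whole CLI can be exercised in-process without spawning a subprocess. A minimal sketch, assuming the package is installed (the chosen subcommand is just one of the names registered above):

from themis.cli.main import main

# Equivalent to `python -m themis.cli list-benchmarks` on the command line.
exit_code = main(["list-benchmarks"])
print(f"CLI exited with code {exit_code}")
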
themis/cli/new_project.py
ADDED
@@ -0,0 +1,33 @@
from __future__ import annotations

from pathlib import Path


def create_project(project_name: str, project_path: Path) -> None:
    if (project_path / project_name).exists():
        raise FileExistsError(
            f"Project '{project_name}' already exists in {project_path}"
        )

    project_dir = project_path / project_name
    project_dir.mkdir()

    templates_dir = Path(__file__).parent / "templates"

    # Create config.sample.json
    with open(templates_dir / "config.sample.json.tpl", "r") as f:
        config_template = f.read()
    with open(project_dir / "config.sample.json", "w") as f:
        f.write(config_template.replace("{{project_name}}", project_name))

    # Create cli.py
    with open(templates_dir / "cli.py.tpl", "r") as f:
        cli_template = f.read()
    with open(project_dir / "cli.py", "w") as f:
        f.write(cli_template)

    # Create README.md
    with open(templates_dir / "README.md.tpl", "r") as f:
        readme_template = f.read()
    with open(project_dir / "README.md", "w") as f:
        f.write(readme_template.replace("{{project_name}}", project_name))
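
create_project fills in {{project_name}} placeholders from *.tpl files in a templates/ directory shipped next to the module and refuses to overwrite an existing directory. A minimal usage sketch (the project name and target location are hypothetical):

from pathlib import Path

from themis.cli.new_project import create_project

try:
    create_project("my-eval-project", Path.cwd())  # hypothetical name and location
except FileExistsError as exc:
    print(exc)  # an existing scaffold is never overwritten silently
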
themis/cli/utils.py
ADDED
@@ -0,0 +1,51 @@
"""CLI utility functions."""

from __future__ import annotations

from pathlib import Path

from themis.experiment import export as experiment_export
from themis.experiment import orchestrator


def export_outputs(
    report: orchestrator.ExperimentReport,
    *,
    csv_output: Path | None,
    html_output: Path | None,
    json_output: Path | None,
    title: str,
) -> None:
    """Export experiment report to various formats.

    Args:
        report: Experiment report to export
        csv_output: Optional path for CSV export
        html_output: Optional path for HTML export
        json_output: Optional path for JSON export
        title: Title for the report
    """
    outputs = experiment_export.export_report_bundle(
        report,
        csv_path=csv_output,
        html_path=html_output,
        json_path=json_output,
        title=title,
    )
    for kind, output_path in outputs.items():
        print(f"Exported {kind.upper()} to {output_path}")


def effective_total(total: int, limit: int | None) -> int:
    """Calculate effective total based on limit.

    Args:
        total: Total number of items
        limit: Optional limit

    Returns:
        Effective total (min of total and limit)
    """
    if limit is None:
        return total
    return min(total, limit)
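
effective_total is a small guard, presumably used where a --limit flag caps how many samples a command processes; it never exceeds the real total. For example:

from themis.cli.utils import effective_total

print(effective_total(1000, None))  # 1000 - no limit, keep the full total
print(effective_total(1000, 50))    # 50   - the limit wins when it is smaller
print(effective_total(10, 50))      # 10   - a limit larger than the total is ignored
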
themis/config/__init__.py
ADDED
@@ -0,0 +1,19 @@
"""Hydra-backed configuration helpers for assembling experiments."""

from __future__ import annotations

from .loader import load_experiment_config
from .runtime import (
    load_dataset_from_config,
    run_experiment_from_config,
    summarize_report_for_config,
)
from .schema import ExperimentConfig

__all__ = [
    "ExperimentConfig",
    "load_dataset_from_config",
    "load_experiment_config",
    "run_experiment_from_config",
    "summarize_report_for_config",
]
themis/config/loader.py
ADDED
@@ -0,0 +1,27 @@
"""Utilities for loading experiment configs via Hydra/OmegaConf."""

from __future__ import annotations

from pathlib import Path
from typing import Iterable

from omegaconf import OmegaConf

from . import schema


def load_experiment_config(
    config_path: Path,
    overrides: Iterable[str] | None = None,
) -> schema.ExperimentConfig:
    """Load and validate an experiment config file with optional overrides."""

    base = OmegaConf.structured(schema.ExperimentConfig)
    file_conf = OmegaConf.load(config_path)
    merged = OmegaConf.merge(base, file_conf)

    if overrides:
        override_conf = OmegaConf.from_dotlist(list(overrides))
        merged = OmegaConf.merge(merged, override_conf)

    return OmegaConf.to_object(merged)
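
The loader layers three OmegaConf sources, with later ones winning: the structured ExperimentConfig schema (defaults and type checking), the config file on disk, and any dot-list overrides. A minimal sketch, where the file path and override key are hypothetical and the real field names come from themis/config/schema.py:

from pathlib import Path

from themis.config.loader import load_experiment_config

# Load a config file, then apply a command-line style override on top of it.
config = load_experiment_config(
    Path("configs/experiment.yaml"),           # hypothetical config file
    overrides=["generation.max_samples=100"],  # hypothetical dot-list override
)
print(type(config).__name__)  # ExperimentConfig
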
themis/config/registry.py
ADDED
@@ -0,0 +1,34 @@
"""Registry for experiment builders."""

from __future__ import annotations

from typing import Callable

from themis.config import schema
from themis.experiment import orchestrator

ExperimentBuilder = Callable[
    [schema.ExperimentConfig], orchestrator.ExperimentOrchestrator
]

_EXPERIMENT_BUILDERS: dict[str, ExperimentBuilder] = {}


def register_experiment_builder(task: str) -> Callable[[ExperimentBuilder], ExperimentBuilder]:
    """Decorator to register an experiment builder for a specific task."""

    def decorator(builder: ExperimentBuilder) -> ExperimentBuilder:
        _EXPERIMENT_BUILDERS[task] = builder
        return builder

    return decorator


def get_experiment_builder(task: str) -> ExperimentBuilder:
    """Get the experiment builder for a specific task."""
    if task not in _EXPERIMENT_BUILDERS:
        raise ValueError(
            f"No experiment builder registered for task '{task}'. "
            f"Available tasks: {', '.join(sorted(_EXPERIMENT_BUILDERS.keys()))}"
        )
    return _EXPERIMENT_BUILDERS[task]
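
The registry is a plain decorator-plus-dict pattern: builder functions register themselves under a task name at import time, and get_experiment_builder raises a ValueError listing the known tasks when the name is unknown. A minimal sketch, where the "math" task name and the stub builder are hypothetical:

from themis.config import schema
from themis.config.registry import get_experiment_builder, register_experiment_builder
from themis.experiment import orchestrator


@register_experiment_builder("math")
def build_math_experiment(config: schema.ExperimentConfig) -> orchestrator.ExperimentOrchestrator:
    # A real builder would assemble dataset, generation, and evaluation from `config`.
    raise NotImplementedError

print(get_experiment_builder("math") is build_math_experiment)  # True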