themis_eval-0.1.0-py3-none-any.whl → themis_eval-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,299 @@
+"""Visualization commands for interactive charts."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Annotated
+
+from cyclopts import Parameter
+
+from themis.experiment.comparison import compare_experiments
+from themis.experiment.visualization import (
+    PLOTLY_AVAILABLE,
+    InteractiveVisualizer,
+    export_interactive_html,
+)
+
+
+def visualize_comparison_command(
+    *,
+    run_ids: Annotated[list[str], Parameter(help="Run IDs to visualize")],
+    storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
+    metric: Annotated[str | None, Parameter(help="Metric to visualize")] = None,
+    output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
+        "visualization.html"
+    ),
+    chart_type: Annotated[
+        str,
+        Parameter(help="Chart type: comparison, evolution, dashboard"),
+    ] = "comparison",
+) -> int:
+    """Generate an interactive visualization for experiments.
+
+    Examples:
+        # Bar chart comparing accuracy across runs
+        uv run python -m themis.cli visualize \\
+            --run-ids run-1 run-2 run-3 \\
+            --metric accuracy \\
+            --output accuracy_comparison.html
+
+        # Evolution chart showing a metric over time
+        uv run python -m themis.cli visualize \\
+            --run-ids run-1 run-2 run-3 run-4 \\
+            --metric accuracy \\
+            --chart-type evolution \\
+            --output accuracy_evolution.html
+
+        # Dashboard with multiple metrics
+        uv run python -m themis.cli visualize \\
+            --run-ids run-1 run-2 run-3 \\
+            --chart-type dashboard \\
+            --output dashboard.html
+
+        # Pareto frontier (separate command; see visualize-pareto)
+        uv run python -m themis.cli visualize-pareto \\
+            --run-ids run-1 run-2 run-3 \\
+            --metric1 accuracy \\
+            --metric2 cost \\
+            --output pareto.html
+    """
+    if not PLOTLY_AVAILABLE:
+        print("Error: Plotly is not installed.")
+        print("Install with: pip install plotly")
+        return 1
+
+    try:
+        # Load experiments
+        print(f"Loading experiments from {storage}...")
+        comparison = compare_experiments(
+            run_ids=run_ids,
+            storage_dir=storage,
+            include_metadata=True,
+        )
+
+        print(f"✓ Loaded {len(comparison.experiments)} experiments")
+
+        # Create visualizer
+        visualizer = InteractiveVisualizer()
+
+        # Generate chart based on type
+        if chart_type == "comparison":
+            if not metric:
+                metric = comparison.metrics[0] if comparison.metrics else "accuracy"
+                print(f"Using default metric: {metric}")
+
+            print(f"Creating comparison chart for '{metric}'...")
+            fig = visualizer.plot_metric_comparison(comparison, metric)
+
+        elif chart_type == "evolution":
+            if not metric:
+                metric = comparison.metrics[0] if comparison.metrics else "accuracy"
+                print(f"Using default metric: {metric}")
+
+            print(f"Creating evolution chart for '{metric}'...")
+            fig = visualizer.plot_metric_evolution(comparison, metric)
+
+        elif chart_type == "dashboard":
+            print("Creating dashboard with multiple metrics...")
+            fig = visualizer.create_dashboard(comparison)
+
+        else:
+            print(f"Error: Unknown chart type '{chart_type}'")
+            print("Available: comparison, evolution, dashboard")
+            return 1
+
+        # Export to HTML
+        export_interactive_html(fig, output)
+        print(f"\n✓ Visualization saved to {output}")
+        print("  Open in browser to interact with chart")
+
+        return 0
+
+    except Exception as e:
+        print(f"Error: {e}")
+        return 1
+
+
|
|
118
|
+
*,
|
|
119
|
+
run_ids: Annotated[list[str], Parameter(help="Run IDs to visualize")],
|
|
120
|
+
storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
|
|
121
|
+
metric1: Annotated[str, Parameter(help="First metric (x-axis)")],
|
|
122
|
+
metric2: Annotated[str, Parameter(help="Second metric (y-axis)")],
|
|
123
|
+
maximize1: Annotated[bool, Parameter(help="Maximize metric1")] = True,
|
|
124
|
+
maximize2: Annotated[bool, Parameter(help="Maximize metric2")] = True,
|
|
125
|
+
output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
|
|
126
|
+
"pareto.html"
|
|
127
|
+
),
|
|
128
|
+
) -> int:
|
|
129
|
+
"""Generate Pareto frontier visualization.
|
|
130
|
+
|
|
131
|
+
Examples:
|
|
132
|
+
# Maximize accuracy, minimize cost
|
|
133
|
+
uv run python -m themis.cli visualize-pareto \\
|
|
134
|
+
--run-ids run-1 run-2 run-3 run-4 \\
|
|
135
|
+
--metric1 accuracy \\
|
|
136
|
+
--metric2 cost \\
|
|
137
|
+
--maximize1 true \\
|
|
138
|
+
--maximize2 false \\
|
|
139
|
+
--output pareto.html
|
|
140
|
+
"""
|
|
141
|
+
if not PLOTLY_AVAILABLE:
|
|
142
|
+
print("Error: Plotly is not installed.")
|
|
143
|
+
print("Install with: pip install plotly")
|
|
144
|
+
return 1
|
|
145
|
+
|
|
146
|
+
try:
|
|
147
|
+
# Load experiments
|
|
148
|
+
print(f"Loading experiments from {storage}...")
|
|
149
|
+
comparison = compare_experiments(
|
|
150
|
+
run_ids=run_ids,
|
|
151
|
+
storage_dir=storage,
|
|
152
|
+
include_metadata=True,
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
print(f"✓ Loaded {len(comparison.experiments)} experiments")
|
|
156
|
+
|
|
157
|
+
# Compute Pareto frontier
|
|
158
|
+
print(f"Computing Pareto frontier for {metric1} and {metric2}...")
|
|
159
|
+
pareto_ids = comparison.pareto_frontier(
|
|
160
|
+
objectives=[metric1, metric2],
|
|
161
|
+
maximize=[maximize1, maximize2],
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
print(f"✓ Found {len(pareto_ids)} Pareto-optimal experiments:")
|
|
165
|
+
for run_id in pareto_ids:
|
|
166
|
+
print(f" - {run_id}")
|
|
167
|
+
|
|
168
|
+
# Create visualization
|
|
169
|
+
visualizer = InteractiveVisualizer()
|
|
170
|
+
fig = visualizer.plot_pareto_frontier(
|
|
171
|
+
comparison, metric1, metric2, pareto_ids, maximize1, maximize2
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
# Export to HTML
|
|
175
|
+
export_interactive_html(fig, output)
|
|
176
|
+
print(f"\n✓ Visualization saved to {output}")
|
|
177
|
+
print(" Red points are Pareto-optimal")
|
|
178
|
+
print(" Blue points are dominated")
|
|
179
|
+
|
|
180
|
+
return 0
|
|
181
|
+
|
|
182
|
+
except Exception as e:
|
|
183
|
+
print(f"Error: {e}")
|
|
184
|
+
return 1
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
+def visualize_distribution_command(
+    *,
+    run_id: Annotated[str, Parameter(help="Run ID to visualize")],
+    storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
+    metric: Annotated[str, Parameter(help="Metric to visualize")],
+    plot_type: Annotated[
+        str, Parameter(help="Plot type: histogram, box, violin")
+    ] = "histogram",
+    output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
+        "distribution.html"
+    ),
+) -> int:
+    """Generate metric distribution visualization.
+
+    Shows the distribution of a metric across all samples in an experiment.
+
+    Examples:
+        # Histogram of accuracy scores
+        uv run python -m themis.cli visualize-distribution \\
+            --run-id my-run \\
+            --metric accuracy \\
+            --output accuracy_dist.html
+
+        # Violin plot
+        uv run python -m themis.cli visualize-distribution \\
+            --run-id my-run \\
+            --metric accuracy \\
+            --plot-type violin \\
+            --output accuracy_violin.html
+    """
+    if not PLOTLY_AVAILABLE:
+        print("Error: Plotly is not installed.")
+        print("Install with: pip install plotly")
+        return 1
+
+    try:
+        import json
+
+        # Load report
+        print(f"Loading report from {storage / run_id}...")
+        report_path = storage / run_id / "report.json"
+
+        if not report_path.exists():
+            print(f"Error: Report not found at {report_path}")
+            return 1
+
+        with report_path.open("r", encoding="utf-8") as f:
+            report_data = json.load(f)
+
+        # Extract evaluation report
+        # Note: This is simplified - in production you'd deserialize properly
+        from themis.core.entities import EvaluationRecord, MetricScore
+        from themis.evaluation.reports import EvaluationReport, MetricAggregate
+
+        # Build evaluation report from JSON
+        records = []
+        for sample_data in report_data.get("samples", []):
+            scores = [
+                MetricScore(
+                    metric_name=score["metric"],
+                    value=score["value"],
+                    details=score.get("details"),
+                    metadata=score.get("metadata", {}),
+                )
+                for score in sample_data["scores"]
+            ]
+            records.append(
+                EvaluationRecord(
+                    sample_id=sample_data["sample_id"],
+                    scores=scores,
+                    failures=[],
+                )
+            )
+
+        # Build metric aggregates
+        metrics = {}
+        for metric_data in report_data.get("metrics", []):
+            metrics[metric_data["name"]] = MetricAggregate(
+                count=metric_data["count"],
+                mean=metric_data["mean"],
+            )
+
+        eval_report = EvaluationReport(
+            records=records,
+            metrics=metrics,
+            failures=[],
+        )
+
+        print(f"✓ Loaded report with {len(records)} samples")
+
+        # Create visualization
+        visualizer = InteractiveVisualizer()
+        fig = visualizer.plot_metric_distribution(eval_report, metric, plot_type)
+
+        # Export to HTML
+        export_interactive_html(fig, output)
+        print(f"\n✓ Visualization saved to {output}")
+
+        return 0
+
+    except Exception as e:
+        print(f"Error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+__all__ = [
+    "visualize_comparison_command",
+    "visualize_pareto_command",
+    "visualize_distribution_command",
+]
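
For readers wiring these commands into scripts, here is a minimal sketch of calling them from Python rather than through the CLI shown in the docstrings. It assumes the 0.2.0 wheel and plotly are installed, and the run IDs are placeholders for runs recorded under .cache/runs:

from pathlib import Path

from themis.cli.commands.visualize import (
    visualize_comparison_command,
    visualize_pareto_command,
)

# Placeholder run IDs; substitute runs that exist under .cache/runs.
runs = ["run-1", "run-2", "run-3"]

# Bar chart comparing accuracy across runs; the command returns 0 on success.
code = visualize_comparison_command(
    run_ids=runs,
    metric="accuracy",
    output=Path("accuracy_comparison.html"),
)

# Pareto frontier trading accuracy (maximize) against cost (minimize).
if code == 0:
    code = visualize_pareto_command(
        run_ids=runs,
        metric1="accuracy",
        metric2="cost",
        maximize2=False,
        output=Path("pareto.html"),
    )

All parameters are keyword-only, matching the cyclopts signatures above, so the calls mirror the CLI flags one-to-one.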