themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/cli/commands/comparison.py
@@ -0,0 +1,394 @@
"""Multi-experiment comparison commands."""

from __future__ import annotations

from pathlib import Path
from typing import Annotated

from cyclopts import Parameter

from themis.experiment.comparison import compare_experiments, diff_configs


def compare_command(
    *,
    run_ids: Annotated[
        list[str],
        Parameter(
            help="Run IDs to compare (comma-separated or multiple --run-ids)",
        ),
    ],
    storage: Annotated[
        Path,
        Parameter(
            help="Storage directory containing experiment results",
        ),
    ] = Path(".cache/runs"),
    metrics: Annotated[
        list[str] | None,
        Parameter(
            help="Metrics to compare (default: all available)",
        ),
    ] = None,
    output: Annotated[
        Path | None,
        Parameter(
            help="Output file path (format inferred from extension: .csv, .md, .json)",
        ),
    ] = None,
    format: Annotated[
        str,
        Parameter(
            help="Output format: csv, markdown, json, latex",
        ),
    ] = "markdown",
    highlight_best: Annotated[
        str | None,
        Parameter(
            help="Metric to highlight best performer (e.g., 'accuracy')",
        ),
    ] = None,
) -> int:
    """Compare multiple experiment runs.

    Automatically includes cost data when available. Costs are tracked
    automatically during experiment runs and displayed in comparisons.

    Examples:
        # Compare three runs with default metrics (includes cost if tracked)
        uv run python -m themis.cli compare \\
            --run-ids run-1 run-2 run-3 \\
            --storage .cache/runs

        # Compare with specific metrics, export to CSV
        uv run python -m themis.cli compare \\
            --run-ids run-1 run-2 run-3 \\
            --metrics accuracy \\
            --output comparison.csv

        # Use 'cost' as a metric for ranking and Pareto analysis
        uv run python -m themis.cli pareto \\
            --run-ids run-1 run-2 run-3 \\
            --objectives accuracy cost \\
            --maximize true false

        # Highlight best accuracy performer
        uv run python -m themis.cli compare \\
            --run-ids run-1 run-2 run-3 \\
            --highlight-best accuracy
    """
    try:
        # Load and compare experiments
        print(f"Loading experiments from {storage}...")
        comparison = compare_experiments(
            run_ids=run_ids,
            storage_dir=storage,
            metrics=metrics,
            include_metadata=True,
        )

        print(f"\n✓ Loaded {len(comparison.experiments)} experiments")
        print(f"  Metrics: {', '.join(comparison.metrics)}\n")

        # Display comparison table
        print("=" * 80)
        print("Experiment Comparison")
        print("=" * 80)

        # Check if any experiment has cost data
        has_cost = any(exp.get_cost() is not None for exp in comparison.experiments)

        # Header
        header_cols = ["Run ID"] + comparison.metrics + ["Samples", "Failures"]
        if has_cost:
            header_cols.append("Cost ($)")
        col_widths = [max(20, len(col)) for col in header_cols]

        header = " | ".join(
            col.ljust(width) for col, width in zip(header_cols, col_widths)
        )
        print(header)
        print("-" * len(header))

        # Rows
        for exp in comparison.experiments:
            row_values = [exp.run_id[:20]]  # Truncate run ID
            for metric in comparison.metrics:
                val = exp.get_metric(metric)
                row_values.append(f"{val:.4f}" if val is not None else "N/A")
            row_values.append(str(exp.sample_count))
            row_values.append(str(exp.failure_count))

            # Add cost if available
            if has_cost:
                cost = exp.get_cost()
                row_values.append(f"{cost:.4f}" if cost is not None else "N/A")

            row = " | ".join(
                val.ljust(width) for val, width in zip(row_values, col_widths)
            )
            print(row)

        print("=" * 80)

        # Highlight best if requested
        if highlight_best:
            if highlight_best in comparison.metrics:
                best = comparison.highlight_best(highlight_best)
                if best:
                    best_value = best.get_metric(highlight_best)
                    print(
                        f"\n⭐ Best {highlight_best}: {best.run_id} ({best_value:.4f})"
                    )
                else:
                    print(f"\n⚠️ No valid values for metric '{highlight_best}'")
            else:
                print(
                    f"\n⚠️ Metric '{highlight_best}' not found. Available: {comparison.metrics}"
                )

        # Export if requested
        if output:
            output = Path(output)
            # Infer format from extension if not specified
            if output.suffix == ".csv":
                comparison.to_csv(output)
                print(f"\n✓ Exported to {output} (CSV)")
            elif output.suffix == ".md":
                comparison.to_markdown(output)
                print(f"\n✓ Exported to {output} (Markdown)")
            elif output.suffix == ".json":
                import json

                output.write_text(
                    json.dumps(comparison.to_dict(), indent=2), encoding="utf-8"
                )
                print(f"\n✓ Exported to {output} (JSON)")
            elif output.suffix == ".tex":
                comparison.to_latex(output, style="booktabs")
                print(f"\n✓ Exported to {output} (LaTeX)")
            else:
                # Use specified format
                if format == "csv":
                    comparison.to_csv(output)
                    print(f"\n✓ Exported to {output} (CSV)")
                elif format == "markdown":
                    comparison.to_markdown(output)
                    print(f"\n✓ Exported to {output} (Markdown)")
                elif format == "json":
                    import json

                    output.write_text(
                        json.dumps(comparison.to_dict(), indent=2), encoding="utf-8"
                    )
                    print(f"\n✓ Exported to {output} (JSON)")
                elif format == "latex":
                    comparison.to_latex(output, style="booktabs")
                    print(f"\n✓ Exported to {output} (LaTeX)")
                else:
                    print(f"\n⚠️ Unknown format: {format}")
                    print("Available formats: csv, markdown, json, latex")
                    return 1

        return 0

    except ValueError as e:
        print(f"Error: {e}")
        return 1
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return 1
    except Exception as e:
        print(f"Unexpected error: {e}")
        import traceback

        traceback.print_exc()
        return 1


def diff_command(
    *,
    run_id_a: Annotated[
        str,
        Parameter(
            help="First run ID",
        ),
    ],
    run_id_b: Annotated[
        str,
        Parameter(
            help="Second run ID",
        ),
    ],
    storage: Annotated[
        Path,
        Parameter(
            help="Storage directory containing experiment results",
        ),
    ] = Path(".cache/runs"),
) -> int:
    """Show configuration differences between two experiment runs.

    Examples:
        # Compare configurations
        uv run python -m themis.cli diff \\
            --run-id-a run-1 \\
            --run-id-b run-2 \\
            --storage .cache/runs
    """
    try:
        diff = diff_configs(run_id_a, run_id_b, storage)

        print("=" * 80)
        print(f"Configuration Diff: {run_id_a} → {run_id_b}")
        print("=" * 80)

        if not diff.has_differences():
            print("\n✓ No differences found - configurations are identical\n")
            return 0

        # Show changed fields
        if diff.changed_fields:
            print("\n📝 Changed Fields:")
            for key, (old, new) in diff.changed_fields.items():
                print(f"\n  {key}:")
                print(f"    - {run_id_a}: {old}")
                print(f"    + {run_id_b}: {new}")

        # Show added fields
        if diff.added_fields:
            print("\n➕ Added Fields (in run_id_b):")
            for key, value in diff.added_fields.items():
                print(f"  {key}: {value}")

        # Show removed fields
        if diff.removed_fields:
            print("\n➖ Removed Fields (from run_id_a):")
            for key, value in diff.removed_fields.items():
                print(f"  {key}: {value}")

        print("\n" + "=" * 80)
        return 0

    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("\nMake sure both run IDs exist and have config.json files.")
        return 1
    except Exception as e:
        print(f"Unexpected error: {e}")
        import traceback

        traceback.print_exc()
        return 1


def pareto_command(
    *,
    run_ids: Annotated[
        list[str],
        Parameter(
            help="Run IDs to analyze",
        ),
    ],
    storage: Annotated[
        Path,
        Parameter(
            help="Storage directory containing experiment results",
        ),
    ] = Path(".cache/runs"),
    objectives: Annotated[
        list[str],
        Parameter(
            help="Metrics to optimize (e.g., accuracy cost)",
        ),
    ],
    maximize: Annotated[
        list[bool] | None,
        Parameter(
            help="Whether to maximize each objective (true/false for each)",
        ),
    ] = None,
) -> int:
    """Find Pareto-optimal experiments across multiple objectives.

    The Pareto frontier consists of experiments where no other experiment
    is better on all objectives simultaneously.

    Examples:
        # Find experiments with best accuracy/cost tradeoff
        # (maximize accuracy, minimize cost)
        uv run python -m themis.cli pareto \\
            --run-ids run-1 run-2 run-3 run-4 \\
            --objectives accuracy cost \\
            --maximize true false

        # Find experiments with best accuracy/latency tradeoff
        uv run python -m themis.cli pareto \\
            --run-ids run-1 run-2 run-3 \\
            --objectives accuracy latency \\
            --maximize true false
    """
    try:
        # Load experiments
        print(f"Loading experiments from {storage}...")
        comparison = compare_experiments(
            run_ids=run_ids,
            storage_dir=storage,
            metrics=objectives,
            include_metadata=True,
        )

        print(f"\n✓ Loaded {len(comparison.experiments)} experiments")
        print(f"  Objectives: {', '.join(objectives)}\n")

        # Compute Pareto frontier
        pareto_ids = comparison.pareto_frontier(objectives, maximize)

        print("=" * 80)
        print("Pareto Frontier Analysis")
        print("=" * 80)

        if not pareto_ids:
            print(
                "\n⚠️ No Pareto-optimal experiments found (all experiments have missing values)\n"
            )
            return 0

        print(f"\n⭐ Found {len(pareto_ids)} Pareto-optimal experiment(s):\n")

        # Show Pareto-optimal experiments
        for run_id in pareto_ids:
            exp = next(e for e in comparison.experiments if e.run_id == run_id)
            print(f"  • {run_id}")
            for obj in objectives:
                val = exp.get_metric(obj)
                print(
                    f"    {obj}: {val:.4f}"
                    if val is not None
                    else f"    {obj}: N/A"
                )

        # Show dominated experiments
        dominated = [
            exp for exp in comparison.experiments if exp.run_id not in pareto_ids
        ]
        if dominated:
            print(f"\n📊 Dominated experiments ({len(dominated)}):")
            for exp in dominated:
                print(f"  • {exp.run_id}")

        print("\n" + "=" * 80)
        return 0

    except ValueError as e:
        print(f"Error: {e}")
        return 1
    except Exception as e:
        print(f"Unexpected error: {e}")
        import traceback

        traceback.print_exc()
        return 1


__all__ = ["compare_command", "diff_command", "pareto_command"]
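
The hunk above calls comparison.pareto_frontier(objectives, maximize) without showing its implementation. For orientation, here is a minimal sketch of the dominance rule the pareto_command docstring describes (keep a run unless some other run is at least as good on every objective and strictly better on at least one). The function name, the dict-of-vectors input shape, and the sample numbers are illustrative assumptions, not the Themis API.

def pareto_frontier(points: dict[str, list[float]], maximize: list[bool]) -> list[str]:
    """Return ids whose metric vectors are not dominated by any other vector."""

    def dominates(a: list[float], b: list[float]) -> bool:
        # a dominates b: at least as good everywhere, strictly better somewhere.
        at_least_as_good = all(
            (x >= y) if up else (x <= y) for x, y, up in zip(a, b, maximize)
        )
        strictly_better = any(
            (x > y) if up else (x < y) for x, y, up in zip(a, b, maximize)
        )
        return at_least_as_good and strictly_better

    return [
        run_id
        for run_id, vec in points.items()
        if not any(
            dominates(other, vec)
            for other_id, other in points.items()
            if other_id != run_id
        )
    ]


# Maximize accuracy, minimize cost: run-3 is dominated by run-2 (lower accuracy,
# higher cost), so the frontier is ["run-1", "run-2"].
frontier = pareto_frontier(
    {"run-1": [0.82, 1.50], "run-2": [0.79, 0.40], "run-3": [0.78, 0.90]},
    maximize=[True, False],
)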

themis/cli/commands/config_commands.py
@@ -0,0 +1,244 @@
"""Configuration-related commands."""

from __future__ import annotations

from pathlib import Path
from typing import Annotated, Literal

from cyclopts import Parameter

from themis.cli.utils import effective_total, export_outputs
from themis.config import (
    load_dataset_from_config,
    load_experiment_config,
    run_experiment_from_config,
    summarize_report_for_config,
)
from themis.utils.logging_utils import configure_logging
from themis.utils.progress import ProgressReporter


def run_configured_experiment(
    *,
    config: Annotated[
        Path, Parameter(help="Path to a Hydra/OmegaConf experiment config file")
    ],
    overrides: Annotated[
        tuple[str, ...],
        Parameter(
            help="Optional Hydra-style overrides (e.g. generation.sampling.temperature=0.2)",
            show_default=False,
        ),
    ] = (),
    log_level: Annotated[
        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
    ] = "info",
    csv_output: Annotated[
        Path | None, Parameter(help="Write CSV export to this path")
    ] = None,
    html_output: Annotated[
        Path | None, Parameter(help="Write HTML summary to this path")
    ] = None,
    json_output: Annotated[
        Path | None, Parameter(help="Write JSON export to this path")
    ] = None,
) -> int:
    """Execute an experiment described via config file."""
    configure_logging(log_level)
    experiment_config = load_experiment_config(config, overrides)
    dataset = load_dataset_from_config(experiment_config)
    total = effective_total(len(dataset), experiment_config.max_samples)
    with ProgressReporter(total=total, description="Generating") as progress:
        report = run_experiment_from_config(
            experiment_config,
            dataset=dataset,
            on_result=progress.on_result,
        )
    print(summarize_report_for_config(experiment_config, report))
    export_outputs(
        report,
        csv_output=csv_output,
        html_output=html_output,
        json_output=json_output,
        title=f"{experiment_config.name} experiment",
    )
    return 0


def validate_config(
    *,
    config: Annotated[Path, Parameter(help="Path to config file to validate")],
) -> int:
    """Validate a configuration file without running the experiment."""
    if not config.exists():
        print(f"❌ Error: Config file not found: {config}")
        return 1

    print(f"Validating config: {config}")
    print("-" * 60)

    try:
        # Try to load as experiment config
        experiment_config = load_experiment_config(config, overrides=())
        print("✓ Config file is valid")
        print(f"\nExperiment: {experiment_config.name}")
        print(f"Run ID: {experiment_config.run_id or '(auto-generated)'}")
        print(f"Resume: {experiment_config.resume}")
        print(f"Max samples: {experiment_config.max_samples or '(unlimited)'}")

        print("\nDataset:")
        print(f"  Source: {experiment_config.dataset.source}")
        print(f"  Split: {experiment_config.dataset.split}")
        if experiment_config.dataset.limit:
            print(f"  Limit: {experiment_config.dataset.limit}")
        if experiment_config.dataset.subjects:
            print(f"  Subjects: {', '.join(experiment_config.dataset.subjects)}")

        print("\nGeneration:")
        print(f"  Model: {experiment_config.generation.model_identifier}")
        print(f"  Provider: {experiment_config.generation.provider.name}")
        print(f"  Temperature: {experiment_config.generation.sampling.temperature}")
        print(f"  Max tokens: {experiment_config.generation.sampling.max_tokens}")

        if experiment_config.storage.path:
            print(f"\nStorage: {experiment_config.storage.path}")

        return 0
    except Exception as e:
        print(f"❌ Config validation failed: {e}")
        return 1


def init_config(
    *,
    output: Annotated[Path, Parameter(help="Output path for config file")] = Path(
        "themis_config.yaml"
    ),
    template: Annotated[
        Literal["basic", "math500", "inline"],
        Parameter(help="Config template to generate"),
    ] = "basic",
) -> int:
    """Generate a sample configuration file for use with run-config."""
    templates = {
        "basic": """name: my_experiment
task: math500
dataset:
  source: huggingface
  dataset_id: math500
  limit: 50
generation:
  model_identifier: fake-math-llm
  provider:
    name: fake
  sampling:
    temperature: 0.0
    top_p: 0.95
    max_tokens: 512
runner:
  max_parallel: 1
  max_retries: 3
storage:
  path: .cache/my_experiment
run_id: my-experiment-001
resume: true
""",
        "math500": """name: math500_evaluation
task: math500
dataset:
  source: huggingface
  dataset_id: math500
  limit: null  # No limit, run full dataset
  subjects:
    - algebra
    - geometry
generation:
  model_identifier: my-model
  provider:
    name: openai-compatible
    options:
      base_url: http://localhost:1234/v1
      api_key: not-needed
      model_name: qwen2.5-7b-instruct
      timeout: 60
  sampling:
    temperature: 0.0
    top_p: 0.95
    max_tokens: 512
runner:
  max_parallel: 4
  max_retries: 3
  retry_initial_delay: 0.5
  retry_backoff_multiplier: 2.0
  retry_max_delay: 2.0
storage:
  path: .cache/math500
run_id: math500-run-001
resume: true
max_samples: null
""",
        "inline": """name: inline_dataset_experiment
task: math500
dataset:
  source: inline
  inline_samples:
    - unique_id: sample-1
      problem: "What is 2 + 2?"
      answer: "4"
      subject: arithmetic
      level: 1
    - unique_id: sample-2
      problem: "Solve for x: 2x + 5 = 13"
      answer: "4"
      subject: algebra
      level: 2
generation:
  model_identifier: fake-math-llm
  provider:
    name: fake
  sampling:
    temperature: 0.0
    top_p: 0.95
    max_tokens: 512
storage:
  path: .cache/inline_experiment
run_id: inline-001
resume: true
""",
    }

    if output.exists():
        print(f"❌ Error: File already exists: {output}")
        print("   Use a different --output path or delete the existing file")
        return 1

    config_content = templates[template]

    try:
        output.parent.mkdir(parents=True, exist_ok=True)
        with open(output, "w") as f:
            f.write(config_content)

        print(f"✓ Created config file: {output}")
        print(f"  Template: {template}")
        print("\n📝 Next steps:")
        print(f"  1. Edit {output} to customize settings")
        print(
            f"  2. Validate: uv run python -m themis.cli validate-config --config {output}"
        )
        print(f"  3. Run: uv run python -m themis.cli run-config --config {output}")

        if template == "math500":
            print("\n⚠️ Remember to:")
            print("  • Update provider.options.base_url with your LLM server endpoint")
            print("  • Update provider.options.model_name with your actual model")
            print("  • Set provider.options.api_key if required by your server")
        elif template == "inline":
            print("\n💡 Tip:")
            print("  • Add more samples to dataset.inline_samples list")
            print("  • Each sample needs: unique_id, problem, answer")

        return 0
    except Exception as e:
        print(f"❌ Error creating config file: {e}")
        return 1