themis-eval 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
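Of the 158 changed files listed above, two of the new CLI modules are reproduced in full below, identified by their hunk sizes: themis/cli/commands/comparison.py (+394 lines) and themis/cli/commands/config_commands.py (+244 lines). The remaining additions are listed above but their contents are not shown here.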
themis/cli/commands/comparison.py (new file)
@@ -0,0 +1,394 @@
+"""Multi-experiment comparison commands."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Annotated
+
+from cyclopts import Parameter
+
+from themis.experiment.comparison import compare_experiments, diff_configs
+
+
+def compare_command(
+    *,
+    run_ids: Annotated[
+        list[str],
+        Parameter(
+            help="Run IDs to compare (comma-separated or multiple --run-ids)",
+        ),
+    ],
+    storage: Annotated[
+        Path,
+        Parameter(
+            help="Storage directory containing experiment results",
+        ),
+    ] = Path(".cache/runs"),
+    metrics: Annotated[
+        list[str] | None,
+        Parameter(
+            help="Metrics to compare (default: all available)",
+        ),
+    ] = None,
+    output: Annotated[
+        Path | None,
+        Parameter(
+            help="Output file path (format inferred from extension: .csv, .md, .json)",
+        ),
+    ] = None,
+    format: Annotated[
+        str,
+        Parameter(
+            help="Output format: csv, markdown, json, latex",
+        ),
+    ] = "markdown",
+    highlight_best: Annotated[
+        str | None,
+        Parameter(
+            help="Metric to highlight best performer (e.g., 'accuracy')",
+        ),
+    ] = None,
+) -> int:
+    """Compare multiple experiment runs.
+
+    Automatically includes cost data when available. Costs are tracked
+    automatically during experiment runs and displayed in comparisons.
+
+    Examples:
+        # Compare three runs with default metrics (includes cost if tracked)
+        uv run python -m themis.cli compare \\
+            --run-ids run-1 run-2 run-3 \\
+            --storage .cache/runs
+
+        # Compare with specific metrics, export to CSV
+        uv run python -m themis.cli compare \\
+            --run-ids run-1 run-2 run-3 \\
+            --metrics accuracy \\
+            --output comparison.csv
+
+        # Use 'cost' as a metric for ranking and Pareto analysis
+        uv run python -m themis.cli pareto \\
+            --run-ids run-1 run-2 run-3 \\
+            --objectives accuracy cost \\
+            --maximize true false
+
+        # Highlight best accuracy performer
+        uv run python -m themis.cli compare \\
+            --run-ids run-1 run-2 run-3 \\
+            --highlight-best accuracy
+    """
+    try:
+        # Load and compare experiments
+        print(f"Loading experiments from {storage}...")
+        comparison = compare_experiments(
+            run_ids=run_ids,
+            storage_dir=storage,
+            metrics=metrics,
+            include_metadata=True,
+        )
+
+        print(f"\n✓ Loaded {len(comparison.experiments)} experiments")
+        print(f" Metrics: {', '.join(comparison.metrics)}\n")
+
+        # Display comparison table
+        print("=" * 80)
+        print("Experiment Comparison")
+        print("=" * 80)
+
+        # Check if any experiment has cost data
+        has_cost = any(exp.get_cost() is not None for exp in comparison.experiments)
+
+        # Header
+        header_cols = ["Run ID"] + comparison.metrics + ["Samples", "Failures"]
+        if has_cost:
+            header_cols.append("Cost ($)")
+        col_widths = [max(20, len(col)) for col in header_cols]
+
+        header = " | ".join(
+            col.ljust(width) for col, width in zip(header_cols, col_widths)
+        )
+        print(header)
+        print("-" * len(header))
+
+        # Rows
+        for exp in comparison.experiments:
+            row_values = [exp.run_id[:20]]  # Truncate run ID
+            for metric in comparison.metrics:
+                val = exp.get_metric(metric)
+                row_values.append(f"{val:.4f}" if val is not None else "N/A")
+            row_values.append(str(exp.sample_count))
+            row_values.append(str(exp.failure_count))
+
+            # Add cost if available
+            if has_cost:
+                cost = exp.get_cost()
+                row_values.append(f"{cost:.4f}" if cost is not None else "N/A")
+
+            row = " | ".join(
+                val.ljust(width) for val, width in zip(row_values, col_widths)
+            )
+            print(row)
+
+        print("=" * 80)
+
+        # Highlight best if requested
+        if highlight_best:
+            if highlight_best in comparison.metrics:
+                best = comparison.highlight_best(highlight_best)
+                if best:
+                    best_value = best.get_metric(highlight_best)
+                    print(
+                        f"\n⭐ Best {highlight_best}: {best.run_id} ({best_value:.4f})"
+                    )
+                else:
+                    print(f"\n⚠️ No valid values for metric '{highlight_best}'")
+            else:
+                print(
+                    f"\n⚠️ Metric '{highlight_best}' not found. Available: {comparison.metrics}"
+                )
+
+        # Export if requested
+        if output:
+            output = Path(output)
+            # Infer format from extension if not specified
+            if output.suffix == ".csv":
+                comparison.to_csv(output)
+                print(f"\n✓ Exported to {output} (CSV)")
+            elif output.suffix == ".md":
+                comparison.to_markdown(output)
+                print(f"\n✓ Exported to {output} (Markdown)")
+            elif output.suffix == ".json":
+                import json
+
+                output.write_text(
+                    json.dumps(comparison.to_dict(), indent=2), encoding="utf-8"
+                )
+                print(f"\n✓ Exported to {output} (JSON)")
+            elif output.suffix == ".tex":
+                comparison.to_latex(output, style="booktabs")
+                print(f"\n✓ Exported to {output} (LaTeX)")
+            else:
+                # Use specified format
+                if format == "csv":
+                    comparison.to_csv(output)
+                    print(f"\n✓ Exported to {output} (CSV)")
+                elif format == "markdown":
+                    comparison.to_markdown(output)
+                    print(f"\n✓ Exported to {output} (Markdown)")
+                elif format == "json":
+                    import json
+
+                    output.write_text(
+                        json.dumps(comparison.to_dict(), indent=2), encoding="utf-8"
+                    )
+                    print(f"\n✓ Exported to {output} (JSON)")
+                elif format == "latex":
+                    comparison.to_latex(output, style="booktabs")
+                    print(f"\n✓ Exported to {output} (LaTeX)")
+                else:
+                    print(f"\n⚠️ Unknown format: {format}")
+                    print("Available formats: csv, markdown, json, latex")
+                    return 1
+
+        return 0
+
+    except ValueError as e:
+        print(f"Error: {e}")
+        return 1
+    except FileNotFoundError as e:
+        print(f"Error: {e}")
+        return 1
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+def diff_command(
+    *,
+    run_id_a: Annotated[
+        str,
+        Parameter(
+            help="First run ID",
+        ),
+    ],
+    run_id_b: Annotated[
+        str,
+        Parameter(
+            help="Second run ID",
+        ),
+    ],
+    storage: Annotated[
+        Path,
+        Parameter(
+            help="Storage directory containing experiment results",
+        ),
+    ] = Path(".cache/runs"),
+) -> int:
+    """Show configuration differences between two experiment runs.
+
+    Examples:
+        # Compare configurations
+        uv run python -m themis.cli diff \\
+            --run-id-a run-1 \\
+            --run-id-b run-2 \\
+            --storage .cache/runs
+    """
+    try:
+        diff = diff_configs(run_id_a, run_id_b, storage)
+
+        print("=" * 80)
+        print(f"Configuration Diff: {run_id_a} → {run_id_b}")
+        print("=" * 80)
+
+        if not diff.has_differences():
+            print("\n✓ No differences found - configurations are identical\n")
+            return 0
+
+        # Show changed fields
+        if diff.changed_fields:
+            print("\n📝 Changed Fields:")
+            for key, (old, new) in diff.changed_fields.items():
+                print(f"\n {key}:")
+                print(f" - {run_id_a}: {old}")
+                print(f" + {run_id_b}: {new}")
+
+        # Show added fields
+        if diff.added_fields:
+            print("\n➕ Added Fields (in run_id_b):")
+            for key, value in diff.added_fields.items():
+                print(f" {key}: {value}")
+
+        # Show removed fields
+        if diff.removed_fields:
+            print("\n➖ Removed Fields (from run_id_a):")
+            for key, value in diff.removed_fields.items():
+                print(f" {key}: {value}")
+
+        print("\n" + "=" * 80)
+        return 0
+
+    except FileNotFoundError as e:
+        print(f"Error: {e}")
+        print("\nMake sure both run IDs exist and have config.json files.")
+        return 1
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+def pareto_command(
+    *,
+    run_ids: Annotated[
+        list[str],
+        Parameter(
+            help="Run IDs to analyze",
+        ),
+    ],
+    storage: Annotated[
+        Path,
+        Parameter(
+            help="Storage directory containing experiment results",
+        ),
+    ] = Path(".cache/runs"),
+    objectives: Annotated[
+        list[str],
+        Parameter(
+            help="Metrics to optimize (e.g., accuracy cost)",
+        ),
+    ],
+    maximize: Annotated[
+        list[bool] | None,
+        Parameter(
+            help="Whether to maximize each objective (true/false for each)",
+        ),
+    ] = None,
+) -> int:
+    """Find Pareto-optimal experiments across multiple objectives.
+
+    The Pareto frontier consists of experiments where no other experiment
+    is better on all objectives simultaneously.
+
+    Examples:
+        # Find experiments with best accuracy/cost tradeoff
+        # (maximize accuracy, minimize cost)
+        uv run python -m themis.cli pareto \\
+            --run-ids run-1 run-2 run-3 run-4 \\
+            --objectives accuracy cost \\
+            --maximize true false
+
+        # Find experiments with best accuracy/latency tradeoff
+        uv run python -m themis.cli pareto \\
+            --run-ids run-1 run-2 run-3 \\
+            --objectives accuracy latency \\
+            --maximize true false
+    """
+    try:
+        # Load experiments
+        print(f"Loading experiments from {storage}...")
+        comparison = compare_experiments(
+            run_ids=run_ids,
+            storage_dir=storage,
+            metrics=objectives,
+            include_metadata=True,
+        )
+
+        print(f"\n✓ Loaded {len(comparison.experiments)} experiments")
+        print(f" Objectives: {', '.join(objectives)}\n")
+
+        # Compute Pareto frontier
+        pareto_ids = comparison.pareto_frontier(objectives, maximize)
+
+        print("=" * 80)
+        print("Pareto Frontier Analysis")
+        print("=" * 80)
+
+        if not pareto_ids:
+            print(
+                "\n⚠️ No Pareto-optimal experiments found (all experiments have missing values)\n"
+            )
+            return 0
+
+        print(f"\n⭐ Found {len(pareto_ids)} Pareto-optimal experiment(s):\n")
+
+        # Show Pareto-optimal experiments
+        for run_id in pareto_ids:
+            exp = next(e for e in comparison.experiments if e.run_id == run_id)
+            print(f" • {run_id}")
+            for obj in objectives:
+                val = exp.get_metric(obj)
+                print(
+                    f" {obj}: {val:.4f}"
+                    if val is not None
+                    else f" {obj}: N/A"
+                )
+
+        # Show dominated experiments
+        dominated = [
+            exp for exp in comparison.experiments if exp.run_id not in pareto_ids
+        ]
+        if dominated:
+            print(f"\n📊 Dominated experiments ({len(dominated)}):")
+            for exp in dominated:
+                print(f" • {exp.run_id}")
+
+        print("\n" + "=" * 80)
+        return 0
+
+    except ValueError as e:
+        print(f"Error: {e}")
+        return 1
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+__all__ = ["compare_command", "diff_command", "pareto_command"]
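
The compare, diff, and pareto commands above are thin CLI wrappers over themis.experiment.comparison. As a rough illustration (not part of the released package), the same comparison could be driven programmatically using only the functions, attributes, and methods the command code exercises; the run IDs below are placeholders for runs that exist under the storage directory.

from pathlib import Path

from themis.experiment.comparison import compare_experiments

# Placeholder run IDs; substitute runs that exist under the storage path.
comparison = compare_experiments(
    run_ids=["run-1", "run-2"],
    storage_dir=Path(".cache/runs"),
    metrics=["accuracy"],  # None compares all available metrics, as in compare_command
    include_metadata=True,
)

for exp in comparison.experiments:
    # Attributes used by compare_command above: run_id, get_metric, sample_count, failure_count
    print(exp.run_id, exp.get_metric("accuracy"), exp.sample_count, exp.failure_count)

# Maximize accuracy, minimize cost -- the same objectives shown in the pareto_command docstring.
pareto_ids = comparison.pareto_frontier(["accuracy", "cost"], [True, False])
print(pareto_ids)

comparison.to_markdown(Path("comparison.md"))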
themis/cli/commands/config_commands.py (new file)
@@ -0,0 +1,244 @@
+"""Configuration-related commands."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Annotated, Literal
+
+from cyclopts import Parameter
+
+from themis.cli.utils import effective_total, export_outputs
+from themis.config import (
+    load_dataset_from_config,
+    load_experiment_config,
+    run_experiment_from_config,
+    summarize_report_for_config,
+)
+from themis.utils.logging_utils import configure_logging
+from themis.utils.progress import ProgressReporter
+
+
+def run_configured_experiment(
+    *,
+    config: Annotated[
+        Path, Parameter(help="Path to a Hydra/OmegaConf experiment config file")
+    ],
+    overrides: Annotated[
+        tuple[str, ...],
+        Parameter(
+            help="Optional Hydra-style overrides (e.g. generation.sampling.temperature=0.2)",
+            show_default=False,
+        ),
+    ] = (),
+    log_level: Annotated[
+        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
+    ] = "info",
+    csv_output: Annotated[
+        Path | None, Parameter(help="Write CSV export to this path")
+    ] = None,
+    html_output: Annotated[
+        Path | None, Parameter(help="Write HTML summary to this path")
+    ] = None,
+    json_output: Annotated[
+        Path | None, Parameter(help="Write JSON export to this path")
+    ] = None,
+) -> int:
+    """Execute an experiment described via config file."""
+    configure_logging(log_level)
+    experiment_config = load_experiment_config(config, overrides)
+    dataset = load_dataset_from_config(experiment_config)
+    total = effective_total(len(dataset), experiment_config.max_samples)
+    with ProgressReporter(total=total, description="Generating") as progress:
+        report = run_experiment_from_config(
+            experiment_config,
+            dataset=dataset,
+            on_result=progress.on_result,
+        )
+    print(summarize_report_for_config(experiment_config, report))
+    export_outputs(
+        report,
+        csv_output=csv_output,
+        html_output=html_output,
+        json_output=json_output,
+        title=f"{experiment_config.name} experiment",
+    )
+    return 0
+
+
+def validate_config(
+    *,
+    config: Annotated[Path, Parameter(help="Path to config file to validate")],
+) -> int:
+    """Validate a configuration file without running the experiment."""
+    if not config.exists():
+        print(f"❌ Error: Config file not found: {config}")
+        return 1
+
+    print(f"Validating config: {config}")
+    print("-" * 60)
+
+    try:
+        # Try to load as experiment config
+        experiment_config = load_experiment_config(config, overrides=())
+        print("✓ Config file is valid")
+        print(f"\nExperiment: {experiment_config.name}")
+        print(f"Run ID: {experiment_config.run_id or '(auto-generated)'}")
+        print(f"Resume: {experiment_config.resume}")
+        print(f"Max samples: {experiment_config.max_samples or '(unlimited)'}")
+
+        print("\nDataset:")
+        print(f" Source: {experiment_config.dataset.source}")
+        print(f" Split: {experiment_config.dataset.split}")
+        if experiment_config.dataset.limit:
+            print(f" Limit: {experiment_config.dataset.limit}")
+        if experiment_config.dataset.subjects:
+            print(f" Subjects: {', '.join(experiment_config.dataset.subjects)}")
+
+        print("\nGeneration:")
+        print(f" Model: {experiment_config.generation.model_identifier}")
+        print(f" Provider: {experiment_config.generation.provider.name}")
+        print(f" Temperature: {experiment_config.generation.sampling.temperature}")
+        print(f" Max tokens: {experiment_config.generation.sampling.max_tokens}")
+
+        if experiment_config.storage.path:
+            print(f"\nStorage: {experiment_config.storage.path}")
+
+        return 0
+    except Exception as e:
+        print(f"❌ Config validation failed: {e}")
+        return 1
+
+
+def init_config(
+    *,
+    output: Annotated[Path, Parameter(help="Output path for config file")] = Path(
+        "themis_config.yaml"
+    ),
+    template: Annotated[
+        Literal["basic", "math500", "inline"],
+        Parameter(help="Config template to generate"),
+    ] = "basic",
+) -> int:
+    """Generate a sample configuration file for use with run-config."""
+    templates = {
+        "basic": """name: my_experiment
+task: math500
+dataset:
+  source: huggingface
+  dataset_id: math500
+  limit: 50
+generation:
+  model_identifier: fake-math-llm
+  provider:
+    name: fake
+  sampling:
+    temperature: 0.0
+    top_p: 0.95
+    max_tokens: 512
+runner:
+  max_parallel: 1
+  max_retries: 3
+storage:
+  path: .cache/my_experiment
+run_id: my-experiment-001
+resume: true
+""",
+        "math500": """name: math500_evaluation
+task: math500
+dataset:
+  source: huggingface
+  dataset_id: math500
+  limit: null  # No limit, run full dataset
+  subjects:
+    - algebra
+    - geometry
+generation:
+  model_identifier: my-model
+  provider:
+    name: openai-compatible
+    options:
+      base_url: http://localhost:1234/v1
+      api_key: not-needed
+      model_name: qwen2.5-7b-instruct
+      timeout: 60
+  sampling:
+    temperature: 0.0
+    top_p: 0.95
+    max_tokens: 512
+runner:
+  max_parallel: 4
+  max_retries: 3
+  retry_initial_delay: 0.5
+  retry_backoff_multiplier: 2.0
+  retry_max_delay: 2.0
+storage:
+  path: .cache/math500
+run_id: math500-run-001
+resume: true
+max_samples: null
+""",
+        "inline": """name: inline_dataset_experiment
+task: math500
+dataset:
+  source: inline
+  inline_samples:
+    - unique_id: sample-1
+      problem: "What is 2 + 2?"
+      answer: "4"
+      subject: arithmetic
+      level: 1
+    - unique_id: sample-2
+      problem: "Solve for x: 2x + 5 = 13"
+      answer: "4"
+      subject: algebra
+      level: 2
+generation:
+  model_identifier: fake-math-llm
+  provider:
+    name: fake
+  sampling:
+    temperature: 0.0
+    top_p: 0.95
+    max_tokens: 512
+storage:
+  path: .cache/inline_experiment
+run_id: inline-001
+resume: true
+""",
+    }
+
+    if output.exists():
+        print(f"❌ Error: File already exists: {output}")
+        print(" Use a different --output path or delete the existing file")
+        return 1
+
+    config_content = templates[template]
+
+    try:
+        output.parent.mkdir(parents=True, exist_ok=True)
+        with open(output, "w") as f:
+            f.write(config_content)
+
+        print(f"✓ Created config file: {output}")
+        print(f" Template: {template}")
+        print("\n📝 Next steps:")
+        print(f" 1. Edit {output} to customize settings")
+        print(
+            f" 2. Validate: uv run python -m themis.cli validate-config --config {output}"
+        )
+        print(f" 3. Run: uv run python -m themis.cli run-config --config {output}")
+
+        if template == "math500":
+            print("\n⚠️ Remember to:")
+            print(" • Update provider.options.base_url with your LLM server endpoint")
+            print(" • Update provider.options.model_name with your actual model")
+            print(" • Set provider.options.api_key if required by your server")
+        elif template == "inline":
+            print("\n💡 Tip:")
+            print(" • Add more samples to dataset.inline_samples list")
+            print(" • Each sample needs: unique_id, problem, answer")
+
+        return 0
+    except Exception as e:
+        print(f"❌ Error creating config file: {e}")
+        return 1
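
Taken together, init_config, validate_config, and run_configured_experiment describe a config-driven workflow (init-config → validate-config → run-config). A minimal programmatic sketch of that same flow, assuming the themis.config call signatures used above and that the on_result progress callback is optional:

from pathlib import Path

from themis.config import (
    load_dataset_from_config,
    load_experiment_config,
    run_experiment_from_config,
    summarize_report_for_config,
)

# themis_config.yaml is the default output path of init-config above.
experiment_config = load_experiment_config(Path("themis_config.yaml"), overrides=())
dataset = load_dataset_from_config(experiment_config)

# run_configured_experiment also passes on_result for progress reporting;
# it is omitted here on the assumption that it is optional.
report = run_experiment_from_config(experiment_config, dataset=dataset)
print(summarize_report_for_config(experiment_config, report))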