themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry to which they were published. It is provided for informational purposes only.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/cli/commands/visualize.py ADDED
@@ -0,0 +1,299 @@
+ """Visualization commands for interactive charts."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Annotated
+
+ from cyclopts import Parameter
+
+ from themis.experiment.comparison import compare_experiments
+ from themis.experiment.visualization import (
+     PLOTLY_AVAILABLE,
+     InteractiveVisualizer,
+     export_interactive_html,
+ )
+
+
+ def visualize_comparison_command(
+     *,
+     run_ids: Annotated[list[str], Parameter(help="Run IDs to visualize")],
+     storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
+     metric: Annotated[str | None, Parameter(help="Metric to visualize")] = None,
+     output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
+         "visualization.html"
+     ),
+     chart_type: Annotated[
+         str,
+         Parameter(help="Chart type: comparison, evolution, dashboard, pareto"),
+     ] = "comparison",
+ ) -> int:
+     """Generate interactive visualization for experiments.
+
+     Examples:
+         # Bar chart comparing accuracy across runs
+         uv run python -m themis.cli visualize \\
+             --run-ids run-1 run-2 run-3 \\
+             --metric accuracy \\
+             --output accuracy_comparison.html
+
+         # Evolution chart showing metric over time
+         uv run python -m themis.cli visualize \\
+             --run-ids run-1 run-2 run-3 run-4 \\
+             --metric accuracy \\
+             --chart-type evolution \\
+             --output accuracy_evolution.html
+
+         # Dashboard with multiple metrics
+         uv run python -m themis.cli visualize \\
+             --run-ids run-1 run-2 run-3 \\
+             --chart-type dashboard \\
+             --output dashboard.html
+
+         # Pareto frontier (requires --metric1 and --metric2; see visualize-pareto)
+         uv run python -m themis.cli visualize-pareto \\
+             --run-ids run-1 run-2 run-3 \\
+             --metric1 accuracy \\
+             --metric2 cost \\
+             --output pareto.html
+     """
+     if not PLOTLY_AVAILABLE:
+         print("Error: Plotly is not installed.")
+         print("Install with: pip install plotly")
+         return 1
+
+     try:
+         # Load experiments
+         print(f"Loading experiments from {storage}...")
+         comparison = compare_experiments(
+             run_ids=run_ids,
+             storage_dir=storage,
+             include_metadata=True,
+         )
+
+         print(f"✓ Loaded {len(comparison.experiments)} experiments")
+
+         # Create visualizer
+         visualizer = InteractiveVisualizer()
+
+         # Generate chart based on type
+         if chart_type == "comparison":
+             if not metric:
+                 metric = comparison.metrics[0] if comparison.metrics else "accuracy"
+                 print(f"Using default metric: {metric}")
+
+             print(f"Creating comparison chart for '{metric}'...")
+             fig = visualizer.plot_metric_comparison(comparison, metric)
+
+         elif chart_type == "evolution":
+             if not metric:
+                 metric = comparison.metrics[0] if comparison.metrics else "accuracy"
+                 print(f"Using default metric: {metric}")
+
+             print(f"Creating evolution chart for '{metric}'...")
+             fig = visualizer.plot_metric_evolution(comparison, metric)
+
+         elif chart_type == "dashboard":
+             print("Creating dashboard with multiple metrics...")
+             fig = visualizer.create_dashboard(comparison)
+
+         else:
+             print(f"Error: Unknown chart type '{chart_type}'")
+             print("Available: comparison, evolution, dashboard")
+             return 1
+
+         # Export to HTML
+         export_interactive_html(fig, output)
+         print(f"\n✓ Visualization saved to {output}")
+         print(" Open in browser to interact with chart")
+
+         return 0
+
+     except Exception as e:
+         print(f"Error: {e}")
+         return 1
+
+
+ def visualize_pareto_command(
+     *,
+     run_ids: Annotated[list[str], Parameter(help="Run IDs to visualize")],
+     storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
+     metric1: Annotated[str, Parameter(help="First metric (x-axis)")],
+     metric2: Annotated[str, Parameter(help="Second metric (y-axis)")],
+     maximize1: Annotated[bool, Parameter(help="Maximize metric1")] = True,
+     maximize2: Annotated[bool, Parameter(help="Maximize metric2")] = True,
+     output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
+         "pareto.html"
+     ),
+ ) -> int:
+     """Generate Pareto frontier visualization.
+
+     Examples:
+         # Maximize accuracy, minimize cost
+         uv run python -m themis.cli visualize-pareto \\
+             --run-ids run-1 run-2 run-3 run-4 \\
+             --metric1 accuracy \\
+             --metric2 cost \\
+             --maximize1 true \\
+             --maximize2 false \\
+             --output pareto.html
+     """
+     if not PLOTLY_AVAILABLE:
+         print("Error: Plotly is not installed.")
+         print("Install with: pip install plotly")
+         return 1
+
+     try:
+         # Load experiments
+         print(f"Loading experiments from {storage}...")
+         comparison = compare_experiments(
+             run_ids=run_ids,
+             storage_dir=storage,
+             include_metadata=True,
+         )
+
+         print(f"✓ Loaded {len(comparison.experiments)} experiments")
+
+         # Compute Pareto frontier
+         print(f"Computing Pareto frontier for {metric1} and {metric2}...")
+         pareto_ids = comparison.pareto_frontier(
+             objectives=[metric1, metric2],
+             maximize=[maximize1, maximize2],
+         )
+
+         print(f"✓ Found {len(pareto_ids)} Pareto-optimal experiments:")
+         for run_id in pareto_ids:
+             print(f" - {run_id}")
+
+         # Create visualization
+         visualizer = InteractiveVisualizer()
+         fig = visualizer.plot_pareto_frontier(
+             comparison, metric1, metric2, pareto_ids, maximize1, maximize2
+         )
+
+         # Export to HTML
+         export_interactive_html(fig, output)
+         print(f"\n✓ Visualization saved to {output}")
+         print(" Red points are Pareto-optimal")
+         print(" Blue points are dominated")
+
+         return 0
+
+     except Exception as e:
+         print(f"Error: {e}")
+         return 1
+
+
+ def visualize_distribution_command(
+     *,
+     run_id: Annotated[str, Parameter(help="Run ID to visualize")],
+     storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
+     metric: Annotated[str, Parameter(help="Metric to visualize")],
+     plot_type: Annotated[
+         str, Parameter(help="Plot type: histogram, box, violin")
+     ] = "histogram",
+     output: Annotated[Path, Parameter(help="Output HTML file path")] = Path(
+         "distribution.html"
+     ),
+ ) -> int:
+     """Generate metric distribution visualization.
+
+     Shows the distribution of a metric across all samples in an experiment.
+
+     Examples:
+         # Histogram of accuracy scores
+         uv run python -m themis.cli visualize-distribution \\
+             --run-id my-run \\
+             --metric accuracy \\
+             --output accuracy_dist.html
+
+         # Violin plot
+         uv run python -m themis.cli visualize-distribution \\
+             --run-id my-run \\
+             --metric accuracy \\
+             --plot-type violin \\
+             --output accuracy_violin.html
+     """
+     if not PLOTLY_AVAILABLE:
+         print("Error: Plotly is not installed.")
+         print("Install with: pip install plotly")
+         return 1
+
+     try:
+         import json
+
+         # Load report
+         print(f"Loading report from {storage / run_id}...")
+         report_path = storage / run_id / "report.json"
+
+         if not report_path.exists():
+             print(f"Error: Report not found at {report_path}")
+             return 1
+
+         with report_path.open("r", encoding="utf-8") as f:
+             report_data = json.load(f)
+
+         # Extract evaluation report
+         # Note: This is simplified - in production you'd deserialize properly
+         from themis.core.entities import EvaluationRecord, MetricScore
+         from themis.evaluation.reports import EvaluationReport, MetricAggregate
+
+         # Build evaluation report from JSON
+         records = []
+         for sample_data in report_data.get("samples", []):
+             scores = [
+                 MetricScore(
+                     metric_name=score["metric"],
+                     value=score["value"],
+                     details=score.get("details"),
+                     metadata=score.get("metadata", {}),
+                 )
+                 for score in sample_data["scores"]
+             ]
+             records.append(
+                 EvaluationRecord(
+                     sample_id=sample_data["sample_id"],
+                     scores=scores,
+                     failures=[],
+                 )
+             )
+
+         # Build metric aggregates
+         metrics = {}
+         for metric_data in report_data.get("metrics", []):
+             metrics[metric_data["name"]] = MetricAggregate(
+                 count=metric_data["count"],
+                 mean=metric_data["mean"],
+             )
+
+         eval_report = EvaluationReport(
+             records=records,
+             metrics=metrics,
+             failures=[],
+         )
+
+         print(f"✓ Loaded report with {len(records)} samples")
+
+         # Create visualization
+         visualizer = InteractiveVisualizer()
+         fig = visualizer.plot_metric_distribution(eval_report, metric, plot_type)
+
+         # Export to HTML
+         export_interactive_html(fig, output)
+         print(f"\n✓ Visualization saved to {output}")
+
+         return 0
+
+     except Exception as e:
+         print(f"Error: {e}")
+         import traceback
+
+         traceback.print_exc()
+         return 1
+
+
+ __all__ = [
+     "visualize_comparison_command",
+     "visualize_pareto_command",
+     "visualize_distribution_command",
+ ]
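The three commands above are plain keyword-only functions that return an exit code, so they can also be invoked from Python rather than through the CLI. A minimal sketch, assuming Plotly is installed and that run-1 and run-2 (illustrative run IDs) already exist under the default .cache/runs storage directory:

    from pathlib import Path

    from themis.cli.commands.visualize import visualize_comparison_command

    # Bar chart comparing the "accuracy" metric across two stored runs.
    exit_code = visualize_comparison_command(
        run_ids=["run-1", "run-2"],               # illustrative run IDs
        storage=Path(".cache/runs"),              # default storage location
        metric="accuracy",
        chart_type="comparison",
        output=Path("accuracy_comparison.html"),
    )
    print("ok" if exit_code == 0 else "failed")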
themis/cli/main.py ADDED
@@ -0,0 +1,93 @@
+ """Cyclopts-powered CLI entrypoints for Themis."""
+
+ from __future__ import annotations
+
+ from typing import Sequence
+
+ from cyclopts import App
+
+ # Import command modules
+ from themis.cli.commands import (
+     benchmarks,
+     comparison,
+     config_commands,
+     cost,
+     demo,
+     info,
+     leaderboard,
+     sample_run,
+     visualize,
+ )
+ from themis.cli.commands import math_benchmarks as math_cmds
+ from themis.cli.commands import mcq_benchmarks as mcq_cmds
+
+ # Import provider modules to ensure they register themselves
+ try:
+     from themis.generation import clients  # noqa: F401 - registers fake provider
+     from themis.generation.providers import (
+         litellm_provider,  # noqa: F401
+         vllm_provider,  # noqa: F401
+     )
+ except ImportError:
+     pass  # Some providers may not be available
+
+ app = App(help="Run Themis experiments from the command line")
+
+ # Register demo command
+ app.command(name="demo")(demo.demo_command)
+
+ # Register math benchmark commands
+ app.command(name="math500")(math_cmds.math500_command)
+ app.command(name="aime24")(math_cmds.aime24_command)
+ app.command(name="aime25")(math_cmds.aime25_command)
+ app.command(name="amc23")(math_cmds.amc23_command)
+ app.command(name="olympiadbench")(math_cmds.olympiadbench_command)
+ app.command(name="beyondaime")(math_cmds.beyond_aime_command)
+
+ # Register MCQ benchmark commands
+ app.command(name="supergpqa")(mcq_cmds.supergpqa_command)
+ app.command(name="mmlu-pro")(mcq_cmds.mmlu_pro_command)
+
+ # Register config commands
+ app.command(name="run-config")(config_commands.run_configured_experiment)
+ app.command(name="validate-config")(config_commands.validate_config)
+ app.command(name="init")(config_commands.init_config)
+
+ # Register info and listing commands
+ app.command(name="list-providers")(benchmarks.list_providers)
+ app.command(name="list-benchmarks")(benchmarks.list_benchmarks)
+ app.command(name="info")(info.show_info)
+ app.command(name="new-project")(info.new_project)
+
+ # Register comparison commands
+ app.command(name="compare")(comparison.compare_command)
+ app.command(name="diff")(comparison.diff_command)
+ app.command(name="pareto")(comparison.pareto_command)
+
+ # Register cost commands
+ app.command(name="estimate-cost")(cost.estimate_cost_command)
+ app.command(name="show-pricing")(cost.show_pricing_command)
+
+ # Register visualization commands
+ app.command(name="visualize")(visualize.visualize_comparison_command)
+ app.command(name="visualize-pareto")(visualize.visualize_pareto_command)
+ app.command(name="visualize-distribution")(visualize.visualize_distribution_command)
+
+ # Register leaderboard command
+ app.command(name="leaderboard")(leaderboard.leaderboard_command)
+
+ # Register sample-run command
+ app.command(name="sample-run")(sample_run.sample_run_command)
+
+
+ def main(argv: Sequence[str] | None = None) -> int:
+     parsed_argv = list(argv) if argv is not None else None
+     try:
+         result = app(parsed_argv)
+     except SystemExit as exc:  # pragma: no cover - CLI integration path
+         return int(exc.code or 0)
+     return int(result) if isinstance(result, int) else 0
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     raise SystemExit(main())
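main() accepts an explicit argument vector and converts any SystemExit raised by the cyclopts App into an integer return code, which makes the CLI easy to drive in-process, for example from tests. A minimal sketch, assuming the listed command modules import cleanly and that list-benchmarks needs no additional arguments:

    from themis.cli.main import main

    # Dispatches through the registered cyclopts commands, just like
    # `python -m themis.cli list-benchmarks` on the command line.
    code = main(["list-benchmarks"])
    print(f"exit code: {code}")  # 0 is expected on success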
themis/cli/new_project.py ADDED
@@ -0,0 +1,33 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+
+ def create_project(project_name: str, project_path: Path) -> None:
+     if (project_path / project_name).exists():
+         raise FileExistsError(
+             f"Project '{project_name}' already exists in {project_path}"
+         )
+
+     project_dir = project_path / project_name
+     project_dir.mkdir()
+
+     templates_dir = Path(__file__).parent / "templates"
+
+     # Create config.sample.json
+     with open(templates_dir / "config.sample.json.tpl", "r") as f:
+         config_template = f.read()
+     with open(project_dir / "config.sample.json", "w") as f:
+         f.write(config_template.replace("{{project_name}}", project_name))
+
+     # Create cli.py
+     with open(templates_dir / "cli.py.tpl", "r") as f:
+         cli_template = f.read()
+     with open(project_dir / "cli.py", "w") as f:
+         f.write(cli_template)
+
+     # Create README.md
+     with open(templates_dir / "README.md.tpl", "r") as f:
+         readme_template = f.read()
+     with open(project_dir / "README.md", "w") as f:
+         f.write(readme_template.replace("{{project_name}}", project_name))
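create_project reads three bundled templates, substitutes {{project_name}}, and writes the result into a fresh directory, raising FileExistsError if the directory already exists. A minimal sketch, assuming the packaged templates directory ships with the wheel and that my-eval (an illustrative name) does not yet exist:

    from pathlib import Path

    from themis.cli.new_project import create_project

    # Scaffolds ./my-eval with config.sample.json, cli.py, and README.md.
    create_project("my-eval", Path.cwd())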
themis/cli/utils.py ADDED
@@ -0,0 +1,51 @@
+ """CLI utility functions."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from themis.experiment import export as experiment_export
+ from themis.experiment import orchestrator
+
+
+ def export_outputs(
+     report: orchestrator.ExperimentReport,
+     *,
+     csv_output: Path | None,
+     html_output: Path | None,
+     json_output: Path | None,
+     title: str,
+ ) -> None:
+     """Export experiment report to various formats.
+
+     Args:
+         report: Experiment report to export
+         csv_output: Optional path for CSV export
+         html_output: Optional path for HTML export
+         json_output: Optional path for JSON export
+         title: Title for the report
+     """
+     outputs = experiment_export.export_report_bundle(
+         report,
+         csv_path=csv_output,
+         html_path=html_output,
+         json_path=json_output,
+         title=title,
+     )
+     for kind, output_path in outputs.items():
+         print(f"Exported {kind.upper()} to {output_path}")
+
+
+ def effective_total(total: int, limit: int | None) -> int:
+     """Calculate effective total based on limit.
+
+     Args:
+         total: Total number of items
+         limit: Optional limit
+
+     Returns:
+         Effective total (min of total and limit)
+     """
+     if limit is None:
+         return total
+     return min(total, limit)
themis/config/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """Hydra-backed configuration helpers for assembling experiments."""
+
+ from __future__ import annotations
+
+ from .loader import load_experiment_config
+ from .runtime import (
+     load_dataset_from_config,
+     run_experiment_from_config,
+     summarize_report_for_config,
+ )
+ from .schema import ExperimentConfig
+
+ __all__ = [
+     "ExperimentConfig",
+     "load_dataset_from_config",
+     "load_experiment_config",
+     "run_experiment_from_config",
+     "summarize_report_for_config",
+ ]
themis/config/loader.py ADDED
@@ -0,0 +1,27 @@
+ """Utilities for loading experiment configs via Hydra/OmegaConf."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Iterable
+
+ from omegaconf import OmegaConf
+
+ from . import schema
+
+
+ def load_experiment_config(
+     config_path: Path,
+     overrides: Iterable[str] | None = None,
+ ) -> schema.ExperimentConfig:
+     """Load and validate an experiment config file with optional overrides."""
+
+     base = OmegaConf.structured(schema.ExperimentConfig)
+     file_conf = OmegaConf.load(config_path)
+     merged = OmegaConf.merge(base, file_conf)
+
+     if overrides:
+         override_conf = OmegaConf.from_dotlist(list(overrides))
+         merged = OmegaConf.merge(merged, override_conf)
+
+     return OmegaConf.to_object(merged)
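load_experiment_config layers three sources in order, with later ones winning: the structured defaults of ExperimentConfig, the config file, and any dotlist overrides. A minimal sketch, assuming an experiment.yaml file exists and that model.name and dataset.limit (illustrative keys) are fields declared on ExperimentConfig:

    from pathlib import Path

    from themis.config import load_experiment_config

    config = load_experiment_config(
        Path("experiment.yaml"),
        overrides=["model.name=gpt-4o-mini", "dataset.limit=100"],  # OmegaConf dotlist syntax
    )
    # OmegaConf.to_object returns a typed ExperimentConfig instance.
    print(type(config).__name__)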
themis/config/registry.py ADDED
@@ -0,0 +1,34 @@
+ """Registry for experiment builders."""
+
+ from __future__ import annotations
+
+ from typing import Callable
+
+ from themis.config import schema
+ from themis.experiment import orchestrator
+
+ ExperimentBuilder = Callable[
+     [schema.ExperimentConfig], orchestrator.ExperimentOrchestrator
+ ]
+
+ _EXPERIMENT_BUILDERS: dict[str, ExperimentBuilder] = {}
+
+
+ def register_experiment_builder(task: str) -> Callable[[ExperimentBuilder], ExperimentBuilder]:
+     """Decorator to register an experiment builder for a specific task."""
+
+     def decorator(builder: ExperimentBuilder) -> ExperimentBuilder:
+         _EXPERIMENT_BUILDERS[task] = builder
+         return builder
+
+     return decorator
+
+
+ def get_experiment_builder(task: str) -> ExperimentBuilder:
+     """Get the experiment builder for a specific task."""
+     if task not in _EXPERIMENT_BUILDERS:
+         raise ValueError(
+             f"No experiment builder registered for task '{task}'. "
+             f"Available tasks: {', '.join(sorted(_EXPERIMENT_BUILDERS.keys()))}"
+         )
+     return _EXPERIMENT_BUILDERS[task]
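The registry pairs a registration decorator with a lookup that raises a descriptive ValueError for unknown tasks, so config-driven code can resolve a builder from the task string alone. A minimal sketch, assuming a hypothetical "math" task whose builder body is left unimplemented because the orchestrator's construction API is outside this diff:

    from themis.config import schema
    from themis.config.registry import get_experiment_builder, register_experiment_builder
    from themis.experiment import orchestrator


    @register_experiment_builder("math")  # illustrative task name
    def build_math_experiment(
        config: schema.ExperimentConfig,
    ) -> orchestrator.ExperimentOrchestrator:
        # Hypothetical builder: assemble and return the orchestrator for math tasks.
        raise NotImplementedError


    builder = get_experiment_builder("math")  # returns build_math_experiment
    # get_experiment_builder("unknown") raises ValueError listing registered tasks.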