themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/cli/commands/mcq_benchmarks.py
@@ -0,0 +1,207 @@
+ """Multiple-choice question benchmark commands."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Annotated, Callable, Literal, Sequence
+
+ from cyclopts import Parameter
+
+ from themis.cli.utils import effective_total, export_outputs
+ from themis.datasets import (
+     mmlu_pro as mmlu_pro_dataset,
+ )
+ from themis.datasets import (
+     super_gpqa as super_gpqa_dataset,
+ )
+ from themis.experiment import mcq as mcq_experiment
+ from themis.experiment import storage as experiment_storage
+ from themis.utils.logging_utils import configure_logging
+ from themis.utils.progress import ProgressReporter
+
+
+ def load_multiple_choice_dataset(
+     *,
+     loader: Callable[..., Sequence],
+     source: Literal["huggingface", "local"],
+     data_dir: Path | None,
+     split: str,
+     limit: int | None,
+     subjects: Sequence[str] | None,
+ ):
+     """Load multiple choice dataset.
+
+     Args:
+         loader: Dataset loader function
+         source: Dataset source
+         data_dir: Directory containing local dataset
+         split: Dataset split
+         limit: Max rows to load
+         subjects: Subjects to filter
+
+     Returns:
+         List of generation examples
+     """
+     if source == "local" and data_dir is None:
+         raise ValueError(
+             "The --data-dir option is required when --source=local so Themis "
+             "knows where to read the dataset."
+         )
+     samples = loader(
+         source=source,
+         data_dir=data_dir,
+         split=split,
+         limit=limit,
+         subjects=subjects,
+     )
+     return [sample.to_generation_example() for sample in samples]
+
+
+ def supergpqa_command(
+     *,
+     source: Annotated[
+         Literal["huggingface", "local"], Parameter(help="Dataset source")
+     ] = "huggingface",
+     split: Annotated[str, Parameter(help="Dataset split to load")] = "test",
+     data_dir: Annotated[
+         Path | None, Parameter(help="Directory containing local dataset")
+     ] = None,
+     limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
+     subjects: Annotated[
+         tuple[str, ...], Parameter(help="Subjects or categories to filter")
+     ] = (),
+     max_samples: Annotated[int | None, Parameter(help="Maximum samples to run")] = None,
+     storage: Annotated[
+         Path | None, Parameter(help="Cache directory for datasets/results")
+     ] = None,
+     run_id: Annotated[str | None, Parameter(help="Identifier for cached run")] = None,
+     resume: Annotated[
+         bool, Parameter(help="Reuse cached generations when storage is set")
+     ] = True,
+     temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
+     log_level: Annotated[
+         str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
+     ] = "info",
+     csv_output: Annotated[
+         Path | None, Parameter(help="Write CSV export to this path")
+     ] = None,
+     html_output: Annotated[
+         Path | None, Parameter(help="Write HTML summary to this path")
+     ] = None,
+     json_output: Annotated[
+         Path | None, Parameter(help="Write JSON export to this path")
+     ] = None,
+ ) -> int:
+     """Run the SuperGPQA multiple-choice evaluation."""
+     configure_logging(log_level)
+     subject_filter = list(subjects) if subjects else None
+     rows = load_multiple_choice_dataset(
+         loader=super_gpqa_dataset.load_super_gpqa,
+         source=source,
+         data_dir=data_dir,
+         split=split,
+         limit=limit,
+         subjects=subject_filter,
+     )
+
+     storage_impl = experiment_storage.ExperimentStorage(storage) if storage else None
+     experiment = mcq_experiment.build_multiple_choice_json_experiment(
+         dataset_name="supergpqa",
+         task_id="supergpqa",
+         temperature=temperature,
+         storage=storage_impl,
+     )
+
+     total = effective_total(len(rows), max_samples)
+     with ProgressReporter(total=total, description="Generating") as progress:
+         report = experiment.run(
+             rows,
+             max_samples=max_samples,
+             run_id=run_id,
+             resume=resume,
+             on_result=progress.on_result,
+         )
+     print(mcq_experiment.summarize_report(report))
+     export_outputs(
+         report,
+         csv_output=csv_output,
+         html_output=html_output,
+         json_output=json_output,
+         title="supergpqa experiment",
+     )
+     return 0
+
+
+ def mmlu_pro_command(
+     *,
+     source: Annotated[
+         Literal["huggingface", "local"], Parameter(help="Dataset source")
+     ] = "huggingface",
+     split: Annotated[str, Parameter(help="Dataset split to load")] = "test",
+     data_dir: Annotated[
+         Path | None, Parameter(help="Directory containing local dataset")
+     ] = None,
+     limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
+     subjects: Annotated[
+         tuple[str, ...], Parameter(help="Subjects or categories to filter")
+     ] = (),
+     max_samples: Annotated[int | None, Parameter(help="Maximum samples to run")] = None,
+     storage: Annotated[
+         Path | None, Parameter(help="Cache directory for datasets/results")
+     ] = None,
+     run_id: Annotated[str | None, Parameter(help="Identifier for cached run")] = None,
+     resume: Annotated[
+         bool, Parameter(help="Reuse cached generations when storage is set")
+     ] = True,
+     temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
+     log_level: Annotated[
+         str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
+     ] = "info",
+     csv_output: Annotated[
+         Path | None, Parameter(help="Write CSV export to this path")
+     ] = None,
+     html_output: Annotated[
+         Path | None, Parameter(help="Write HTML summary to this path")
+     ] = None,
+     json_output: Annotated[
+         Path | None, Parameter(help="Write JSON export to this path")
+     ] = None,
+ ) -> int:
+     """Run the MMLU-Pro multiple-choice evaluation."""
+     configure_logging(log_level)
+     subject_filter = list(subjects) if subjects else None
+     rows = load_multiple_choice_dataset(
+         loader=mmlu_pro_dataset.load_mmlu_pro,
+         source=source,
+         data_dir=data_dir,
+         split=split,
+         limit=limit,
+         subjects=subject_filter,
+     )
+
+     storage_impl = experiment_storage.ExperimentStorage(storage) if storage else None
+     experiment = mcq_experiment.build_multiple_choice_json_experiment(
+         dataset_name="mmlu-pro",
+         task_id="mmlu_pro",
+         temperature=temperature,
+         storage=storage_impl,
+     )
+
+     total = effective_total(len(rows), max_samples)
+     with ProgressReporter(total=total, description="Generating") as progress:
+         report = experiment.run(
+             rows,
+             max_samples=max_samples,
+             run_id=run_id,
+             resume=resume,
+             on_result=progress.on_result,
+         )
+     print(mcq_experiment.summarize_report(report))
+     export_outputs(
+         report,
+         csv_output=csv_output,
+         html_output=html_output,
+         json_output=json_output,
+         title="mmlu_pro experiment",
+     )
+     return 0
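
Both commands in this file follow the same shape: load rows through load_multiple_choice_dataset, build an experiment with build_multiple_choice_json_experiment, run it under a ProgressReporter, then export. A minimal sketch of driving mmlu_pro_command directly from Python; the storage path, run id, subject filter, and output path below are illustrative placeholders, not values from the package:

    from pathlib import Path

    from themis.cli.commands.mcq_benchmarks import mmlu_pro_command

    # Smoke-test MMLU-Pro on a small slice; with storage set, generations are
    # cached so a later invocation with resume=True can reuse them.
    exit_code = mmlu_pro_command(
        source="huggingface",
        split="test",
        limit=20,                                   # load at most 20 rows
        subjects=("computer science",),             # illustrative subject filter
        temperature=0.0,
        storage=Path("outputs/mmlu-pro-cache"),     # illustrative cache directory
        run_id="mmlu-pro-smoke",                    # illustrative run id
        csv_output=Path("outputs/mmlu_pro.csv"),    # illustrative export path
    )
    assert exit_code == 0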
themis/cli/commands/results.py
@@ -0,0 +1,252 @@
+ """Quick results viewing commands for experiment summaries."""
+
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import Annotated
+
+ from cyclopts import Parameter
+
+
+ def summary_command(
+     *,
+     run_id: Annotated[
+         str,
+         Parameter(
+             help="Run ID to view summary for",
+         ),
+     ],
+     storage: Annotated[
+         Path,
+         Parameter(
+             help="Storage directory containing experiment results",
+         ),
+     ] = Path(".cache/runs"),
+ ) -> int:
+     """View quick summary of a single experiment run.
+
+     This command reads the lightweight summary.json file (~1KB) instead of
+     the full report.json (~1.6MB), making it much faster for quick checks.
+
+     Examples:
+         # View summary for a specific run
+         uv run python -m themis.cli results summary \\
+             --run-id run-20260118-032014 \\
+             --storage outputs/evaluation
+
+         # Quick check of latest run
+         uv run python -m themis.cli results summary \\
+             --run-id $(ls -t outputs/evaluation | head -1)
+     """
+     try:
+         # Try to find summary.json
+         run_dir = storage / run_id
+         summary_path = run_dir / "summary.json"
+
+         if not summary_path.exists():
+             print(f"Error: Summary file not found at {summary_path}")
+             print("\nNote: summary.json is only available for runs created with")
+             print("the updated export functionality. For older runs, use the")
+             print("'compare' command which reads full report.json files.")
+             return 1
+
+         # Load summary
+         with summary_path.open("r", encoding="utf-8") as f:
+             summary = json.load(f)
+
+         # Display summary
+         print("=" * 80)
+         print(f"Experiment Summary: {run_id}")
+         print("=" * 80)
+
+         # Basic info
+         print(f"\nRun ID: {summary.get('run_id', 'N/A')}")
+         print(f"Total Samples: {summary.get('total_samples', 0)}")
+
+         # Metadata
+         metadata = summary.get("metadata", {})
+         if metadata:
+             print("\nConfiguration:")
+             print(f" Model: {metadata.get('model', 'N/A')}")
+             print(f" Prompt: {metadata.get('prompt_template', 'N/A')}")
+             sampling = metadata.get("sampling", {})
+             if sampling:
+                 print(f" Temperature: {sampling.get('temperature', 'N/A')}")
+                 print(f" Max Tokens: {sampling.get('max_tokens', 'N/A')}")
+
+         # Metrics
+         metrics = summary.get("metrics", {})
+         if metrics:
+             print("\nMetrics:")
+             for name, data in metrics.items():
+                 mean = data.get("mean", 0)
+                 count = data.get("count", 0)
+                 print(f" {name}: {mean:.4f} (n={count})")
+
+         # Cost
+         cost = summary.get("cost_usd")
+         if cost is not None:
+             print(f"\nCost: ${cost:.4f}")
+
+         # Failures
+         failures = summary.get("failures", 0)
+         failure_rate = summary.get("failure_rate", 0)
+         if failures > 0:
+             print(f"\nFailures: {failures} ({failure_rate:.2%})")
+
+         print("\n" + "=" * 80)
+         return 0
+
+     except FileNotFoundError:
+         print(f"Error: Run directory not found: {run_dir}")
+         return 1
+     except json.JSONDecodeError as e:
+         print(f"Error: Invalid JSON in summary file: {e}")
+         return 1
+     except Exception as e:
+         print(f"Unexpected error: {e}")
+         import traceback
+
+         traceback.print_exc()
+         return 1
+
+
+ def list_command(
+     *,
+     storage: Annotated[
+         Path,
+         Parameter(
+             help="Storage directory containing experiment results",
+         ),
+     ] = Path(".cache/runs"),
+     limit: Annotated[
+         int | None,
+         Parameter(
+             help="Maximum number of runs to display",
+         ),
+     ] = None,
+     sort_by: Annotated[
+         str,
+         Parameter(
+             help="Sort runs by: time (newest first) or metric name",
+         ),
+     ] = "time",
+ ) -> int:
+     """List all experiment runs with quick summaries.
+
+     This command scans for summary.json files and displays a table of all runs.
+     Much faster than loading full report.json files.
+
+     Examples:
+         # List all runs
+         uv run python -m themis.cli results list
+
+         # List 10 most recent runs
+         uv run python -m themis.cli results list --limit 10
+
+         # List runs sorted by accuracy
+         uv run python -m themis.cli results list --sort-by accuracy
+     """
+     try:
+         if not storage.exists():
+             print(f"Error: Storage directory not found: {storage}")
+             return 1
+
+         # Find all summary.json files
+         summaries = []
+         for run_dir in storage.iterdir():
+             if not run_dir.is_dir():
+                 continue
+             summary_path = run_dir / "summary.json"
+             if summary_path.exists():
+                 try:
+                     with summary_path.open("r", encoding="utf-8") as f:
+                         summary = json.load(f)
+                     summary["_run_dir"] = run_dir.name
+                     summary["_mtime"] = summary_path.stat().st_mtime
+                     summaries.append(summary)
+                 except Exception:
+                     continue
+
+         if not summaries:
+             print(f"No experiment runs found in {storage}")
+             print("\nNote: Only runs with summary.json files are shown.")
+             return 0
+
+         # Sort summaries
+         if sort_by == "time":
+             summaries.sort(key=lambda s: s.get("_mtime", 0), reverse=True)
+         else:
+             # Sort by metric value
+             summaries.sort(
+                 key=lambda s: s.get("metrics", {}).get(sort_by, {}).get("mean", 0),
+                 reverse=True,
+             )
+
+         # Apply limit
+         if limit:
+             summaries = summaries[:limit]
+
+         # Display table
+         print("=" * 120)
+         print(f"Found {len(summaries)} experiment run(s)")
+         print("=" * 120)
+
+         # Collect all metric names
+         all_metrics = set()
+         for s in summaries:
+             all_metrics.update(s.get("metrics", {}).keys())
+         metric_names = sorted(all_metrics)
+
+         # Header
+         header_cols = ["Run ID", "Model", "Samples"] + metric_names + ["Cost ($)"]
+         col_widths = [25, 30, 8] + [12] * len(metric_names) + [10]
+
+         header = " | ".join(
+             col.ljust(width)[:width] for col, width in zip(header_cols, col_widths)
+         )
+         print(header)
+         print("-" * len(header))
+
+         # Rows
+         for summary in summaries:
+             run_id = summary.get("_run_dir", "N/A")[:25]
+             model = summary.get("metadata", {}).get("model", "N/A")[:30]
+             samples = str(summary.get("total_samples", 0))
+             cost = summary.get("cost_usd")
+
+             row_values = [run_id, model, samples]
+
+             # Add metric values
+             for metric_name in metric_names:
+                 metric_data = summary.get("metrics", {}).get(metric_name, {})
+                 mean = metric_data.get("mean")
+                 if mean is not None:
+                     row_values.append(f"{mean:.4f}")
+                 else:
+                     row_values.append("N/A")
+
+             # Add cost
+             if cost is not None:
+                 row_values.append(f"{cost:.4f}")
+             else:
+                 row_values.append("N/A")
+
+             row = " | ".join(
+                 val.ljust(width)[:width] for val, width in zip(row_values, col_widths)
+             )
+             print(row)
+
+         print("=" * 120)
+         return 0
+
+     except Exception as e:
+         print(f"Unexpected error: {e}")
+         import traceback
+
+         traceback.print_exc()
+         return 1
+
+
+ __all__ = ["summary_command", "list_command"]
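
Both commands key off a small summary.json inside each run directory. The sketch below fabricates one run containing only the fields the code above actually reads (run_id, total_samples, metadata, metrics, cost_usd, failures, failure_rate) and then views it; real summaries written by the export code may carry more keys, and every value here is made up for illustration:

    import json
    from pathlib import Path

    from themis.cli.commands.results import list_command, summary_command

    # Write a minimal summary.json into a fake run directory (illustrative values).
    run_dir = Path(".cache/runs/run-demo")
    run_dir.mkdir(parents=True, exist_ok=True)
    (run_dir / "summary.json").write_text(json.dumps({
        "run_id": "run-demo",
        "total_samples": 100,
        "metadata": {
            "model": "demo-model",                 # illustrative model name
            "prompt_template": "zero-shot",        # illustrative template name
            "sampling": {"temperature": 0.0, "max_tokens": 512},
        },
        "metrics": {"accuracy": {"mean": 0.73, "count": 100}},
        "cost_usd": 0.42,
        "failures": 2,
        "failure_rate": 0.02,
    }))

    # Pretty-print one run, then list all runs sorted by the accuracy metric.
    summary_command(run_id="run-demo", storage=Path(".cache/runs"))
    list_command(storage=Path(".cache/runs"), sort_by="accuracy")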
themis/cli/commands/sample_run.py
@@ -0,0 +1,244 @@
+ """Sample run command for quick testing before full experiments."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Annotated
+
+ from cyclopts import Parameter
+
+ from themis.cli.commands.config_commands import run_configured_experiment
+
+
+ def sample_run_command(
+     *,
+     config: Annotated[Path, Parameter(help="Path to experiment configuration file")],
+     n: Annotated[int, Parameter(help="Number of samples to test")] = 5,
+     verbose: Annotated[bool, Parameter(help="Show detailed output")] = False,
+     show_outputs: Annotated[
+         bool, Parameter(help="Display sample outputs and predictions")
+     ] = False,
+     estimate_cost: Annotated[
+         bool, Parameter(help="Estimate full run cost based on sample")
+     ] = True,
+ ) -> int:
+     """Quick test run on N samples before running full experiment.
+
+     This command helps you:
+     - Test your configuration works correctly
+     - Preview sample outputs before full run
+     - Estimate total cost based on actual token usage
+     - Catch configuration errors early
+     - Iterate on prompts quickly
+
+     Examples:
+         # Basic quick test
+         uv run python -m themis.cli sample-run \\
+             --config my_config.yaml \\
+             --n 5
+
+         # Test with verbose output
+         uv run python -m themis.cli sample-run \\
+             --config my_config.yaml \\
+             --n 3 \\
+             --verbose \\
+             --show-outputs
+
+         # Test and estimate full run cost
+         uv run python -m themis.cli sample-run \\
+             --config my_config.yaml \\
+             --n 10 \\
+             --estimate-cost
+     """
+     try:
+         import json
+         import tempfile
+
+         from hydra import compose, initialize_config_dir
+
+         # Load config
+         config_path = Path(config).resolve()
+         if not config_path.exists():
+             print(f"Error: Config file not found: {config_path}")
+             return 1
+
+         config_dir = str(config_path.parent)
+         config_name = config_path.stem
+
+         print("=" * 80)
+         print(f"🧪 Sample Run: Testing {n} samples")
+         print("=" * 80)
+         print(f"Config: {config_path}")
+         print(f"Samples: {n}")
+         print()
+
+         # Initialize Hydra
+         with initialize_config_dir(config_dir=config_dir, version_base=None):
+             cfg = compose(config_name=config_name)
+
+             # Override dataset limit
+             original_limit = cfg.dataset.get("limit")
+             cfg.dataset.limit = n
+
+             # Use temporary storage
+             with tempfile.TemporaryDirectory() as temp_dir:
+                 cfg.storage.path = temp_dir
+
+                 # Generate temporary run_id
+                 cfg.run_id = "sample-run-temp"
+                 cfg.resume = False
+
+                 print("📋 Configuration:")
+                 print(f" Model: {cfg.generation.model_identifier}")
+                 print(f" Provider: {cfg.generation.provider.name}")
+                 print(f" Temperature: {cfg.generation.sampling.temperature}")
+                 print(f" Max tokens: {cfg.generation.sampling.max_tokens}")
+                 if hasattr(cfg.dataset, "source"):
+                     print(f" Dataset: {cfg.dataset.source}")
+                 print()
+
+                 # Run experiment on sample
+                 print("🚀 Running sample experiment...")
+                 print()
+
+                 # Redirect to capture run
+                 result = run_configured_experiment(
+                     config_path=config_path,
+                     overrides=[
+                         f"dataset.limit={n}",
+                         f"storage.path={temp_dir}",
+                         "run_id=sample-run-temp",
+                         "resume=false",
+                     ],
+                 )
+
+                 if result != 0:
+                     print("\n❌ Sample run failed")
+                     return result
+
+                 # Load results
+                 report_path = Path(temp_dir) / "sample-run-temp" / "report.json"
+                 if not report_path.exists():
+                     print("\n⚠️ No report generated")
+                     return 1
+
+                 with report_path.open("r") as f:
+                     report_data = json.load(f)
+
+                 # Display results
+                 print("\n" + "=" * 80)
+                 print("✅ Sample Run Complete")
+                 print("=" * 80)
+
+                 # Metrics
+                 metrics = report_data.get("metrics", [])
+                 if metrics:
+                     print("\n📊 Metrics:")
+                     for metric in metrics:
+                         name = metric["name"]
+                         mean = metric["mean"]
+                         count = metric["count"]
+                         print(f" {name}: {mean:.4f} (n={count})")
+
+                 # Cost analysis
+                 cost_data = report_data.get("summary", {}).get("cost")
+                 if cost_data:
+                     total_cost = cost_data.get("total_cost", 0)
+                     token_counts = cost_data.get("token_counts", {})
+                     prompt_tokens = token_counts.get("prompt_tokens", 0)
+                     completion_tokens = token_counts.get("completion_tokens", 0)
+
+                     print("\n💰 Cost (sample run):")
+                     print(f" Total: ${total_cost:.4f}")
+                     print(f" Per sample: ${total_cost / n:.6f}")
+                     print(
+                         f" Prompt tokens: {prompt_tokens} ({prompt_tokens / n:.0f} avg)"
+                     )
+                     print(
+                         f" Completion tokens: {completion_tokens} ({completion_tokens / n:.0f} avg)"
+                     )
+
+                     # Estimate full run cost
+                     if estimate_cost and original_limit:
+                         full_cost = (total_cost / n) * original_limit
+                         print("\n📈 Estimated full run cost:")
+                         print(f" Dataset size: {original_limit} samples")
+                         print(f" Estimated cost: ${full_cost:.2f}")
+                         print(
+                             f" 95% CI: ${full_cost * 0.8:.2f} - ${full_cost * 1.2:.2f}"
+                         )
+
+                         if full_cost > 10.0:
+                             print(f"\n⚠️ Warning: Estimated cost is ${full_cost:.2f}")
+                             print(" Consider using --limit for initial testing")
+
+                 # Failures
+                 failures = report_data.get("run_failures", [])
+                 eval_failures = report_data.get("evaluation_failures", [])
+                 total_failures = len(failures) + len(eval_failures)
+
+                 if total_failures > 0:
+                     print(f"\n⚠️ Failures: {total_failures}")
+                     if failures:
+                         print(f" Generation failures: {len(failures)}")
+                         if verbose:
+                             for failure in failures[:3]:
+                                 print(
+                                     f" - {failure.get('sample_id')}: {failure.get('message')}"
+                                 )
+                     if eval_failures:
+                         print(f" Evaluation failures: {len(eval_failures)}")
+
+                 # Show sample outputs
+                 if show_outputs:
+                     samples = report_data.get("samples", [])
+                     print("\n📝 Sample Outputs (showing up to 3):")
+                     for i, sample in enumerate(samples[:3], 1):
+                         sample_id = sample.get("sample_id", f"sample-{i}")
+                         scores = sample.get("scores", [])
+
+                         print(f"\n Sample {i}: {sample_id}")
+                         if scores:
+                             for score in scores:
+                                 metric_name = score.get("metric")
+                                 value = score.get("value")
+                                 print(f" {metric_name}: {value:.4f}")
+
+                 # Summary
+                 print("\n" + "=" * 80)
+                 print("✨ Next Steps:")
+                 print("=" * 80)
+
+                 if total_failures == 0 and metrics:
+                     avg_metric = metrics[0]["mean"]
+                     if avg_metric > 0.1:  # Reasonable performance
+                         print(" ✅ Configuration looks good!")
+                         print(" Run full experiment with:")
+                         print(
+                             f" uv run python -m themis.cli run-config --config {config_path}"
+                         )
+                     else:
+                         print(" ⚠️ Low performance on sample - consider:")
+                         print(" - Adjusting prompt template")
+                         print(" - Tuning temperature/max_tokens")
+                         print(" - Testing different model")
+                 else:
+                     print(" ⚠️ Issues detected:")
+                     if total_failures > 0:
+                         print(" - Fix failures before full run")
+                     if not metrics:
+                         print(" - Check evaluation metrics")
+                     print(" - Review configuration")
+
+                 return 0
+
+     except Exception as e:
+         print(f"\n❌ Error: {e}")
+         import traceback
+
+         if verbose:
+             traceback.print_exc()
+         return 1
+
+
+ __all__ = ["sample_run_command"]
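
The cost estimate at the end of sample_run_command is a straight linear extrapolation of the sampled spend; the interval it prints under the "95% CI" label is a fixed ±20% band around that point estimate, and a warning fires once the estimate exceeds $10. A worked sketch with made-up numbers:

    # Illustrative figures, not measured: a 10-sample probe that cost $0.015 total,
    # extrapolated to a dataset of 2,000 samples (cfg.dataset.limit before override).
    n = 10
    total_cost = 0.015
    original_limit = 2000

    per_sample = total_cost / n                   # $0.0015 per sample
    full_cost = per_sample * original_limit       # $3.00 point estimate
    low, high = full_cost * 0.8, full_cost * 1.2  # the printed +/-20% band
    print(f"Estimated cost: ${full_cost:.2f} (band ${low:.2f} - ${high:.2f})")
    # sample_run_command would not warn here, since full_cost <= 10.0.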