themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,207 @@
"""Multiple-choice question benchmark commands."""

from __future__ import annotations

from pathlib import Path
from typing import Annotated, Callable, Literal, Sequence

from cyclopts import Parameter

from themis.cli.utils import effective_total, export_outputs
from themis.datasets import (
    mmlu_pro as mmlu_pro_dataset,
)
from themis.datasets import (
    super_gpqa as super_gpqa_dataset,
)
from themis.experiment import mcq as mcq_experiment
from themis.experiment import storage as experiment_storage
from themis.utils.logging_utils import configure_logging
from themis.utils.progress import ProgressReporter


def load_multiple_choice_dataset(
    *,
    loader: Callable[..., Sequence],
    source: Literal["huggingface", "local"],
    data_dir: Path | None,
    split: str,
    limit: int | None,
    subjects: Sequence[str] | None,
):
    """Load multiple choice dataset.

    Args:
        loader: Dataset loader function
        source: Dataset source
        data_dir: Directory containing local dataset
        split: Dataset split
        limit: Max rows to load
        subjects: Subjects to filter

    Returns:
        List of generation examples
    """
    if source == "local" and data_dir is None:
        raise ValueError(
            "The --data-dir option is required when --source=local so Themis "
            "knows where to read the dataset."
        )
    samples = loader(
        source=source,
        data_dir=data_dir,
        split=split,
        limit=limit,
        subjects=subjects,
    )
    return [sample.to_generation_example() for sample in samples]


def supergpqa_command(
    *,
    source: Annotated[
        Literal["huggingface", "local"], Parameter(help="Dataset source")
    ] = "huggingface",
    split: Annotated[str, Parameter(help="Dataset split to load")] = "test",
    data_dir: Annotated[
        Path | None, Parameter(help="Directory containing local dataset")
    ] = None,
    limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
    subjects: Annotated[
        tuple[str, ...], Parameter(help="Subjects or categories to filter")
    ] = (),
    max_samples: Annotated[int | None, Parameter(help="Maximum samples to run")] = None,
    storage: Annotated[
        Path | None, Parameter(help="Cache directory for datasets/results")
    ] = None,
    run_id: Annotated[str | None, Parameter(help="Identifier for cached run")] = None,
    resume: Annotated[
        bool, Parameter(help="Reuse cached generations when storage is set")
    ] = True,
    temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
    log_level: Annotated[
        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
    ] = "info",
    csv_output: Annotated[
        Path | None, Parameter(help="Write CSV export to this path")
    ] = None,
    html_output: Annotated[
        Path | None, Parameter(help="Write HTML summary to this path")
    ] = None,
    json_output: Annotated[
        Path | None, Parameter(help="Write JSON export to this path")
    ] = None,
) -> int:
    """Run the SuperGPQA multiple-choice evaluation."""
    configure_logging(log_level)
    subject_filter = list(subjects) if subjects else None
    rows = load_multiple_choice_dataset(
        loader=super_gpqa_dataset.load_super_gpqa,
        source=source,
        data_dir=data_dir,
        split=split,
        limit=limit,
        subjects=subject_filter,
    )

    storage_impl = experiment_storage.ExperimentStorage(storage) if storage else None
    experiment = mcq_experiment.build_multiple_choice_json_experiment(
        dataset_name="supergpqa",
        task_id="supergpqa",
        temperature=temperature,
        storage=storage_impl,
    )

    total = effective_total(len(rows), max_samples)
    with ProgressReporter(total=total, description="Generating") as progress:
        report = experiment.run(
            rows,
            max_samples=max_samples,
            run_id=run_id,
            resume=resume,
            on_result=progress.on_result,
        )
    print(mcq_experiment.summarize_report(report))
    export_outputs(
        report,
        csv_output=csv_output,
        html_output=html_output,
        json_output=json_output,
        title="supergpqa experiment",
    )
    return 0


def mmlu_pro_command(
    *,
    source: Annotated[
        Literal["huggingface", "local"], Parameter(help="Dataset source")
    ] = "huggingface",
    split: Annotated[str, Parameter(help="Dataset split to load")] = "test",
    data_dir: Annotated[
        Path | None, Parameter(help="Directory containing local dataset")
    ] = None,
    limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
    subjects: Annotated[
        tuple[str, ...], Parameter(help="Subjects or categories to filter")
    ] = (),
    max_samples: Annotated[int | None, Parameter(help="Maximum samples to run")] = None,
    storage: Annotated[
        Path | None, Parameter(help="Cache directory for datasets/results")
    ] = None,
    run_id: Annotated[str | None, Parameter(help="Identifier for cached run")] = None,
    resume: Annotated[
        bool, Parameter(help="Reuse cached generations when storage is set")
    ] = True,
    temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
    log_level: Annotated[
        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
    ] = "info",
    csv_output: Annotated[
        Path | None, Parameter(help="Write CSV export to this path")
    ] = None,
    html_output: Annotated[
        Path | None, Parameter(help="Write HTML summary to this path")
    ] = None,
    json_output: Annotated[
        Path | None, Parameter(help="Write JSON export to this path")
    ] = None,
) -> int:
    """Run the MMLU-Pro multiple-choice evaluation."""
    configure_logging(log_level)
    subject_filter = list(subjects) if subjects else None
    rows = load_multiple_choice_dataset(
        loader=mmlu_pro_dataset.load_mmlu_pro,
        source=source,
        data_dir=data_dir,
        split=split,
        limit=limit,
        subjects=subject_filter,
    )

    storage_impl = experiment_storage.ExperimentStorage(storage) if storage else None
    experiment = mcq_experiment.build_multiple_choice_json_experiment(
        dataset_name="mmlu-pro",
        task_id="mmlu_pro",
        temperature=temperature,
        storage=storage_impl,
    )

    total = effective_total(len(rows), max_samples)
    with ProgressReporter(total=total, description="Generating") as progress:
        report = experiment.run(
            rows,
            max_samples=max_samples,
            run_id=run_id,
            resume=resume,
            on_result=progress.on_result,
        )
    print(mcq_experiment.summarize_report(report))
    export_outputs(
        report,
        csv_output=csv_output,
        html_output=html_output,
        json_output=json_output,
        title="mmlu_pro experiment",
    )
    return 0
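The hunk above appears to be themis/cli/commands/mcq_benchmarks.py (the listing's +207 count matches), where both commands share load_multiple_choice_dataset and differ only in the loader and dataset/task identifiers. As a hedged sketch of calling one of them programmatically rather than through the CLI -- the import path is inferred from the listing, and every argument value below is illustrative, not a package default:

    from pathlib import Path

    # Assumed module path, inferred from the file listing above.
    from themis.cli.commands.mcq_benchmarks import supergpqa_command

    # Hypothetical smoke test: 20 SuperGPQA rows, cached under .cache/runs,
    # with a CSV export. Only parameters shown in the signature above are used.
    exit_code = supergpqa_command(
        limit=20,
        max_samples=20,
        storage=Path(".cache/runs"),
        run_id="supergpqa-smoke",
        csv_output=Path("supergpqa-smoke.csv"),
    )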
@@ -0,0 +1,252 @@
"""Quick results viewing commands for experiment summaries."""

from __future__ import annotations

import json
from pathlib import Path
from typing import Annotated

from cyclopts import Parameter


def summary_command(
    *,
    run_id: Annotated[
        str,
        Parameter(
            help="Run ID to view summary for",
        ),
    ],
    storage: Annotated[
        Path,
        Parameter(
            help="Storage directory containing experiment results",
        ),
    ] = Path(".cache/runs"),
) -> int:
    """View quick summary of a single experiment run.

    This command reads the lightweight summary.json file (~1KB) instead of
    the full report.json (~1.6MB), making it much faster for quick checks.

    Examples:
        # View summary for a specific run
        uv run python -m themis.cli results summary \\
            --run-id run-20260118-032014 \\
            --storage outputs/evaluation

        # Quick check of latest run
        uv run python -m themis.cli results summary \\
            --run-id $(ls -t outputs/evaluation | head -1)
    """
    try:
        # Try to find summary.json
        run_dir = storage / run_id
        summary_path = run_dir / "summary.json"

        if not summary_path.exists():
            print(f"Error: Summary file not found at {summary_path}")
            print("\nNote: summary.json is only available for runs created with")
            print("the updated export functionality. For older runs, use the")
            print("'compare' command which reads full report.json files.")
            return 1

        # Load summary
        with summary_path.open("r", encoding="utf-8") as f:
            summary = json.load(f)

        # Display summary
        print("=" * 80)
        print(f"Experiment Summary: {run_id}")
        print("=" * 80)

        # Basic info
        print(f"\nRun ID: {summary.get('run_id', 'N/A')}")
        print(f"Total Samples: {summary.get('total_samples', 0)}")

        # Metadata
        metadata = summary.get("metadata", {})
        if metadata:
            print("\nConfiguration:")
            print(f"  Model: {metadata.get('model', 'N/A')}")
            print(f"  Prompt: {metadata.get('prompt_template', 'N/A')}")
            sampling = metadata.get("sampling", {})
            if sampling:
                print(f"  Temperature: {sampling.get('temperature', 'N/A')}")
                print(f"  Max Tokens: {sampling.get('max_tokens', 'N/A')}")

        # Metrics
        metrics = summary.get("metrics", {})
        if metrics:
            print("\nMetrics:")
            for name, data in metrics.items():
                mean = data.get("mean", 0)
                count = data.get("count", 0)
                print(f"  {name}: {mean:.4f} (n={count})")

        # Cost
        cost = summary.get("cost_usd")
        if cost is not None:
            print(f"\nCost: ${cost:.4f}")

        # Failures
        failures = summary.get("failures", 0)
        failure_rate = summary.get("failure_rate", 0)
        if failures > 0:
            print(f"\nFailures: {failures} ({failure_rate:.2%})")

        print("\n" + "=" * 80)
        return 0

    except FileNotFoundError:
        print(f"Error: Run directory not found: {run_dir}")
        return 1
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in summary file: {e}")
        return 1
    except Exception as e:
        print(f"Unexpected error: {e}")
        import traceback

        traceback.print_exc()
        return 1


def list_command(
    *,
    storage: Annotated[
        Path,
        Parameter(
            help="Storage directory containing experiment results",
        ),
    ] = Path(".cache/runs"),
    limit: Annotated[
        int | None,
        Parameter(
            help="Maximum number of runs to display",
        ),
    ] = None,
    sort_by: Annotated[
        str,
        Parameter(
            help="Sort runs by: time (newest first) or metric name",
        ),
    ] = "time",
) -> int:
    """List all experiment runs with quick summaries.

    This command scans for summary.json files and displays a table of all runs.
    Much faster than loading full report.json files.

    Examples:
        # List all runs
        uv run python -m themis.cli results list

        # List 10 most recent runs
        uv run python -m themis.cli results list --limit 10

        # List runs sorted by accuracy
        uv run python -m themis.cli results list --sort-by accuracy
    """
    try:
        if not storage.exists():
            print(f"Error: Storage directory not found: {storage}")
            return 1

        # Find all summary.json files
        summaries = []
        for run_dir in storage.iterdir():
            if not run_dir.is_dir():
                continue
            summary_path = run_dir / "summary.json"
            if summary_path.exists():
                try:
                    with summary_path.open("r", encoding="utf-8") as f:
                        summary = json.load(f)
                    summary["_run_dir"] = run_dir.name
                    summary["_mtime"] = summary_path.stat().st_mtime
                    summaries.append(summary)
                except Exception:
                    continue

        if not summaries:
            print(f"No experiment runs found in {storage}")
            print("\nNote: Only runs with summary.json files are shown.")
            return 0

        # Sort summaries
        if sort_by == "time":
            summaries.sort(key=lambda s: s.get("_mtime", 0), reverse=True)
        else:
            # Sort by metric value
            summaries.sort(
                key=lambda s: s.get("metrics", {}).get(sort_by, {}).get("mean", 0),
                reverse=True,
            )

        # Apply limit
        if limit:
            summaries = summaries[:limit]

        # Display table
        print("=" * 120)
        print(f"Found {len(summaries)} experiment run(s)")
        print("=" * 120)

        # Collect all metric names
        all_metrics = set()
        for s in summaries:
            all_metrics.update(s.get("metrics", {}).keys())
        metric_names = sorted(all_metrics)

        # Header
        header_cols = ["Run ID", "Model", "Samples"] + metric_names + ["Cost ($)"]
        col_widths = [25, 30, 8] + [12] * len(metric_names) + [10]

        header = " | ".join(
            col.ljust(width)[:width] for col, width in zip(header_cols, col_widths)
        )
        print(header)
        print("-" * len(header))

        # Rows
        for summary in summaries:
            run_id = summary.get("_run_dir", "N/A")[:25]
            model = summary.get("metadata", {}).get("model", "N/A")[:30]
            samples = str(summary.get("total_samples", 0))
            cost = summary.get("cost_usd")

            row_values = [run_id, model, samples]

            # Add metric values
            for metric_name in metric_names:
                metric_data = summary.get("metrics", {}).get(metric_name, {})
                mean = metric_data.get("mean")
                if mean is not None:
                    row_values.append(f"{mean:.4f}")
                else:
                    row_values.append("N/A")

            # Add cost
            if cost is not None:
                row_values.append(f"{cost:.4f}")
            else:
                row_values.append("N/A")

            row = " | ".join(
                val.ljust(width)[:width] for val, width in zip(row_values, col_widths)
            )
            print(row)

        print("=" * 120)
        return 0

    except Exception as e:
        print(f"Unexpected error: {e}")
        import traceback

        traceback.print_exc()
        return 1


__all__ = ["summary_command", "list_command"]
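This hunk looks like themis/cli/commands/results.py (the listing's +252 count matches, and __all__ names the two commands). A rough sketch of driving the same functions from Python, for orientation only -- the import path is inferred, and the run id and storage path are the illustrative values from the docstring examples above:

    from pathlib import Path

    # Assumed module path, inferred from the file listing above.
    from themis.cli.commands.results import list_command, summary_command

    # Show the five most recent runs, then inspect one by id.
    list_command(storage=Path("outputs/evaluation"), limit=5)
    summary_command(run_id="run-20260118-032014", storage=Path("outputs/evaluation"))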
@@ -0,0 +1,244 @@
"""Sample run command for quick testing before full experiments."""

from __future__ import annotations

from pathlib import Path
from typing import Annotated

from cyclopts import Parameter

from themis.cli.commands.config_commands import run_configured_experiment


def sample_run_command(
    *,
    config: Annotated[Path, Parameter(help="Path to experiment configuration file")],
    n: Annotated[int, Parameter(help="Number of samples to test")] = 5,
    verbose: Annotated[bool, Parameter(help="Show detailed output")] = False,
    show_outputs: Annotated[
        bool, Parameter(help="Display sample outputs and predictions")
    ] = False,
    estimate_cost: Annotated[
        bool, Parameter(help="Estimate full run cost based on sample")
    ] = True,
) -> int:
    """Quick test run on N samples before running full experiment.

    This command helps you:
    - Test your configuration works correctly
    - Preview sample outputs before full run
    - Estimate total cost based on actual token usage
    - Catch configuration errors early
    - Iterate on prompts quickly

    Examples:
        # Basic quick test
        uv run python -m themis.cli sample-run \\
            --config my_config.yaml \\
            --n 5

        # Test with verbose output
        uv run python -m themis.cli sample-run \\
            --config my_config.yaml \\
            --n 3 \\
            --verbose \\
            --show-outputs

        # Test and estimate full run cost
        uv run python -m themis.cli sample-run \\
            --config my_config.yaml \\
            --n 10 \\
            --estimate-cost
    """
    try:
        import json
        import tempfile

        from hydra import compose, initialize_config_dir

        # Load config
        config_path = Path(config).resolve()
        if not config_path.exists():
            print(f"Error: Config file not found: {config_path}")
            return 1

        config_dir = str(config_path.parent)
        config_name = config_path.stem

        print("=" * 80)
        print(f"🧪 Sample Run: Testing {n} samples")
        print("=" * 80)
        print(f"Config: {config_path}")
        print(f"Samples: {n}")
        print()

        # Initialize Hydra
        with initialize_config_dir(config_dir=config_dir, version_base=None):
            cfg = compose(config_name=config_name)

            # Override dataset limit
            original_limit = cfg.dataset.get("limit")
            cfg.dataset.limit = n

            # Use temporary storage
            with tempfile.TemporaryDirectory() as temp_dir:
                cfg.storage.path = temp_dir

                # Generate temporary run_id
                cfg.run_id = "sample-run-temp"
                cfg.resume = False

                print("Configuration:")
                print(f"  Model: {cfg.generation.model_identifier}")
                print(f"  Provider: {cfg.generation.provider.name}")
                print(f"  Temperature: {cfg.generation.sampling.temperature}")
                print(f"  Max tokens: {cfg.generation.sampling.max_tokens}")
                if hasattr(cfg.dataset, "source"):
                    print(f"  Dataset: {cfg.dataset.source}")
                print()

                # Run experiment on sample
                print("Running sample experiment...")
                print()

                # Redirect to capture run
                result = run_configured_experiment(
                    config_path=config_path,
                    overrides=[
                        f"dataset.limit={n}",
                        f"storage.path={temp_dir}",
                        "run_id=sample-run-temp",
                        "resume=false",
                    ],
                )

                if result != 0:
                    print("\nSample run failed")
                    return result

                # Load results
                report_path = Path(temp_dir) / "sample-run-temp" / "report.json"
                if not report_path.exists():
                    print("\n⚠️ No report generated")
                    return 1

                with report_path.open("r") as f:
                    report_data = json.load(f)

                # Display results
                print("\n" + "=" * 80)
                print("Sample Run Complete")
                print("=" * 80)

                # Metrics
                metrics = report_data.get("metrics", [])
                if metrics:
                    print("\nMetrics:")
                    for metric in metrics:
                        name = metric["name"]
                        mean = metric["mean"]
                        count = metric["count"]
                        print(f"  {name}: {mean:.4f} (n={count})")

                # Cost analysis
                cost_data = report_data.get("summary", {}).get("cost")
                if cost_data:
                    total_cost = cost_data.get("total_cost", 0)
                    token_counts = cost_data.get("token_counts", {})
                    prompt_tokens = token_counts.get("prompt_tokens", 0)
                    completion_tokens = token_counts.get("completion_tokens", 0)

                    print("\n💰 Cost (sample run):")
                    print(f"  Total: ${total_cost:.4f}")
                    print(f"  Per sample: ${total_cost / n:.6f}")
                    print(
                        f"  Prompt tokens: {prompt_tokens} ({prompt_tokens / n:.0f} avg)"
                    )
                    print(
                        f"  Completion tokens: {completion_tokens} ({completion_tokens / n:.0f} avg)"
                    )

                    # Estimate full run cost
                    if estimate_cost and original_limit:
                        full_cost = (total_cost / n) * original_limit
                        print("\nEstimated full run cost:")
                        print(f"  Dataset size: {original_limit} samples")
                        print(f"  Estimated cost: ${full_cost:.2f}")
                        print(
                            f"  95% CI: ${full_cost * 0.8:.2f} - ${full_cost * 1.2:.2f}"
                        )

                        if full_cost > 10.0:
                            print(f"\n⚠️ Warning: Estimated cost is ${full_cost:.2f}")
                            print("  Consider using --limit for initial testing")

                # Failures
                failures = report_data.get("run_failures", [])
                eval_failures = report_data.get("evaluation_failures", [])
                total_failures = len(failures) + len(eval_failures)

                if total_failures > 0:
                    print(f"\n⚠️ Failures: {total_failures}")
                    if failures:
                        print(f"  Generation failures: {len(failures)}")
                        if verbose:
                            for failure in failures[:3]:
                                print(
                                    f"    - {failure.get('sample_id')}: {failure.get('message')}"
                                )
                    if eval_failures:
                        print(f"  Evaluation failures: {len(eval_failures)}")

                # Show sample outputs
                if show_outputs:
                    samples = report_data.get("samples", [])
                    print("\nSample Outputs (showing up to 3):")
                    for i, sample in enumerate(samples[:3], 1):
                        sample_id = sample.get("sample_id", f"sample-{i}")
                        scores = sample.get("scores", [])

                        print(f"\n  Sample {i}: {sample_id}")
                        if scores:
                            for score in scores:
                                metric_name = score.get("metric")
                                value = score.get("value")
                                print(f"    {metric_name}: {value:.4f}")

                # Summary
                print("\n" + "=" * 80)
                print("Next Steps:")
                print("=" * 80)

                if total_failures == 0 and metrics:
                    avg_metric = metrics[0]["mean"]
                    if avg_metric > 0.1:  # Reasonable performance
                        print("  Configuration looks good!")
                        print("  Run full experiment with:")
                        print(
                            f"    uv run python -m themis.cli run-config --config {config_path}"
                        )
                    else:
                        print("  ⚠️ Low performance on sample - consider:")
                        print("    - Adjusting prompt template")
                        print("    - Tuning temperature/max_tokens")
                        print("    - Testing different model")
                else:
                    print("  ⚠️ Issues detected:")
                    if total_failures > 0:
                        print("    - Fix failures before full run")
                    if not metrics:
                        print("    - Check evaluation metrics")
                    print("    - Review configuration")

        return 0

    except Exception as e:
        print(f"\nError: {e}")
        import traceback

        if verbose:
            traceback.print_exc()
        return 1


__all__ = ["sample_run_command"]
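This last hunk appears to be themis/cli/commands/sample_run.py (the __all__ entry and the imports match the listing). A hedged sketch of invoking the same flow from Python instead of the CLI -- the import path is inferred, and my_config.yaml is the placeholder path used in the docstring examples, not a file shipped in the wheel:

    from pathlib import Path

    # Assumed module path, inferred from the file listing above.
    from themis.cli.commands.sample_run import sample_run_command

    # Hypothetical dry run: 5 samples, show per-sample scores, and project
    # the full-run cost from the sampled token usage.
    exit_code = sample_run_command(
        config=Path("my_config.yaml"),
        n=5,
        show_outputs=True,
        estimate_cost=True,
    )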