themis-eval 0.1.0-py3-none-any.whl → 0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +93 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +164 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +288 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +129 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +690 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +373 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +255 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +61 -0
- themis/integrations/wandb.py +65 -0
- themis/interfaces/__init__.py +83 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
- themis_eval-0.1.1.dist-info/RECORD +134 -0
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/cli/__init__.py
ADDED
themis/cli/__main__.py
ADDED
themis/cli/commands/__init__.py
ADDED
@@ -0,0 +1,19 @@
+"""CLI command modules."""
+
+from themis.cli.commands import (
+    benchmarks,
+    config_commands,
+    demo,
+    info,
+    math_benchmarks,
+    mcq_benchmarks,
+)
+
+__all__ = [
+    "benchmarks",
+    "config_commands",
+    "demo",
+    "info",
+    "math_benchmarks",
+    "mcq_benchmarks",
+]
themis/cli/commands/benchmarks.py
ADDED
@@ -0,0 +1,221 @@
+"""Benchmark listing commands."""
+
+from __future__ import annotations
+
+from typing import Annotated
+
+from cyclopts import Parameter
+
+from themis.providers.registry import _REGISTRY
+
+
+def list_providers(
+    *,
+    verbose: Annotated[
+        bool, Parameter(help="Show detailed provider information")
+    ] = False,
+) -> int:
+    """List available LLM providers."""
+    providers = sorted(_REGISTRY._factories.keys())
+
+    if not providers:
+        print("No providers registered.")
+        return 0
+
+    print("Available Providers:")
+    print("=" * 60)
+
+    provider_info = {
+        "fake": "Built-in fake provider for testing (no API required)",
+        "openai-compatible": "OpenAI-compatible API (LM Studio, Ollama, vLLM, OpenAI)",
+        "vllm": "vLLM server provider for local model hosting",
+    }
+
+    for provider in providers:
+        status = "✓" if provider in provider_info else "·"
+        print(f"{status} {provider}")
+        if verbose and provider in provider_info:
+            print(f" {provider_info[provider]}")
+
+    if not verbose:
+        print("\nUse --verbose for more details")
+
+    return 0
+
+
+def list_benchmarks(
+    *,
+    verbose: Annotated[
+        bool, Parameter(help="Show detailed benchmark information")
+    ] = False,
+) -> int:
+    """List available datasets and benchmarks."""
+    benchmarks = [
+        {
+            "name": "math500",
+            "description": "MATH-500 dataset for mathematical reasoning",
+            "source": "huggingface (default) or local",
+            "subjects": [
+                "algebra",
+                "counting_and_probability",
+                "geometry",
+                "intermediate_algebra",
+                "number_theory",
+                "prealgebra",
+                "precalculus",
+            ],
+            "command": "uv run python -m themis.cli math500",
+        },
+        {
+            "name": "gsm8k",
+            "description": "GSM8K dataset for grade school math word problems",
+            "source": "huggingface (default) or local",
+            "subjects": "math",
+            "command": "uv run python -m themis.cli gsm8k",
+        },
+        {
+            "name": "gpqa",
+            "description": "GPQA dataset for graduate-level science questions",
+            "source": "huggingface (default) or local",
+            "subjects": "science",
+            "command": "uv run python -m themis.cli gpqa",
+        },
+        {
+            "name": "gsm-symbolic",
+            "description": "GSM-Symbolic dataset for symbolic math reasoning",
+            "source": "huggingface (default) or local",
+            "subjects": "math",
+            "command": "uv run python -m themis.cli gsm-symbolic",
+        },
+        {
+            "name": "medmcqa",
+            "description": "MedMCQA dataset for medical entrance exams",
+            "source": "huggingface (default) or local",
+            "subjects": "medicine",
+            "command": "uv run python -m themis.cli medmcqa",
+        },
+        {
+            "name": "med_qa",
+            "description": "MedQA dataset for medical question answering",
+            "source": "huggingface (default) or local",
+            "subjects": "medicine",
+            "command": "uv run python -m themis.cli med_qa",
+        },
+        {
+            "name": "sciq",
+            "description": "SciQ dataset for science questions",
+            "source": "huggingface (default) or local",
+            "subjects": "science",
+            "command": "uv run python -m themis.cli sciq",
+        },
+        {
+            "name": "commonsense_qa",
+            "description": "CommonsenseQA dataset for commonsense reasoning",
+            "source": "huggingface (default) or local",
+            "subjects": "commonsense",
+            "command": "uv run python -m themis.cli commonsense_qa",
+        },
+        {
+            "name": "piqa",
+            "description": "PIQA dataset for physical commonsense reasoning",
+            "source": "huggingface (default) or local",
+            "subjects": "commonsense",
+            "command": "uv run python -m themis.cli piqa",
+        },
+        {
+            "name": "social_i_qa",
+            "description": "Social IQA dataset for social commonsense reasoning",
+            "source": "huggingface (default) or local",
+            "subjects": "commonsense",
+            "command": "uv run python -m themis.cli social_i_qa",
+        },
+        {
+            "name": "coqa",
+            "description": "CoQA dataset for conversational question answering",
+            "source": "huggingface (default) or local",
+            "subjects": "conversational",
+            "command": "uv run python -m themis.cli coqa",
+        },
+        {
+            "name": "supergpqa",
+            "description": "Graduate-level QA benchmark with multiple-choice questions",
+            "source": "huggingface (default) or local",
+            "subjects": "category filter via --subjects",
+            "command": "uv run python -m themis.cli supergpqa",
+        },
+        {
+            "name": "mmlu-pro",
+            "description": "Professional-level MMLU benchmark with refined distractors",
+            "source": "huggingface (default) or local",
+            "subjects": "subject filter via --subjects",
+            "command": "uv run python -m themis.cli mmlu-pro",
+        },
+        {
+            "name": "aime24",
+            "description": "AIME 2024 competition problems",
+            "source": "huggingface (default) or local",
+            "subjects": "problem set",
+            "command": "uv run python -m themis.cli aime24",
+        },
+        {
+            "name": "aime25",
+            "description": "AIME 2025 competition problems",
+            "source": "huggingface (default) or local",
+            "subjects": "problem set",
+            "command": "uv run python -m themis.cli aime25",
+        },
+        {
+            "name": "amc23",
+            "description": "AMC 2023 competition problems",
+            "source": "huggingface (default) or local",
+            "subjects": "problem set",
+            "command": "uv run python -m themis.cli amc23",
+        },
+        {
+            "name": "olympiadbench",
+            "description": "Mixed Olympiad-style math benchmark",
+            "source": "huggingface (default) or local",
+            "subjects": "competition metadata",
+            "command": "uv run python -m themis.cli olympiadbench",
+        },
+        {
+            "name": "beyondaime",
+            "description": "BeyondAIME advanced math competition set",
+            "source": "huggingface (default) or local",
+            "subjects": "problem set",
+            "command": "uv run python -m themis.cli beyondaime",
+        },
+        {
+            "name": "demo",
+            "description": "Built-in demo with 2 math problems",
+            "source": "inline",
+            "subjects": ["precalculus", "arithmetic"],
+            "command": "uv run python -m themis.cli demo",
+        },
+        {
+            "name": "inline",
+            "description": "Custom inline dataset (via config file)",
+            "source": "config file",
+            "subjects": "user-defined",
+            "command": "uv run python -m themis.cli run-config --config your_config.yaml",
+        },
+    ]
+
+    print("Available Datasets & Benchmarks:")
+    print("=" * 60)
+
+    for bench in benchmarks:
+        print(f"\n📊 {bench['name']}")
+        print(f" {bench['description']}")
+        if verbose:
+            print(f" Source: {bench['source']}")
+            if isinstance(bench["subjects"], list):
+                print(f" Subjects: {', '.join(bench['subjects'])}")
+            else:
+                print(f" Subjects: {bench['subjects']}")
+            print(f" Command: {bench['command']}")
+
+    if not verbose:
+        print("\nUse --verbose for more details and example commands")
+
+    return 0
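Note: the listing commands in benchmarks.py above are plain keyword-only functions that return exit codes, so they can be exercised without the CLI wrapper. A minimal sketch, assuming the 0.1.1 wheel is installed; it only uses names visible in the diff:

    from themis.cli.commands.benchmarks import list_benchmarks, list_providers

    # Print the registered providers, then the benchmark catalog with sources and commands
    list_providers(verbose=True)
    list_benchmarks(verbose=True)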
themis/cli/commands/comparison.py
ADDED
@@ -0,0 +1,394 @@
+"""Multi-experiment comparison commands."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Annotated
+
+from cyclopts import Parameter
+
+from themis.experiment.comparison import compare_experiments, diff_configs
+
+
+def compare_command(
+    *,
+    run_ids: Annotated[
+        list[str],
+        Parameter(
+            help="Run IDs to compare (comma-separated or multiple --run-ids)",
+        ),
+    ],
+    storage: Annotated[
+        Path,
+        Parameter(
+            help="Storage directory containing experiment results",
+        ),
+    ] = Path(".cache/runs"),
+    metrics: Annotated[
+        list[str] | None,
+        Parameter(
+            help="Metrics to compare (default: all available)",
+        ),
+    ] = None,
+    output: Annotated[
+        Path | None,
+        Parameter(
+            help="Output file path (format inferred from extension: .csv, .md, .json)",
+        ),
+    ] = None,
+    format: Annotated[
+        str,
+        Parameter(
+            help="Output format: csv, markdown, json, latex",
+        ),
+    ] = "markdown",
+    highlight_best: Annotated[
+        str | None,
+        Parameter(
+            help="Metric to highlight best performer (e.g., 'accuracy')",
+        ),
+    ] = None,
+) -> int:
+    """Compare multiple experiment runs.
+
+    Automatically includes cost data when available. Costs are tracked
+    automatically during experiment runs and displayed in comparisons.
+
+    Examples:
+        # Compare three runs with default metrics (includes cost if tracked)
+        uv run python -m themis.cli compare \\
+            --run-ids run-1 run-2 run-3 \\
+            --storage .cache/runs
+
+        # Compare with specific metrics, export to CSV
+        uv run python -m themis.cli compare \\
+            --run-ids run-1 run-2 run-3 \\
+            --metrics accuracy \\
+            --output comparison.csv
+
+        # Use 'cost' as a metric for ranking and Pareto analysis
+        uv run python -m themis.cli pareto \\
+            --run-ids run-1 run-2 run-3 \\
+            --objectives accuracy cost \\
+            --maximize true false
+
+        # Highlight best accuracy performer
+        uv run python -m themis.cli compare \\
+            --run-ids run-1 run-2 run-3 \\
+            --highlight-best accuracy
+    """
+    try:
+        # Load and compare experiments
+        print(f"Loading experiments from {storage}...")
+        comparison = compare_experiments(
+            run_ids=run_ids,
+            storage_dir=storage,
+            metrics=metrics,
+            include_metadata=True,
+        )
+
+        print(f"\n✓ Loaded {len(comparison.experiments)} experiments")
+        print(f" Metrics: {', '.join(comparison.metrics)}\n")
+
+        # Display comparison table
+        print("=" * 80)
+        print("Experiment Comparison")
+        print("=" * 80)
+
+        # Check if any experiment has cost data
+        has_cost = any(exp.get_cost() is not None for exp in comparison.experiments)
+
+        # Header
+        header_cols = ["Run ID"] + comparison.metrics + ["Samples", "Failures"]
+        if has_cost:
+            header_cols.append("Cost ($)")
+        col_widths = [max(20, len(col)) for col in header_cols]
+
+        header = " | ".join(
+            col.ljust(width) for col, width in zip(header_cols, col_widths)
+        )
+        print(header)
+        print("-" * len(header))
+
+        # Rows
+        for exp in comparison.experiments:
+            row_values = [exp.run_id[:20]]  # Truncate run ID
+            for metric in comparison.metrics:
+                val = exp.get_metric(metric)
+                row_values.append(f"{val:.4f}" if val is not None else "N/A")
+            row_values.append(str(exp.sample_count))
+            row_values.append(str(exp.failure_count))
+
+            # Add cost if available
+            if has_cost:
+                cost = exp.get_cost()
+                row_values.append(f"{cost:.4f}" if cost is not None else "N/A")
+
+            row = " | ".join(
+                val.ljust(width) for val, width in zip(row_values, col_widths)
+            )
+            print(row)
+
+        print("=" * 80)
+
+        # Highlight best if requested
+        if highlight_best:
+            if highlight_best in comparison.metrics:
+                best = comparison.highlight_best(highlight_best)
+                if best:
+                    best_value = best.get_metric(highlight_best)
+                    print(
+                        f"\n⭐ Best {highlight_best}: {best.run_id} ({best_value:.4f})"
+                    )
+                else:
+                    print(f"\n⚠️ No valid values for metric '{highlight_best}'")
+            else:
+                print(
+                    f"\n⚠️ Metric '{highlight_best}' not found. Available: {comparison.metrics}"
+                )
+
+        # Export if requested
+        if output:
+            output = Path(output)
+            # Infer format from extension if not specified
+            if output.suffix == ".csv":
+                comparison.to_csv(output)
+                print(f"\n✓ Exported to {output} (CSV)")
+            elif output.suffix == ".md":
+                comparison.to_markdown(output)
+                print(f"\n✓ Exported to {output} (Markdown)")
+            elif output.suffix == ".json":
+                import json
+
+                output.write_text(
+                    json.dumps(comparison.to_dict(), indent=2), encoding="utf-8"
+                )
+                print(f"\n✓ Exported to {output} (JSON)")
+            elif output.suffix == ".tex":
+                comparison.to_latex(output, style="booktabs")
+                print(f"\n✓ Exported to {output} (LaTeX)")
+            else:
+                # Use specified format
+                if format == "csv":
+                    comparison.to_csv(output)
+                    print(f"\n✓ Exported to {output} (CSV)")
+                elif format == "markdown":
+                    comparison.to_markdown(output)
+                    print(f"\n✓ Exported to {output} (Markdown)")
+                elif format == "json":
+                    import json
+
+                    output.write_text(
+                        json.dumps(comparison.to_dict(), indent=2), encoding="utf-8"
+                    )
+                    print(f"\n✓ Exported to {output} (JSON)")
+                elif format == "latex":
+                    comparison.to_latex(output, style="booktabs")
+                    print(f"\n✓ Exported to {output} (LaTeX)")
+                else:
+                    print(f"\n⚠️ Unknown format: {format}")
+                    print("Available formats: csv, markdown, json, latex")
+                    return 1
+
+        return 0
+
+    except ValueError as e:
+        print(f"Error: {e}")
+        return 1
+    except FileNotFoundError as e:
+        print(f"Error: {e}")
+        return 1
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+def diff_command(
+    *,
+    run_id_a: Annotated[
+        str,
+        Parameter(
+            help="First run ID",
+        ),
+    ],
+    run_id_b: Annotated[
+        str,
+        Parameter(
+            help="Second run ID",
+        ),
+    ],
+    storage: Annotated[
+        Path,
+        Parameter(
+            help="Storage directory containing experiment results",
+        ),
+    ] = Path(".cache/runs"),
+) -> int:
+    """Show configuration differences between two experiment runs.
+
+    Examples:
+        # Compare configurations
+        uv run python -m themis.cli diff \\
+            --run-id-a run-1 \\
+            --run-id-b run-2 \\
+            --storage .cache/runs
+    """
+    try:
+        diff = diff_configs(run_id_a, run_id_b, storage)
+
+        print("=" * 80)
+        print(f"Configuration Diff: {run_id_a} → {run_id_b}")
+        print("=" * 80)
+
+        if not diff.has_differences():
+            print("\n✓ No differences found - configurations are identical\n")
+            return 0
+
+        # Show changed fields
+        if diff.changed_fields:
+            print("\n📝 Changed Fields:")
+            for key, (old, new) in diff.changed_fields.items():
+                print(f"\n {key}:")
+                print(f" - {run_id_a}: {old}")
+                print(f" + {run_id_b}: {new}")
+
+        # Show added fields
+        if diff.added_fields:
+            print("\n➕ Added Fields (in run_id_b):")
+            for key, value in diff.added_fields.items():
+                print(f" {key}: {value}")
+
+        # Show removed fields
+        if diff.removed_fields:
+            print("\n➖ Removed Fields (from run_id_a):")
+            for key, value in diff.removed_fields.items():
+                print(f" {key}: {value}")
+
+        print("\n" + "=" * 80)
+        return 0
+
+    except FileNotFoundError as e:
+        print(f"Error: {e}")
+        print("\nMake sure both run IDs exist and have config.json files.")
+        return 1
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+def pareto_command(
+    *,
+    run_ids: Annotated[
+        list[str],
+        Parameter(
+            help="Run IDs to analyze",
+        ),
+    ],
+    storage: Annotated[
+        Path,
+        Parameter(
+            help="Storage directory containing experiment results",
+        ),
+    ] = Path(".cache/runs"),
+    objectives: Annotated[
+        list[str],
+        Parameter(
+            help="Metrics to optimize (e.g., accuracy cost)",
+        ),
+    ],
+    maximize: Annotated[
+        list[bool] | None,
+        Parameter(
+            help="Whether to maximize each objective (true/false for each)",
+        ),
+    ] = None,
+) -> int:
+    """Find Pareto-optimal experiments across multiple objectives.
+
+    The Pareto frontier consists of experiments where no other experiment
+    is better on all objectives simultaneously.
+
+    Examples:
+        # Find experiments with best accuracy/cost tradeoff
+        # (maximize accuracy, minimize cost)
+        uv run python -m themis.cli pareto \\
+            --run-ids run-1 run-2 run-3 run-4 \\
+            --objectives accuracy cost \\
+            --maximize true false
+
+        # Find experiments with best accuracy/latency tradeoff
+        uv run python -m themis.cli pareto \\
+            --run-ids run-1 run-2 run-3 \\
+            --objectives accuracy latency \\
+            --maximize true false
+    """
+    try:
+        # Load experiments
+        print(f"Loading experiments from {storage}...")
+        comparison = compare_experiments(
+            run_ids=run_ids,
+            storage_dir=storage,
+            metrics=objectives,
+            include_metadata=True,
+        )
+
+        print(f"\n✓ Loaded {len(comparison.experiments)} experiments")
+        print(f" Objectives: {', '.join(objectives)}\n")
+
+        # Compute Pareto frontier
+        pareto_ids = comparison.pareto_frontier(objectives, maximize)
+
+        print("=" * 80)
+        print("Pareto Frontier Analysis")
+        print("=" * 80)
+
+        if not pareto_ids:
+            print(
+                "\n⚠️ No Pareto-optimal experiments found (all experiments have missing values)\n"
+            )
+            return 0
+
+        print(f"\n⭐ Found {len(pareto_ids)} Pareto-optimal experiment(s):\n")
+
+        # Show Pareto-optimal experiments
+        for run_id in pareto_ids:
+            exp = next(e for e in comparison.experiments if e.run_id == run_id)
+            print(f" • {run_id}")
+            for obj in objectives:
+                val = exp.get_metric(obj)
+                print(
+                    f" {obj}: {val:.4f}"
+                    if val is not None
+                    else f" {obj}: N/A"
+                )
+
+        # Show dominated experiments
+        dominated = [
+            exp for exp in comparison.experiments if exp.run_id not in pareto_ids
+        ]
+        if dominated:
+            print(f"\n📊 Dominated experiments ({len(dominated)}):")
+            for exp in dominated:
+                print(f" • {exp.run_id}")
+
+        print("\n" + "=" * 80)
+        return 0
+
+    except ValueError as e:
+        print(f"Error: {e}")
+        return 1
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+__all__ = ["compare_command", "diff_command", "pareto_command"]
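Note: the comparison commands in comparison.py above are likewise plain keyword-only functions, so they can also be driven programmatically. A minimal sketch, assuming the 0.1.1 wheel is installed and that runs named run-1 and run-2 already exist under .cache/runs (both run names are placeholders):

    from pathlib import Path

    from themis.cli.commands.comparison import compare_command, pareto_command

    # Print a comparison table for two stored runs and export it to CSV
    compare_command(
        run_ids=["run-1", "run-2"],
        storage=Path(".cache/runs"),
        output=Path("comparison.csv"),
    )

    # Rank the same runs on an accuracy (maximize) vs. cost (minimize) tradeoff
    pareto_command(
        run_ids=["run-1", "run-2"],
        storage=Path(".cache/runs"),
        objectives=["accuracy", "cost"],
        maximize=[True, False],
    )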