themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +429 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/main.py +427 -57
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/core/entities.py +23 -3
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/pipelines/standard_pipeline.py +68 -8
- themis/experiment/cache_manager.py +8 -3
- themis/experiment/export.py +110 -2
- themis/experiment/orchestrator.py +109 -11
- themis/experiment/storage.py +1457 -110
- themis/generation/providers/litellm_provider.py +46 -0
- themis/generation/runner.py +22 -6
- themis/integrations/huggingface.py +12 -1
- themis/integrations/wandb.py +13 -1
- themis/interfaces/__init__.py +86 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis_eval-0.2.1.dist-info/METADATA +596 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
- themis_eval-0.1.1.dist-info/METADATA +0 -758
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
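The headline change in 0.2.1 is a unified themis.evaluate() entry point plus a five-command CLI (eval, compare, serve, list, clean) built on top of it; the full rewrite of themis/cli/main.py follows below. As a rough orientation, here is a minimal sketch of calling the new API directly, using only the keyword arguments and report fields that the eval command below passes and reads (the "math500" and "gpt-4" values are illustrative, taken from the docstring examples, and the exact released signature may differ):

# Hypothetical usage sketch; mirrors the arguments the CLI forwards to
# themis.evaluate() in the diff below. Not a definitive API reference.
import themis

report = themis.evaluate(
    "math500",          # benchmark preset name (or, eventually, a dataset file)
    model="gpt-4",
    limit=100,          # cap the number of samples
    temperature=0.0,
    max_tokens=512,
    resume=True,        # reuse cached results when available
)

# The CLI reads aggregate metrics off report.evaluation_report
for agg in report.evaluation_report.aggregates:
    print(f"{agg.metric_name}: {agg.mean:.4f} (±{agg.std:.4f})")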
themis/cli/main.py
CHANGED
@@ -1,25 +1,16 @@
-"""
+"""Simplified CLI for Themis - Five core commands only.
+
+This is the new unified CLI that leverages the themis.evaluate() API.
+It replaces 20+ commands with 5 essential ones.
+"""

 from __future__ import annotations

-
-
-from
-
-
-from themis.cli.commands import (
-    benchmarks,
-    comparison,
-    config_commands,
-    cost,
-    demo,
-    info,
-    leaderboard,
-    sample_run,
-    visualize,
-)
-from themis.cli.commands import math_benchmarks as math_cmds
-from themis.cli.commands import mcq_benchmarks as mcq_cmds
+import sys
+from pathlib import Path
+from typing import Annotated, Sequence
+
+from cyclopts import App, Parameter

 # Import provider modules to ensure they register themselves
 try:
@@ -29,63 +20,442 @@ try:
         vllm_provider, # noqa: F401
     )
 except ImportError:
-    pass
+    pass
+
+app = App(
+    name="themis",
+    help="Dead simple LLM evaluation platform",
+    version="2.0.0-alpha.1",
+)
+

-app
+@app.command
+def eval(
+    benchmark_or_dataset: Annotated[str, Parameter(name="BENCHMARK_OR_DATASET", show_default=False)],
+    *,
+    model: Annotated[str, Parameter(help="Model identifier (e.g., 'gpt-4', 'claude-3-opus')")],
+    limit: Annotated[int | None, Parameter(help="Maximum number of samples")] = None,
+    prompt: Annotated[str | None, Parameter(help="Custom prompt template")] = None,
+    temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
+    max_tokens: Annotated[int, Parameter(help="Maximum tokens to generate")] = 512,
+    storage: Annotated[str | None, Parameter(help="Storage location (local path or s3://...)")] = None,
+    run_id: Annotated[str | None, Parameter(help="Unique run identifier")] = None,
+    resume: Annotated[bool, Parameter(help="Resume from cached results")] = True,
+    distributed: Annotated[bool, Parameter(help="Use distributed execution with Ray")] = False,
+    workers: Annotated[int, Parameter(help="Number of workers for distributed execution")] = 4,
+    output: Annotated[str | None, Parameter(help="Output file (CSV, JSON, or HTML)")] = None,
+) -> int:
+    """Run an evaluation on a benchmark or custom dataset.
+
+    Examples:
+        # Simple benchmark
+        themis eval math500 --model gpt-4 --limit 100
+
+        # Custom dataset
+        themis eval data.jsonl --model claude-3-opus --prompt "Q: {question}\\nA:"
+
+        # Distributed execution
+        themis eval gsm8k --model gpt-4 --distributed --workers 8
+    """
+    import themis
+    from themis.experiment import export as experiment_export
+
+    print(f"Running evaluation: {benchmark_or_dataset}")
+    print(f"Model: {model}")
+    if limit:
+        print(f"Limit: {limit} samples")
+    print()
+
+    # Check if it's a file (custom dataset)
+    if Path(benchmark_or_dataset).exists():
+        print(f"Loading custom dataset from: {benchmark_or_dataset}")
+        # TODO: Load dataset from file
+        print("Error: Custom dataset files not yet implemented")
+        return 1
+
+    try:
+        # Run evaluation using unified API
+        report = themis.evaluate(
+            benchmark_or_dataset,
+            model=model,
+            limit=limit,
+            prompt=prompt,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            storage=storage,
+            run_id=run_id,
+            resume=resume,
+            distributed=distributed,
+            workers=workers,
+        )
+
+        # Print results
+        print("\n" + "=" * 80)
+        print("EVALUATION RESULTS")
+        print("=" * 80)
+
+        # Print metrics
+        eval_report = report.evaluation_report
+        if eval_report and eval_report.aggregates:
+            print("\nMetrics:")
+            for agg in eval_report.aggregates:
+                print(f" {agg.metric_name}: {agg.mean:.4f} (±{agg.std:.4f})")
+
+        # Print sample counts
+        total = len(report.generation_results)
+        failures = len(report.failures)
+        successful = total - failures
+        print(f"\nSamples: {successful}/{total} successful")
+
+        # Export if requested
+        if output:
+            output_path = Path(output)
+            suffix = output_path.suffix.lower()
+
+            if suffix == ".csv":
+                experiment_export.export_csv(report, output_path)
+                print(f"\nExported to CSV: {output_path}")
+            elif suffix == ".json":
+                experiment_export.export_json(report, output_path)
+                print(f"\nExported to JSON: {output_path}")
+            elif suffix in [".html", ".htm"]:
+                experiment_export.export_html(report, output_path)
+                print(f"\nExported to HTML: {output_path}")
+            else:
+                print(f"\nWarning: Unknown output format: {suffix}")
+
+        return 0
+
+    except Exception as e:
+        print(f"\nError: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
+@app.command
+def compare(
+    run_ids: Annotated[list[str], Parameter(name="RUN_IDS", show_default=False)],
+    *,
+    metric: Annotated[str | None, Parameter(help="Metric to compare")] = None,
+    output: Annotated[str | None, Parameter(help="Output file (HTML or Markdown)")] = None,
+    show_diff: Annotated[bool, Parameter(help="Show examples where results differ")] = False,
+) -> int:
+    """Compare results from multiple runs with statistical tests.
+
+    Performs pairwise comparisons across all specified runs and metrics,
+    computing win/loss matrices and statistical significance.
+
+    Examples:
+        # Compare two runs
+        themis compare run-1 run-2
+
+        # Compare with specific metric
+        themis compare run-1 run-2 run-3 --metric ExactMatch
+
+        # Export to HTML
+        themis compare run-1 run-2 --output comparison.html --show-diff
+    """
+    try:
+        if len(run_ids) < 2:
+            print("Error: Need at least 2 runs to compare", file=sys.stderr)
+            return 1
+
+        # Determine storage path (default to .cache/experiments)
+        storage_path = Path(".cache/experiments")
+
+        if not storage_path.exists():
+            print(f"Error: Storage path not found: {storage_path}", file=sys.stderr)
+            print(f"Tip: Specify storage path with THEMIS_STORAGE env var", file=sys.stderr)
+            return 1
+
+        # Run comparison
+        print(f"Comparing {len(run_ids)} runs: {', '.join(run_ids)}")
+        print(f"Storage: {storage_path}")
+        print()
+
+        from themis.comparison import compare_runs
+        from themis.comparison.statistics import StatisticalTest
+
+        metrics_list = [metric] if metric else None
+
+        report = compare_runs(
+            run_ids=run_ids,
+            storage_path=storage_path,
+            metrics=metrics_list,
+            statistical_test=StatisticalTest.BOOTSTRAP,
+            alpha=0.05,
+        )
+
+        # Print summary
+        print(report.summary(include_details=show_diff))
+
+        # Export if requested
+        if output:
+            output_path = Path(output)
+            suffix = output_path.suffix.lower()
+
+            if suffix == ".json":
+                import json
+                output_path.write_text(json.dumps(report.to_dict(), indent=2))
+                print(f"\n✓ Exported to JSON: {output_path}")
+            elif suffix == ".html":
+                html = _generate_comparison_html(report)
+                output_path.write_text(html)
+                print(f"\n✓ Exported to HTML: {output_path}")
+            elif suffix == ".md":
+                md = _generate_comparison_markdown(report)
+                output_path.write_text(md)
+                print(f"\n✓ Exported to Markdown: {output_path}")
+            else:
+                print(f"\nWarning: Unknown output format: {suffix}", file=sys.stderr)
+
+        return 0
+
+    except Exception as e:
+        print(f"\nError: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        return 1

-# Register demo command
-app.command(name="demo")(demo.demo_command)

-
-
-
-
-
-
-
+@app.command
+def serve(
+    *,
+    port: Annotated[int, Parameter(help="Port to run server on")] = 8080,
+    host: Annotated[str, Parameter(help="Host to bind to")] = "127.0.0.1",
+    storage: Annotated[str | None, Parameter(help="Storage path for runs")] = None,
+    reload: Annotated[bool, Parameter(help="Enable auto-reload (dev mode)")] = False,
+) -> int:
+    """Start the Themis API server with REST and WebSocket endpoints.
+
+    Provides:
+    - REST API for listing/viewing runs
+    - Comparison endpoints with statistical tests
+    - WebSocket for real-time monitoring
+    - Interactive API docs at /docs
+
+    Examples:
+        # Start server on default port
+        themis serve
+
+        # Custom port and storage
+        themis serve --port 3000 --storage ~/.themis/runs
+
+        # Development mode with auto-reload
+        themis serve --reload
+    """
+    try:
+        from themis.server import create_app
+        import uvicorn
+    except ImportError:
+        print("Error: FastAPI server dependencies not installed", file=sys.stderr)
+        print("\nInstall with: pip install themis[server]", file=sys.stderr)
+        print(" or: uv pip install themis[server]", file=sys.stderr)
+        return 1
+
+    # Determine storage path
+    storage_path = Path(storage) if storage else Path(".cache/experiments")
+
+    print(f"Starting Themis API server...")
+    print(f" URL: http://{host}:{port}")
+    print(f" Storage: {storage_path}")
+    print(f" Docs: http://{host}:{port}/docs")
+    print()
+
+    # Create app
+    app_instance = create_app(storage_path=storage_path)
+
+    # Run server
+    uvicorn.run(
+        app_instance,
+        host=host,
+        port=port,
+        reload=reload,
+        log_level="info",
+    )
+
+    return 0

-# Register MCQ benchmark commands
-app.command(name="supergpqa")(mcq_cmds.supergpqa_command)
-app.command(name="mmlu-pro")(mcq_cmds.mmlu_pro_command)

-
-
-
-
+@app.command
+def list(
+    what: Annotated[str, Parameter(name="WHAT", show_default=False)],
+    *,
+    storage: Annotated[str | None, Parameter(help="Storage path for runs")] = None,
+    limit: Annotated[int | None, Parameter(help="Limit number of results")] = None,
+) -> int:
+    """List runs, benchmarks, or available metrics.
+
+    Args:
+        what: What to list (runs, benchmarks, or metrics)
+
+    Examples:
+        # List all runs
+        themis list runs
+
+        # List available benchmarks
+        themis list benchmarks
+
+        # List available metrics
+        themis list metrics
+    """
+    # Validate input
+    if what not in ["runs", "benchmarks", "metrics"]:
+        print(f"Error: '{what}' is not valid. Choose from: runs, benchmarks, metrics")
+        return 1
+
+    if what == "benchmarks":
+        from themis.presets import list_benchmarks
+
+        benchmarks = list_benchmarks()
+        print("Available benchmarks:")
+        for benchmark in benchmarks:
+            print(f" - {benchmark}")
+        return 0
+
+    elif what == "metrics":
+        print("Available metrics:")
+        print(" Math:")
+        print(" - exact_match")
+        print(" - math_verify")
+        print(" General:")
+        print(" - response_length")
+        print("\n Note: NLP and code metrics will be added in Phase 2")
+        return 0
+
+    elif what == "runs":
+        print("Listing runs...")
+        print("Note: Run listing not yet fully implemented")
+        return 1
+
+    return 0

-# Register info and listing commands
-app.command(name="list-providers")(benchmarks.list_providers)
-app.command(name="list-benchmarks")(benchmarks.list_benchmarks)
-app.command(name="info")(info.show_info)
-app.command(name="new-project")(info.new_project)

-
-
-
-
+@app.command
+def clean(
+    *,
+    storage: Annotated[str | None, Parameter(help="Storage path to clean")] = None,
+    older_than: Annotated[int | None, Parameter(help="Remove runs older than N days")] = None,
+    dry_run: Annotated[bool, Parameter(help="Show what would be deleted")] = False,
+) -> int:
+    """Clean up old runs and cached data.
+
+    Examples:
+        # Dry run to see what would be deleted
+        themis clean --dry-run
+
+        # Remove runs older than 30 days
+        themis clean --older-than 30
+    """
+    print("Cleaning storage...")
+    print("Note: Storage cleanup not yet implemented")
+    print("This will be implemented in Phase 6")
+    return 1

-# Register cost commands
-app.command(name="estimate-cost")(cost.estimate_cost_command)
-app.command(name="show-pricing")(cost.show_pricing_command)

-
-
-
-
+def _generate_comparison_html(report) -> str:
+    """Generate HTML report for comparison."""
+    html = f"""<!DOCTYPE html>
+<html>
+<head>
+<title>Comparison Report</title>
+<style>
+body {{ font-family: Arial, sans-serif; margin: 20px; }}
+h1, h2, h3 {{ color: #333; }}
+table {{ border-collapse: collapse; margin: 20px 0; }}
+th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
+th {{ background-color: #f2f2f2; }}
+.win {{ background-color: #d4edda; }}
+.loss {{ background-color: #f8d7da; }}
+.tie {{ background-color: #fff3cd; }}
+.significant {{ font-weight: bold; color: #28a745; }}
+</style>
+</head>
+<body>
+<h1>Comparison Report</h1>
+<p><strong>Runs:</strong> {', '.join(report.run_ids)}</p>
+<p><strong>Metrics:</strong> {', '.join(report.metrics)}</p>
+<p><strong>Overall Best:</strong> {report.overall_best_run}</p>
+
+<h2>Best Run Per Metric</h2>
+<ul>
+"""
+
+    for metric, run_id in report.best_run_per_metric.items():
+        html += f" <li><strong>{metric}:</strong> {run_id}</li>\n"
+
+    html += """ </ul>
+
+<h2>Win/Loss Matrices</h2>
+"""
+
+    for metric, matrix in report.win_loss_matrices.items():
+        html += f" <h3>{metric}</h3>\n"
+        html += " <table>\n"
+        html += " <tr><th>Run</th>"
+        for rid in matrix.run_ids:
+            html += f"<th>{rid}</th>"
+        html += "</tr>\n"
+
+        for i, run_id in enumerate(matrix.run_ids):
+            html += f" <tr><td><strong>{run_id}</strong></td>"
+            for j in range(len(matrix.run_ids)):
+                result = matrix.matrix[i][j]
+                css_class = result if result in ["win", "loss", "tie"] else ""
+                html += f'<td class="{css_class}">{result}</td>'
+            html += "</tr>\n"
+
+        html += " </table>\n"
+
+    html += """</body>
+</html>"""
+
+    return html

-# Register leaderboard command
-app.command(name="leaderboard")(leaderboard.leaderboard_command)

-
-
+def _generate_comparison_markdown(report) -> str:
+    """Generate Markdown report for comparison."""
+    lines = []
+    lines.append("# Comparison Report")
+    lines.append("")
+    lines.append(f"**Runs:** {', '.join(report.run_ids)}")
+    lines.append(f"**Metrics:** {', '.join(report.metrics)}")
+    lines.append(f"**Overall Best:** {report.overall_best_run}")
+    lines.append("")
+
+    lines.append("## Best Run Per Metric")
+    lines.append("")
+    for metric, run_id in report.best_run_per_metric.items():
+        lines.append(f"- **{metric}:** {run_id}")
+    lines.append("")
+
+    lines.append("## Win/Loss Matrices")
+    lines.append("")
+    for metric, matrix in report.win_loss_matrices.items():
+        lines.append(f"### {metric}")
+        lines.append("")
+        lines.append(matrix.to_table())
+        lines.append("")
+
+    return "\n".join(lines)


 def main(argv: Sequence[str] | None = None) -> int:
+    """Main CLI entry point."""
     parsed_argv = list(argv) if argv is not None else None
     try:
         result = app(parsed_argv)
     except SystemExit as exc: # pragma: no cover - CLI integration path
         return int(exc.code or 0)
+    except KeyboardInterrupt:
+        print("\nInterrupted by user")
+        return 130
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        return 1
     return int(result) if isinstance(result, int) else 0

themis/comparison/__init__.py
ADDED
@@ -0,0 +1,25 @@
+"""Comparison engine for analyzing multiple experiment runs.
+
+This module provides tools for comparing different models, prompts, or
+configurations across multiple runs with statistical rigor.
+"""
+
+from themis.comparison.engine import ComparisonEngine, compare_runs
+from themis.comparison.reports import ComparisonReport, ComparisonResult
+from themis.comparison.statistics import (
+    StatisticalTest,
+    bootstrap_confidence_interval,
+    permutation_test,
+    t_test,
+)
+
+__all__ = [
+    "ComparisonEngine",
+    "compare_runs",
+    "ComparisonReport",
+    "ComparisonResult",
+    "StatisticalTest",
+    "bootstrap_confidence_interval",
+    "permutation_test",
+    "t_test",
+]
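
The new themis.comparison package exports compare_runs and the statistical helpers that back the compare command shown above. A minimal sketch of driving it from Python, based only on the call the CLI makes (the run IDs and storage path are placeholders, and the exact released API may differ):

# Hypothetical usage sketch mirroring the compare command in themis/cli/main.py.
from pathlib import Path

from themis.comparison import compare_runs
from themis.comparison.statistics import StatisticalTest

report = compare_runs(
    run_ids=["run-1", "run-2"],                  # placeholder run identifiers
    storage_path=Path(".cache/experiments"),     # the CLI's default storage location
    metrics=None,                                # None, as the CLI passes when no --metric is given
    statistical_test=StatisticalTest.BOOTSTRAP,  # bootstrap significance test
    alpha=0.05,
)

# include_details mirrors the CLI's --show-diff flag
print(report.summary(include_details=True))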