themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +5 -2
- themis/_version.py +14 -1
- themis/api.py +83 -145
- themis/backends/storage.py +5 -0
- themis/cli/commands/info.py +2 -11
- themis/cli/main.py +231 -40
- themis/comparison/engine.py +7 -13
- themis/core/entities.py +4 -0
- themis/evaluation/metric_pipeline.py +12 -0
- themis/evaluation/pipeline.py +22 -0
- themis/evaluation/pipelines/__init__.py +4 -0
- themis/evaluation/pipelines/composable_pipeline.py +55 -0
- themis/evaluation/pipelines/standard_pipeline.py +16 -0
- themis/experiment/__init__.py +2 -2
- themis/experiment/cache_manager.py +15 -1
- themis/experiment/definitions.py +1 -1
- themis/experiment/orchestrator.py +21 -11
- themis/experiment/share.py +264 -0
- themis/experiment/storage.py +345 -298
- themis/generation/router.py +22 -4
- themis/generation/runner.py +16 -1
- themis/presets/benchmarks.py +602 -17
- themis/server/app.py +38 -26
- themis/session.py +125 -0
- themis/specs/__init__.py +7 -0
- themis/specs/execution.py +26 -0
- themis/specs/experiment.py +33 -0
- themis/specs/storage.py +18 -0
- themis/storage/__init__.py +6 -0
- themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
- themis/experiment/builder.py +0 -151
- themis/experiment/export_csv.py +0 -159
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/cli/main.py
CHANGED
@@ -1,16 +1,19 @@
-"""Simplified CLI for Themis -
+"""Simplified CLI for Themis - seven focused commands.
 
-This is the
-It replaces 20+ commands with
+This is the unified CLI that leverages the themis.evaluate() API.
+It replaces 20+ commands with a smaller, task-oriented set.
 """
 
 from __future__ import annotations
 
+import os
 import sys
+from datetime import datetime, timedelta
 from pathlib import Path
 from typing import Annotated, Sequence
 
 from cyclopts import App, Parameter
+from themis._version import __version__
 
 # Import provider modules to ensure they register themselves
 try:
@@ -25,10 +28,24 @@ except ImportError:
 app = App(
     name="themis",
     help="Dead simple LLM evaluation platform",
-    version=
+    version=__version__,
 )
 
 
+@app.command
+def demo(
+    *,
+    model: Annotated[str, Parameter(help="Model identifier")] = "fake-math-llm",
+    limit: Annotated[int, Parameter(help="Maximum number of samples")] = 10,
+) -> int:
+    """Run the built-in demo benchmark."""
+    return eval(
+        "demo",
+        model=model,
+        limit=limit,
+    )
+
+
 @app.command
 def eval(
     benchmark_or_dataset: Annotated[str, Parameter(name="BENCHMARK_OR_DATASET", show_default=False)],
@@ -57,7 +74,6 @@ def eval(
         # Distributed execution
         themis eval gsm8k --model gpt-4 --distributed --workers 8
     """
-    import themis
    from themis.experiment import export as experiment_export
 
    print(f"Running evaluation: {benchmark_or_dataset}")
@@ -72,21 +88,47 @@ def eval(
         # TODO: Load dataset from file
         print("Error: Custom dataset files not yet implemented")
         return 1
-
+
    try:
-
-
-
+        if distributed:
+            print("Error: distributed execution is not supported in vNext CLI yet")
+            return 1
+
+        from themis.evaluation.pipeline import EvaluationPipeline
+        from themis.generation.templates import PromptTemplate
+        from themis.presets import get_benchmark_preset
+        from themis.session import ExperimentSession
+        from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec
+
+        # Resolve benchmark preset
+        preset = get_benchmark_preset(benchmark_or_dataset)
+
+        dataset = preset.load_dataset(limit=limit)
+
+        if prompt is None:
+            prompt_template = preset.prompt_template
+        else:
+            prompt_template = PromptTemplate(name="custom", template=prompt)
+
+        pipeline = EvaluationPipeline(
+            extractor=preset.extractor,
+            metrics=preset.metrics,
+        )
+
+        spec = ExperimentSpec(
+            dataset=dataset,
+            prompt=prompt_template.template,
            model=model,
-
-
-            temperature=temperature,
-            max_tokens=max_tokens,
-            storage=storage,
+            sampling={"temperature": temperature, "max_tokens": max_tokens},
+            pipeline=pipeline,
            run_id=run_id,
-
-
-
+        )
+
+        storage_root = _resolve_storage_root(storage)
+        report = ExperimentSession().run(
+            spec,
+            execution=ExecutionSpec(workers=workers),
+            storage=StorageSpec(path=storage_root, cache=resume),
        )
 
    # Print results
@@ -96,10 +138,18 @@ def eval(
 
    # Print metrics
    eval_report = report.evaluation_report
-    if eval_report
+    if eval_report:
        print("\nMetrics:")
-
-
+        if getattr(eval_report, "aggregates", None):
+            for agg in eval_report.aggregates:
+                std = getattr(agg, "std", None)
+                if std is None:
+                    print(f" {agg.metric_name}: {agg.mean:.4f}")
+                else:
+                    print(f" {agg.metric_name}: {agg.mean:.4f} (±{std:.4f})")
+        elif getattr(eval_report, "metrics", None):
+            for name, agg in sorted(eval_report.metrics.items()):
+                print(f" {name}: {agg.mean:.4f} (n={agg.count})")
 
    # Print sample counts
    total = len(report.generation_results)
@@ -113,13 +163,13 @@ def eval(
        suffix = output_path.suffix.lower()
 
        if suffix == ".csv":
-            experiment_export.
+            experiment_export.export_report_csv(report, output_path)
            print(f"\nExported to CSV: {output_path}")
        elif suffix == ".json":
-            experiment_export.
+            experiment_export.export_report_json(report, output_path)
            print(f"\nExported to JSON: {output_path}")
        elif suffix in [".html", ".htm"]:
-            experiment_export.
+            experiment_export.export_html_report(report, output_path)
            print(f"\nExported to HTML: {output_path}")
        else:
            print(f"\nWarning: Unknown output format: {suffix}")
@@ -138,6 +188,7 @@ def compare(
    run_ids: Annotated[list[str], Parameter(name="RUN_IDS", show_default=False)],
    *,
    metric: Annotated[str | None, Parameter(help="Metric to compare")] = None,
+    storage: Annotated[str | None, Parameter(help="Storage location (local path or s3://...)")] = None,
    output: Annotated[str | None, Parameter(help="Output file (HTML or Markdown)")] = None,
    show_diff: Annotated[bool, Parameter(help="Show examples where results differ")] = False,
 ) -> int:
@@ -162,7 +213,7 @@ def compare(
        return 1
 
    # Determine storage path (default to .cache/experiments)
-    storage_path =
+    storage_path = _resolve_storage_root(storage)
 
    if not storage_path.exists():
        print(f"Error: Storage path not found: {storage_path}", file=sys.stderr)
@@ -219,6 +270,62 @@ def compare(
        return 1
 
 
+@app.command
+def share(
+    run_id: Annotated[str, Parameter(name="RUN_ID", show_default=False)],
+    *,
+    storage: Annotated[str | None, Parameter(help="Storage location (defaults to .cache/experiments)")] = None,
+    metric: Annotated[str | None, Parameter(help="Metric to highlight (default: first available)")] = None,
+    output_dir: Annotated[Path, Parameter(help="Directory to write share assets")] = Path("."),
+) -> int:
+    """Generate a shareable results badge + Markdown snippet for a run.
+
+    Examples:
+        # Create share assets in current directory
+        themis share run-20260118-032014
+
+        # Highlight a specific metric
+        themis share run-20260118-032014 --metric accuracy
+
+        # Write to a dedicated folder
+        themis share run-20260118-032014 --output-dir share
+    """
+    from themis.experiment.share import create_share_pack
+
+    storage_root = Path(storage) if storage else Path(".cache/experiments")
+    if not storage_root.exists():
+        print(f"Error: Storage path not found: {storage_root}", file=sys.stderr)
+        return 1
+
+    try:
+        share_pack = create_share_pack(
+            run_id=run_id,
+            storage_root=storage_root,
+            output_dir=output_dir,
+            metric=metric,
+        )
+    except FileNotFoundError as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
+    except ValueError as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
+    except Exception as e:
+        print(f"Unexpected error: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        return 1
+
+    print("✓ Share assets created")
+    print(f" SVG: {share_pack.svg_path}")
+    print(f" Markdown: {share_pack.markdown_path}")
+    print("\nSnippet:")
+    print(share_pack.markdown_snippet)
+    if share_pack.event_log_path:
+        print(f"\nEvent logged to: {share_pack.event_log_path}")
+    return 0
+
+
 @app.command
 def serve(
    *,
@@ -254,8 +361,8 @@ def serve(
        print(" or: uv pip install themis[server]", file=sys.stderr)
        return 1
 
-
-
+    # Determine storage path
+    storage_path = _resolve_storage_root(storage)
 
    print(f"Starting Themis API server...")
    print(f" URL: http://{host}:{port}")
@@ -284,6 +391,7 @@ def list(
    *,
    storage: Annotated[str | None, Parameter(help="Storage path for runs")] = None,
    limit: Annotated[int | None, Parameter(help="Limit number of results")] = None,
+    verbose: Annotated[bool, Parameter(help="Show detailed information")] = False,
 ) -> int:
    """List runs, benchmarks, or available metrics.
 
@@ -306,28 +414,65 @@ def list(
        return 1
 
    if what == "benchmarks":
-        from themis.presets import list_benchmarks
+        from themis.presets import get_benchmark_preset, list_benchmarks
 
        benchmarks = list_benchmarks()
+        if limit is not None:
+            benchmarks = benchmarks[:limit]
        print("Available benchmarks:")
        for benchmark in benchmarks:
-
+            if verbose:
+                preset = get_benchmark_preset(benchmark)
+                description = preset.description or "No description"
+                print(f" - {benchmark}: {description}")
+            else:
+                print(f" - {benchmark}")
        return 0
 
    elif what == "metrics":
        print("Available metrics:")
+        print(" Core:")
+        print(" - exact_match (no extra dependencies)")
+        print(" - response_length (no extra dependencies)")
        print(" Math:")
-        print(" -
-        print("
-        print("
-        print(" -
-        print("
+        print(" - math_verify (requires: themis-eval[math], math-verify)")
+        print(" NLP (requires: themis-eval[nlp]):")
+        print(" - bleu (sacrebleu)")
+        print(" - rouge1 / rouge2 / rougeL (rouge-score)")
+        print(" - bertscore (bert-score)")
+        print(" - meteor (nltk)")
+        print(" Code:")
+        print(" - pass_at_k (no extra dependencies)")
+        print(" - execution_accuracy (no extra dependencies)")
+        print(" - codebleu (requires: themis-eval[code], codebleu)")
+        print("\nInstall extras: pip install themis-eval[math,nlp,code]")
        return 0
 
    elif what == "runs":
-
-
-
+        from themis.storage import ExperimentStorage
+
+        storage_root = _resolve_storage_root(storage)
+        if not storage_root.exists():
+            print(f"No storage found at {storage_root}")
+            return 1
+
+        storage_backend = ExperimentStorage(storage_root)
+        runs = storage_backend.list_runs(limit=limit)
+        if not runs:
+            print("No runs found.")
+            return 0
+
+        print("Runs:")
+        for run in runs:
+            status = run.status.value if hasattr(run.status, "value") else str(run.status)
+            if verbose:
+                print(
+                    f" - {run.run_id} [{status}] samples={run.total_samples} "
+                    f"created={run.created_at}"
+                )
+            else:
+                print(f" - {run.run_id}")
+        return 0
 
    return 0
 
@@ -348,10 +493,56 @@ def clean(
        # Remove runs older than 30 days
        themis clean --older-than 30
    """
-
-
-
-
+    from themis.storage import ExperimentStorage
+
+    storage_root = _resolve_storage_root(storage)
+    if not storage_root.exists():
+        print(f"No storage found at {storage_root}")
+        return 1
+
+    if older_than is None:
+        print("Error: --older-than is required to clean runs")
+        return 1
+
+    storage_backend = ExperimentStorage(storage_root)
+    runs = storage_backend.list_runs()
+    cutoff = datetime.now() - timedelta(days=older_than)
+
+    candidates = []
+    for run in runs:
+        try:
+            created_at = datetime.fromisoformat(run.created_at)
+        except ValueError:
+            continue
+        if created_at < cutoff:
+            candidates.append(run)
+
+    if not candidates:
+        print("No runs matched the cleanup criteria.")
+        return 0
+
+    if dry_run:
+        print("Runs to delete:")
+        for run in candidates:
+            print(f" - {run.run_id} (created {run.created_at})")
+        return 0
+
+    deleted = 0
+    for run in candidates:
+        storage_backend.delete_run(run.run_id)
+        deleted += 1
+
+    print(f"Deleted {deleted} run(s).")
+    return 0
+
+
+def _resolve_storage_root(storage: str | None) -> Path:
+    if storage:
+        return Path(storage).expanduser()
+    env_storage = os.getenv("THEMIS_STORAGE")
+    if env_storage:
+        return Path(env_storage).expanduser()
+    return Path(".cache/experiments")
 
 
 def _generate_comparison_html(report) -> str:
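The rewritten `eval` command is now a thin wrapper around the vNext spec objects. A minimal sketch of the same flow driven programmatically, reconstructed from the hunk above; the benchmark name, model, sampling values, and run id are illustrative, and defaults not visible in the diff are assumptions:

```python
# Sketch only: mirrors the calls added in themis/cli/main.py above.
from pathlib import Path

from themis.evaluation.pipeline import EvaluationPipeline
from themis.presets import get_benchmark_preset
from themis.session import ExperimentSession
from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec

preset = get_benchmark_preset("gsm8k")      # resolve a benchmark preset
dataset = preset.load_dataset(limit=100)    # cap the number of samples

spec = ExperimentSpec(
    dataset=dataset,
    prompt=preset.prompt_template.template,
    model="gpt-4",
    sampling={"temperature": 0.0, "max_tokens": 512},
    pipeline=EvaluationPipeline(extractor=preset.extractor, metrics=preset.metrics),
    run_id="run-example",
)

report = ExperimentSession().run(
    spec,
    execution=ExecutionSpec(workers=4),
    storage=StorageSpec(path=Path(".cache/experiments"), cache=True),
)
print(report.evaluation_report)
```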
themis/comparison/engine.py
CHANGED
@@ -11,7 +11,7 @@ from typing import Sequence
 
 from themis.comparison import reports, statistics
 from themis.comparison.statistics import StatisticalTest
-from themis.
+from themis.storage import ExperimentStorage
 
 
 class ComparisonEngine:
@@ -24,7 +24,7 @@ class ComparisonEngine:
    def __init__(
        self,
        *,
-        storage:
+        storage: ExperimentStorage | None = None,
        storage_path: str | Path | None = None,
        statistical_test: StatisticalTest = StatisticalTest.BOOTSTRAP,
        alpha: float = 0.05,
@@ -44,7 +44,7 @@ class ComparisonEngine:
        if storage is None and storage_path is None:
            raise ValueError("Either storage or storage_path must be provided")
 
-        self._storage = storage or
+        self._storage = storage or ExperimentStorage(storage_path)
        self._statistical_test = statistical_test
        self._alpha = alpha
        self._n_bootstrap = n_bootstrap
@@ -161,18 +161,12 @@ class ComparisonEngine:
 
        # eval_dict is a dict, so iterate over values
        for record in eval_dict.values():
-            for
+            for score_obj in record.scores:
+                metric_name = score_obj.metric_name
                if metric_name not in metric_scores:
                    metric_scores[metric_name] = []
-
-
-                if hasattr(score_obj, 'value'):
-                    score = score_obj.value
-                elif isinstance(score_obj, (int, float)):
-                    score = float(score_obj)
-                else:
-                    continue  # Skip non-numeric scores
-
+
+                score = score_obj.value
                metric_scores[metric_name].append(score)
 
        return metric_scores
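The comparison engine now resolves its backend from the unified `themis.storage` package. A minimal construction sketch using only the constructor arguments visible in the hunk above:

```python
# Sketch: arguments are those shown in the diff; compare/report calls are omitted.
from themis.comparison.engine import ComparisonEngine
from themis.comparison.statistics import StatisticalTest
from themis.storage import ExperimentStorage

# Either pass a ready-made storage backend ...
engine = ComparisonEngine(storage=ExperimentStorage(".cache/experiments"))

# ... or let the engine build one from a path.
engine = ComparisonEngine(
    storage_path=".cache/experiments",
    statistical_test=StatisticalTest.BOOTSTRAP,
    alpha=0.05,
)
```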
themis/core/entities.py
CHANGED
@@ -26,6 +26,10 @@ class ModelSpec:
    default_sampling: SamplingConfig | None = None
    metadata: Dict[str, Any] = field(default_factory=dict)
 
+    @property
+    def model_key(self) -> str:
+        return f"{self.provider}:{self.identifier}"
+
 
 @dataclass(frozen=True)
 class PromptSpec:
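A short usage note for the new property; this assumes `provider` and `identifier` are enough to construct a `ModelSpec`, which the hunk above does not confirm:

```python
# Hypothetical usage of ModelSpec.model_key; extra required fields, if any, are not shown in the diff.
from themis.core.entities import ModelSpec

spec = ModelSpec(provider="openai", identifier="gpt-4")
assert spec.model_key == "openai:gpt-4"
```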
themis/evaluation/metric_pipeline.py
ADDED
@@ -0,0 +1,12 @@
+"""Primary metric evaluation pipeline for vNext workflows."""
+
+from __future__ import annotations
+
+from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
+
+
+class MetricPipeline(EvaluationPipeline):
+    """Primary evaluation pipeline for vNext (alias of standard pipeline)."""
+
+
+__all__ = ["MetricPipeline"]
themis/evaluation/pipeline.py
CHANGED
@@ -25,22 +25,44 @@ Example (Composable):
 
 from __future__ import annotations
 
+# vNext: protocol definition for evaluation pipelines
+from typing import Protocol, Sequence, runtime_checkable
+
 # Re-export pipeline implementations for backward compatibility
 from themis.evaluation.pipelines.composable_pipeline import (
    ComposableEvaluationPipeline,
+    ComposableEvaluationReportPipeline,
    EvaluationResult,
    EvaluationStep,
 )
 from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
+from themis.evaluation.metric_pipeline import MetricPipeline
 from themis.evaluation.reports import (
    EvaluationFailure,
    EvaluationReport,
    MetricAggregate,
 )
+from themis.core import entities as core_entities
+
+
+@runtime_checkable
+class EvaluationPipelineContract(Protocol):
+    """Contract for evaluation pipelines."""
+
+    def evaluate(
+        self, records: Sequence[core_entities.GenerationRecord]
+    ) -> EvaluationReport:  # pragma: no cover - protocol
+        ...
+
+    def evaluation_fingerprint(self) -> dict:  # pragma: no cover - protocol
+        ...
 
 __all__ = [
    "EvaluationPipeline",
+    "EvaluationPipelineContract",
+    "MetricPipeline",
    "ComposableEvaluationPipeline",
+    "ComposableEvaluationReportPipeline",
    "EvaluationStep",
    "EvaluationResult",
    "MetricAggregate",
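Because `EvaluationPipelineContract` is a `runtime_checkable` `Protocol`, any object exposing `evaluate()` and `evaluation_fingerprint()` satisfies it structurally. A minimal sketch; the no-op report below is illustrative only, with the `EvaluationReport` fields taken from the adapter added later in this diff:

```python
from typing import Sequence

from themis.core import entities as core_entities
from themis.evaluation.pipeline import EvaluationPipelineContract
from themis.evaluation.reports import EvaluationReport


class NoOpPipeline:
    """Illustrative pipeline that returns an empty report."""

    def evaluate(
        self, records: Sequence[core_entities.GenerationRecord]
    ) -> EvaluationReport:
        return EvaluationReport(metrics={}, failures=[], records=[], slices={})

    def evaluation_fingerprint(self) -> dict:
        return {"pipeline": "noop"}


# isinstance only checks that both methods exist (structural typing).
assert isinstance(NoOpPipeline(), EvaluationPipelineContract)
```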
themis/evaluation/pipelines/__init__.py
CHANGED
@@ -2,14 +2,18 @@
 
 from themis.evaluation.pipelines.composable_pipeline import (
    ComposableEvaluationPipeline,
+    ComposableEvaluationReportPipeline,
    EvaluationResult,
    EvaluationStep,
 )
 from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
+from themis.evaluation.metric_pipeline import MetricPipeline
 
 __all__ = [
    "EvaluationPipeline",
+    "MetricPipeline",
    "ComposableEvaluationPipeline",
+    "ComposableEvaluationReportPipeline",
    "EvaluationStep",
    "EvaluationResult",
 ]
themis/evaluation/pipelines/composable_pipeline.py
CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass, field
 from typing import Any, Callable, Generic, Sequence, TypeVar
 
 from themis.core import entities as core_entities
+from themis.evaluation.reports import EvaluationFailure, EvaluationReport, MetricAggregate
 from themis.interfaces import Metric as MetricInterface
 from themis.utils import tracing
 
@@ -355,3 +356,57 @@ class ComposableEvaluationPipeline:
        """
        self._steps.clear()
        return self
+
+    def evaluation_fingerprint(self) -> dict:
+        """Return a fingerprint based on the configured steps."""
+        return {"steps": self.get_step_names()}
+
+
+class ComposableEvaluationReportPipeline:
+    """Adapter that makes a ComposableEvaluationPipeline compatible with EvaluationPipeline."""
+
+    def __init__(self, pipeline: ComposableEvaluationPipeline):
+        self._pipeline = pipeline
+
+    def evaluate(
+        self, records: Sequence[core_entities.GenerationRecord]
+    ) -> EvaluationReport:
+        per_metric: dict[str, list[core_entities.MetricScore]] = {}
+        failures: list[EvaluationFailure] = []
+        per_record: list[core_entities.EvaluationRecord] = []
+
+        for record in records:
+            result = self._pipeline.evaluate(record)
+            sample_id = record.task.metadata.get("dataset_id") or record.task.metadata.get(
+                "sample_id"
+            )
+
+            if result.errors:
+                for error in result.errors:
+                    failures.append(EvaluationFailure(sample_id=sample_id, message=error))
+
+            for score in result.scores:
+                per_metric.setdefault(score.metric_name, []).append(score)
+
+            per_record.append(
+                core_entities.EvaluationRecord(
+                    sample_id=sample_id,
+                    scores=result.scores,
+                    failures=list(result.errors),
+                )
+            )
+
+        aggregates = {
+            name: MetricAggregate.from_scores(name, scores)
+            for name, scores in per_metric.items()
+        }
+
+        return EvaluationReport(
+            metrics=aggregates,
+            failures=failures,
+            records=per_record,
+            slices={},
+        )
+
+    def evaluation_fingerprint(self) -> dict:
+        return self._pipeline.evaluation_fingerprint()
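The adapter lets a step-based composable pipeline be used anywhere a report-producing pipeline is expected. A brief sketch; step registration itself is part of the existing (pre-1.0.0) API and is not shown in this diff, so the bare constructor call below is an assumption:

```python
from themis.evaluation.pipelines.composable_pipeline import (
    ComposableEvaluationPipeline,
    ComposableEvaluationReportPipeline,
)

# Assumed: the composable pipeline is built and steps are added via the
# existing API, which this diff does not modify.
composable = ComposableEvaluationPipeline()
report_pipeline = ComposableEvaluationReportPipeline(composable)

# report_pipeline.evaluate(records) returns an EvaluationReport, and
# report_pipeline.evaluation_fingerprint() delegates to the wrapped pipeline,
# i.e. {"steps": composable.get_step_names()}.
```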
themis/evaluation/pipelines/standard_pipeline.py
CHANGED
@@ -309,6 +309,22 @@ class EvaluationPipeline:
            slices=self._compute_slice_aggregates(per_metric, slice_members),
        )
 
+    def evaluation_fingerprint(self) -> dict:
+        """Return a deterministic fingerprint for cache invalidation."""
+        config: dict[str, object] = {}
+        config["metrics"] = sorted(
+            [
+                f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+                for metric in self._metrics
+            ]
+        )
+        extractor = self._extractor
+        extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+        config["extractor"] = extractor_type
+        if hasattr(extractor, "field_name"):
+            config["extractor_field"] = extractor.field_name
+        return config
+
    def register_slice(
        self, name: str, fn: Callable[[core_entities.GenerationRecord], bool]
    ) -> None:
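The fingerprint keys produced by the standard pipeline are `metrics`, `extractor`, and optionally `extractor_field`. An illustrative value; the class paths below are placeholders, not real themis classes:

```python
# Placeholder example of the dict returned by EvaluationPipeline.evaluation_fingerprint().
fingerprint = {
    "metrics": ["some_module.ExactMatchMetric:exact_match"],
    "extractor": "some_module.FieldExtractor",
    "extractor_field": "answer",  # present only when the extractor exposes field_name
}
```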
themis/experiment/__init__.py
CHANGED
@@ -1,5 +1,5 @@
 """Experiment orchestration layer."""
 
-from themis.experiment import
+from themis.experiment import definitions, export, math, orchestrator, storage
 
-__all__ = ["math", "orchestrator", "storage", "
+__all__ = ["math", "orchestrator", "storage", "definitions", "export"]
themis/experiment/cache_manager.py
CHANGED
@@ -81,6 +81,17 @@ class CacheManager:
            return {}
        return self._storage.load_cached_evaluations(run_id, evaluation_config=evaluation_config)
 
+    def run_metadata_exists(self, run_id: str) -> bool:
+        """Check if run metadata exists in storage."""
+        if self._storage is None:
+            return False
+        return self._storage.run_metadata_exists(run_id)
+
+    def start_run(self, run_id: str, *, experiment_id: str = "default") -> None:
+        """Start a run in storage."""
+        if self._storage is not None:
+            self._storage.start_run(run_id, experiment_id=experiment_id)
+
    def save_generation_record(
        self,
        run_id: str,
@@ -128,7 +139,10 @@ class CacheManager:
        """
        if self._storage is None:
            return None
-
+        run_path = self._storage.get_run_path(run_id)
+        if run_path is None:
+            return None
+        return str(run_path)
 
 
 __all__ = ["CacheManager"]
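A hedged sketch of how an orchestrator might use the two new hooks when preparing a run; the surrounding resume flow is an assumption, and only `run_metadata_exists` and `start_run` come from the diff above:

```python
from themis.experiment.cache_manager import CacheManager


def ensure_run_started(cache: CacheManager, run_id: str) -> None:
    # Create run metadata only if it is not already present in storage.
    if not cache.run_metadata_exists(run_id):
        cache.start_run(run_id, experiment_id="default")
```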
themis/experiment/definitions.py
CHANGED