themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries and reflects the changes between those versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (37)
  1. themis/__init__.py +5 -2
  2. themis/_version.py +14 -1
  3. themis/api.py +83 -145
  4. themis/backends/storage.py +5 -0
  5. themis/cli/commands/info.py +2 -11
  6. themis/cli/main.py +231 -40
  7. themis/comparison/engine.py +7 -13
  8. themis/core/entities.py +4 -0
  9. themis/evaluation/metric_pipeline.py +12 -0
  10. themis/evaluation/pipeline.py +22 -0
  11. themis/evaluation/pipelines/__init__.py +4 -0
  12. themis/evaluation/pipelines/composable_pipeline.py +55 -0
  13. themis/evaluation/pipelines/standard_pipeline.py +16 -0
  14. themis/experiment/__init__.py +2 -2
  15. themis/experiment/cache_manager.py +15 -1
  16. themis/experiment/definitions.py +1 -1
  17. themis/experiment/orchestrator.py +21 -11
  18. themis/experiment/share.py +264 -0
  19. themis/experiment/storage.py +345 -298
  20. themis/generation/router.py +22 -4
  21. themis/generation/runner.py +16 -1
  22. themis/presets/benchmarks.py +602 -17
  23. themis/server/app.py +38 -26
  24. themis/session.py +125 -0
  25. themis/specs/__init__.py +7 -0
  26. themis/specs/execution.py +26 -0
  27. themis/specs/experiment.py +33 -0
  28. themis/specs/storage.py +18 -0
  29. themis/storage/__init__.py +6 -0
  30. themis/storage/experiment_storage.py +7 -0
  31. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
  32. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
  33. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
  34. themis/experiment/builder.py +0 -151
  35. themis/experiment/export_csv.py +0 -159
  36. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
  37. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
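
The headline change in 1.0.0 is the new session/spec surface (themis/session.py plus the themis/specs package) replacing the removed experiment builder and CSV exporter. The sketch below reconstructs that flow from the CLI changes shown later in this diff; the preset attributes and exact keyword signatures are inferred from that usage and are not confirmed public API.

    # Minimal sketch of the vNext flow, mirroring themis/cli/main.py's new eval command.
    from pathlib import Path

    from themis.evaluation.pipeline import EvaluationPipeline
    from themis.presets import get_benchmark_preset
    from themis.session import ExperimentSession
    from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec

    preset = get_benchmark_preset("gsm8k")  # preset attributes used below are assumptions
    pipeline = EvaluationPipeline(extractor=preset.extractor, metrics=preset.metrics)
    spec = ExperimentSpec(
        dataset=preset.load_dataset(limit=100),
        prompt=preset.prompt_template.template,
        model="gpt-4",
        sampling={"temperature": 0.0, "max_tokens": 512},
        pipeline=pipeline,
    )
    report = ExperimentSession().run(
        spec,
        execution=ExecutionSpec(workers=4),
        storage=StorageSpec(path=Path(".cache/experiments"), cache=True),
    )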
themis/cli/main.py CHANGED
@@ -1,16 +1,19 @@
- """Simplified CLI for Themis - Five core commands only.
+ """Simplified CLI for Themis - seven focused commands.

- This is the new unified CLI that leverages the themis.evaluate() API.
- It replaces 20+ commands with 5 essential ones.
+ This is the unified CLI that leverages the themis.evaluate() API.
+ It replaces 20+ commands with a smaller, task-oriented set.
  """

  from __future__ import annotations

+ import os
  import sys
+ from datetime import datetime, timedelta
  from pathlib import Path
  from typing import Annotated, Sequence

  from cyclopts import App, Parameter
+ from themis._version import __version__

  # Import provider modules to ensure they register themselves
  try:
@@ -25,10 +28,24 @@ except ImportError:
  app = App(
  name="themis",
  help="Dead simple LLM evaluation platform",
- version="2.0.0-alpha.1",
+ version=__version__,
  )


+ @app.command
+ def demo(
+ *,
+ model: Annotated[str, Parameter(help="Model identifier")] = "fake-math-llm",
+ limit: Annotated[int, Parameter(help="Maximum number of samples")] = 10,
+ ) -> int:
+ """Run the built-in demo benchmark."""
+ return eval(
+ "demo",
+ model=model,
+ limit=limit,
+ )
+
+
  @app.command
  def eval(
  benchmark_or_dataset: Annotated[str, Parameter(name="BENCHMARK_OR_DATASET", show_default=False)],
@@ -57,7 +74,6 @@ def eval(
  # Distributed execution
  themis eval gsm8k --model gpt-4 --distributed --workers 8
  """
- import themis
  from themis.experiment import export as experiment_export

  print(f"Running evaluation: {benchmark_or_dataset}")
@@ -72,21 +88,47 @@ def eval(
  # TODO: Load dataset from file
  print("Error: Custom dataset files not yet implemented")
  return 1
-
+
  try:
- # Run evaluation using unified API
- report = themis.evaluate(
- benchmark_or_dataset,
+ if distributed:
+ print("Error: distributed execution is not supported in vNext CLI yet")
+ return 1
+
+ from themis.evaluation.pipeline import EvaluationPipeline
+ from themis.generation.templates import PromptTemplate
+ from themis.presets import get_benchmark_preset
+ from themis.session import ExperimentSession
+ from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec
+
+ # Resolve benchmark preset
+ preset = get_benchmark_preset(benchmark_or_dataset)
+
+ dataset = preset.load_dataset(limit=limit)
+
+ if prompt is None:
+ prompt_template = preset.prompt_template
+ else:
+ prompt_template = PromptTemplate(name="custom", template=prompt)
+
+ pipeline = EvaluationPipeline(
+ extractor=preset.extractor,
+ metrics=preset.metrics,
+ )
+
+ spec = ExperimentSpec(
+ dataset=dataset,
+ prompt=prompt_template.template,
  model=model,
- limit=limit,
- prompt=prompt,
- temperature=temperature,
- max_tokens=max_tokens,
- storage=storage,
+ sampling={"temperature": temperature, "max_tokens": max_tokens},
+ pipeline=pipeline,
  run_id=run_id,
- resume=resume,
- distributed=distributed,
- workers=workers,
+ )
+
+ storage_root = _resolve_storage_root(storage)
+ report = ExperimentSession().run(
+ spec,
+ execution=ExecutionSpec(workers=workers),
+ storage=StorageSpec(path=storage_root, cache=resume),
  )

  # Print results
@@ -96,10 +138,18 @@ def eval(

  # Print metrics
  eval_report = report.evaluation_report
- if eval_report and eval_report.aggregates:
+ if eval_report:
  print("\nMetrics:")
- for agg in eval_report.aggregates:
- print(f" {agg.metric_name}: {agg.mean:.4f} (±{agg.std:.4f})")
+ if getattr(eval_report, "aggregates", None):
+ for agg in eval_report.aggregates:
+ std = getattr(agg, "std", None)
+ if std is None:
+ print(f" {agg.metric_name}: {agg.mean:.4f}")
+ else:
+ print(f" {agg.metric_name}: {agg.mean:.4f} (±{std:.4f})")
+ elif getattr(eval_report, "metrics", None):
+ for name, agg in sorted(eval_report.metrics.items()):
+ print(f" {name}: {agg.mean:.4f} (n={agg.count})")

  # Print sample counts
  total = len(report.generation_results)
@@ -113,13 +163,13 @@ def eval(
  suffix = output_path.suffix.lower()

  if suffix == ".csv":
- experiment_export.export_csv(report, output_path)
+ experiment_export.export_report_csv(report, output_path)
  print(f"\nExported to CSV: {output_path}")
  elif suffix == ".json":
- experiment_export.export_json(report, output_path)
+ experiment_export.export_report_json(report, output_path)
  print(f"\nExported to JSON: {output_path}")
  elif suffix in [".html", ".htm"]:
- experiment_export.export_html(report, output_path)
+ experiment_export.export_html_report(report, output_path)
  print(f"\nExported to HTML: {output_path}")
  else:
  print(f"\nWarning: Unknown output format: {suffix}")
@@ -138,6 +188,7 @@ def compare(
  run_ids: Annotated[list[str], Parameter(name="RUN_IDS", show_default=False)],
  *,
  metric: Annotated[str | None, Parameter(help="Metric to compare")] = None,
+ storage: Annotated[str | None, Parameter(help="Storage location (local path or s3://...)")] = None,
  output: Annotated[str | None, Parameter(help="Output file (HTML or Markdown)")] = None,
  show_diff: Annotated[bool, Parameter(help="Show examples where results differ")] = False,
  ) -> int:
@@ -162,7 +213,7 @@ def compare(
  return 1

  # Determine storage path (default to .cache/experiments)
- storage_path = Path(".cache/experiments")
+ storage_path = _resolve_storage_root(storage)

  if not storage_path.exists():
  print(f"Error: Storage path not found: {storage_path}", file=sys.stderr)
@@ -219,6 +270,62 @@ def compare(
  return 1


+ @app.command
+ def share(
+ run_id: Annotated[str, Parameter(name="RUN_ID", show_default=False)],
+ *,
+ storage: Annotated[str | None, Parameter(help="Storage location (defaults to .cache/experiments)")] = None,
+ metric: Annotated[str | None, Parameter(help="Metric to highlight (default: first available)")] = None,
+ output_dir: Annotated[Path, Parameter(help="Directory to write share assets")] = Path("."),
+ ) -> int:
+ """Generate a shareable results badge + Markdown snippet for a run.
+
+ Examples:
+ # Create share assets in current directory
+ themis share run-20260118-032014
+
+ # Highlight a specific metric
+ themis share run-20260118-032014 --metric accuracy
+
+ # Write to a dedicated folder
+ themis share run-20260118-032014 --output-dir share
+ """
+ from themis.experiment.share import create_share_pack
+
+ storage_root = Path(storage) if storage else Path(".cache/experiments")
+ if not storage_root.exists():
+ print(f"Error: Storage path not found: {storage_root}", file=sys.stderr)
+ return 1
+
+ try:
+ share_pack = create_share_pack(
+ run_id=run_id,
+ storage_root=storage_root,
+ output_dir=output_dir,
+ metric=metric,
+ )
+ except FileNotFoundError as e:
+ print(f"Error: {e}", file=sys.stderr)
+ return 1
+ except ValueError as e:
+ print(f"Error: {e}", file=sys.stderr)
+ return 1
+ except Exception as e:
+ print(f"Unexpected error: {e}", file=sys.stderr)
+ import traceback
+ traceback.print_exc()
+ return 1
+
+ print("✓ Share assets created")
+ print(f" SVG: {share_pack.svg_path}")
+ print(f" Markdown: {share_pack.markdown_path}")
+ print("\nSnippet:")
+ print(share_pack.markdown_snippet)
+ if share_pack.event_log_path:
+ print(f"\nEvent logged to: {share_pack.event_log_path}")
+ return 0
+
+
  @app.command
  def serve(
  *,
@@ -254,8 +361,8 @@ def serve(
  print(" or: uv pip install themis[server]", file=sys.stderr)
  return 1

- # Determine storage path
- storage_path = Path(storage) if storage else Path(".cache/experiments")
+ # Determine storage path
+ storage_path = _resolve_storage_root(storage)

  print(f"Starting Themis API server...")
  print(f" URL: http://{host}:{port}")
@@ -284,6 +391,7 @@ def list(
  *,
  storage: Annotated[str | None, Parameter(help="Storage path for runs")] = None,
  limit: Annotated[int | None, Parameter(help="Limit number of results")] = None,
+ verbose: Annotated[bool, Parameter(help="Show detailed information")] = False,
  ) -> int:
  """List runs, benchmarks, or available metrics.

@@ -306,28 +414,65 @@ def list(
  return 1

  if what == "benchmarks":
- from themis.presets import list_benchmarks
+ from themis.presets import get_benchmark_preset, list_benchmarks

  benchmarks = list_benchmarks()
+ if limit is not None:
+ benchmarks = benchmarks[:limit]
  print("Available benchmarks:")
  for benchmark in benchmarks:
- print(f" - {benchmark}")
+ if verbose:
+ preset = get_benchmark_preset(benchmark)
+ description = preset.description or "No description"
+ print(f" - {benchmark}: {description}")
+ else:
+ print(f" - {benchmark}")
  return 0

  elif what == "metrics":
  print("Available metrics:")
+ print(" Core:")
+ print(" - exact_match (no extra dependencies)")
+ print(" - response_length (no extra dependencies)")
  print(" Math:")
- print(" - exact_match")
- print(" - math_verify")
- print(" General:")
- print(" - response_length")
- print("\n Note: NLP and code metrics will be added in Phase 2")
+ print(" - math_verify (requires: themis-eval[math], math-verify)")
+ print(" NLP (requires: themis-eval[nlp]):")
+ print(" - bleu (sacrebleu)")
+ print(" - rouge1 / rouge2 / rougeL (rouge-score)")
+ print(" - bertscore (bert-score)")
+ print(" - meteor (nltk)")
+ print(" Code:")
+ print(" - pass_at_k (no extra dependencies)")
+ print(" - execution_accuracy (no extra dependencies)")
+ print(" - codebleu (requires: themis-eval[code], codebleu)")
+ print("\nInstall extras: pip install themis-eval[math,nlp,code]")
  return 0

  elif what == "runs":
- print("Listing runs...")
- print("Note: Run listing not yet fully implemented")
- return 1
+ from themis.storage import ExperimentStorage
+
+ storage_root = _resolve_storage_root(storage)
+ if not storage_root.exists():
+ print(f"No storage found at {storage_root}")
+ return 1
+
+ storage_backend = ExperimentStorage(storage_root)
+ runs = storage_backend.list_runs(limit=limit)
+ if not runs:
+ print("No runs found.")
+ return 0
+
+ print("Runs:")
+ for run in runs:
+ status = run.status.value if hasattr(run.status, "value") else str(run.status)
+ if verbose:
+ print(
+ f" - {run.run_id} [{status}] samples={run.total_samples} "
+ f"created={run.created_at}"
+ )
+ else:
+ print(f" - {run.run_id}")
+ return 0

  return 0

@@ -348,10 +493,56 @@ def clean(
  # Remove runs older than 30 days
  themis clean --older-than 30
  """
- print("Cleaning storage...")
- print("Note: Storage cleanup not yet implemented")
- print("This will be implemented in Phase 6")
- return 1
+ from themis.storage import ExperimentStorage
+
+ storage_root = _resolve_storage_root(storage)
+ if not storage_root.exists():
+ print(f"No storage found at {storage_root}")
+ return 1
+
+ if older_than is None:
+ print("Error: --older-than is required to clean runs")
+ return 1
+
+ storage_backend = ExperimentStorage(storage_root)
+ runs = storage_backend.list_runs()
+ cutoff = datetime.now() - timedelta(days=older_than)
+
+ candidates = []
+ for run in runs:
+ try:
+ created_at = datetime.fromisoformat(run.created_at)
+ except ValueError:
+ continue
+ if created_at < cutoff:
+ candidates.append(run)
+
+ if not candidates:
+ print("No runs matched the cleanup criteria.")
+ return 0
+
+ if dry_run:
+ print("Runs to delete:")
+ for run in candidates:
+ print(f" - {run.run_id} (created {run.created_at})")
+ return 0
+
+ deleted = 0
+ for run in candidates:
+ storage_backend.delete_run(run.run_id)
+ deleted += 1
+
+ print(f"Deleted {deleted} run(s).")
+ return 0
+
+
+ def _resolve_storage_root(storage: str | None) -> Path:
+ if storage:
+ return Path(storage).expanduser()
+ env_storage = os.getenv("THEMIS_STORAGE")
+ if env_storage:
+ return Path(env_storage).expanduser()
+ return Path(".cache/experiments")


  def _generate_comparison_html(report) -> str:
themis/comparison/engine.py CHANGED
@@ -11,7 +11,7 @@ from typing import Sequence

  from themis.comparison import reports, statistics
  from themis.comparison.statistics import StatisticalTest
- from themis.experiment import storage as experiment_storage
+ from themis.storage import ExperimentStorage


  class ComparisonEngine:
@@ -24,7 +24,7 @@ class ComparisonEngine:
  def __init__(
  self,
  *,
- storage: experiment_storage.ExperimentStorage | None = None,
+ storage: ExperimentStorage | None = None,
  storage_path: str | Path | None = None,
  statistical_test: StatisticalTest = StatisticalTest.BOOTSTRAP,
  alpha: float = 0.05,
@@ -44,7 +44,7 @@ class ComparisonEngine:
  if storage is None and storage_path is None:
  raise ValueError("Either storage or storage_path must be provided")

- self._storage = storage or experiment_storage.ExperimentStorage(storage_path)
+ self._storage = storage or ExperimentStorage(storage_path)
  self._statistical_test = statistical_test
  self._alpha = alpha
  self._n_bootstrap = n_bootstrap
@@ -161,18 +161,12 @@ class ComparisonEngine:

  # eval_dict is a dict, so iterate over values
  for record in eval_dict.values():
- for metric_name, score_obj in record.scores.items():
+ for score_obj in record.scores:
+ metric_name = score_obj.metric_name
  if metric_name not in metric_scores:
  metric_scores[metric_name] = []
-
- # Get numeric score
- if hasattr(score_obj, 'value'):
- score = score_obj.value
- elif isinstance(score_obj, (int, float)):
- score = float(score_obj)
- else:
- continue # Skip non-numeric scores
-
+
+ score = score_obj.value
  metric_scores[metric_name].append(score)

  return metric_scores
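
The engine change above tracks the new score shape: record.scores is now a sequence of score objects exposing metric_name and value, rather than a metric-name-to-score mapping. A minimal sketch of the updated aggregation pattern (the record objects are assumed to look like the ones iterated here):

    def collect_metric_scores(evaluation_records) -> dict[str, list[float]]:
        # Group numeric values per metric, as the updated ComparisonEngine does.
        metric_scores: dict[str, list[float]] = {}
        for record in evaluation_records:
            for score_obj in record.scores:
                metric_scores.setdefault(score_obj.metric_name, []).append(score_obj.value)
        return metric_scores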
themis/core/entities.py CHANGED
@@ -26,6 +26,10 @@ class ModelSpec:
  default_sampling: SamplingConfig | None = None
  metadata: Dict[str, Any] = field(default_factory=dict)

+ @property
+ def model_key(self) -> str:
+ return f"{self.provider}:{self.identifier}"
+

  @dataclass(frozen=True)
  class PromptSpec:
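
The new model_key property gives a stable "provider:identifier" string for ModelSpec. A hedged usage sketch, assuming provider and identifier are the only required constructor fields:

    from themis.core.entities import ModelSpec

    # Hypothetical instance; other ModelSpec fields are left at their defaults.
    spec = ModelSpec(provider="openai", identifier="gpt-4")
    print(spec.model_key)  # "openai:gpt-4"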
themis/evaluation/metric_pipeline.py ADDED
@@ -0,0 +1,12 @@
+ """Primary metric evaluation pipeline for vNext workflows."""
+
+ from __future__ import annotations
+
+ from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
+
+
+ class MetricPipeline(EvaluationPipeline):
+ """Primary evaluation pipeline for vNext (alias of standard pipeline)."""
+
+
+ __all__ = ["MetricPipeline"]
themis/evaluation/pipeline.py CHANGED
@@ -25,22 +25,44 @@ Example (Composable):

  from __future__ import annotations

+ # vNext: protocol definition for evaluation pipelines
+ from typing import Protocol, Sequence, runtime_checkable
+
  # Re-export pipeline implementations for backward compatibility
  from themis.evaluation.pipelines.composable_pipeline import (
  ComposableEvaluationPipeline,
+ ComposableEvaluationReportPipeline,
  EvaluationResult,
  EvaluationStep,
  )
  from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
+ from themis.evaluation.metric_pipeline import MetricPipeline
  from themis.evaluation.reports import (
  EvaluationFailure,
  EvaluationReport,
  MetricAggregate,
  )
+ from themis.core import entities as core_entities
+
+
+ @runtime_checkable
+ class EvaluationPipelineContract(Protocol):
+ """Contract for evaluation pipelines."""
+
+ def evaluate(
+ self, records: Sequence[core_entities.GenerationRecord]
+ ) -> EvaluationReport: # pragma: no cover - protocol
+ ...
+
+ def evaluation_fingerprint(self) -> dict: # pragma: no cover - protocol
+ ...

  __all__ = [
  "EvaluationPipeline",
+ "EvaluationPipelineContract",
+ "MetricPipeline",
  "ComposableEvaluationPipeline",
+ "ComposableEvaluationReportPipeline",
  "EvaluationStep",
  "EvaluationResult",
  "MetricAggregate",
themis/evaluation/pipelines/__init__.py CHANGED
@@ -2,14 +2,18 @@

  from themis.evaluation.pipelines.composable_pipeline import (
  ComposableEvaluationPipeline,
+ ComposableEvaluationReportPipeline,
  EvaluationResult,
  EvaluationStep,
  )
  from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
+ from themis.evaluation.metric_pipeline import MetricPipeline

  __all__ = [
  "EvaluationPipeline",
+ "MetricPipeline",
  "ComposableEvaluationPipeline",
+ "ComposableEvaluationReportPipeline",
  "EvaluationStep",
  "EvaluationResult",
  ]
themis/evaluation/pipelines/composable_pipeline.py CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass, field
  from typing import Any, Callable, Generic, Sequence, TypeVar

  from themis.core import entities as core_entities
+ from themis.evaluation.reports import EvaluationFailure, EvaluationReport, MetricAggregate
  from themis.interfaces import Metric as MetricInterface
  from themis.utils import tracing

@@ -355,3 +356,57 @@ class ComposableEvaluationPipeline:
  """
  self._steps.clear()
  return self
+
+ def evaluation_fingerprint(self) -> dict:
+ """Return a fingerprint based on the configured steps."""
+ return {"steps": self.get_step_names()}
+
+
+ class ComposableEvaluationReportPipeline:
+ """Adapter that makes a ComposableEvaluationPipeline compatible with EvaluationPipeline."""
+
+ def __init__(self, pipeline: ComposableEvaluationPipeline):
+ self._pipeline = pipeline
+
+ def evaluate(
+ self, records: Sequence[core_entities.GenerationRecord]
+ ) -> EvaluationReport:
+ per_metric: dict[str, list[core_entities.MetricScore]] = {}
+ failures: list[EvaluationFailure] = []
+ per_record: list[core_entities.EvaluationRecord] = []
+
+ for record in records:
+ result = self._pipeline.evaluate(record)
+ sample_id = record.task.metadata.get("dataset_id") or record.task.metadata.get(
+ "sample_id"
+ )
+
+ if result.errors:
+ for error in result.errors:
+ failures.append(EvaluationFailure(sample_id=sample_id, message=error))
+
+ for score in result.scores:
+ per_metric.setdefault(score.metric_name, []).append(score)
+
+ per_record.append(
+ core_entities.EvaluationRecord(
+ sample_id=sample_id,
+ scores=result.scores,
+ failures=list(result.errors),
+ )
+ )
+
+ aggregates = {
+ name: MetricAggregate.from_scores(name, scores)
+ for name, scores in per_metric.items()
+ }
+
+ return EvaluationReport(
+ metrics=aggregates,
+ failures=failures,
+ records=per_record,
+ slices={},
+ )
+
+ def evaluation_fingerprint(self) -> dict:
+ return self._pipeline.evaluation_fingerprint()
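
The adapter lets a step-based pipeline stand in wherever a report-producing pipeline is expected. A usage sketch, assuming ComposableEvaluationPipeline can be constructed without arguments and configured with steps afterwards (not shown in this hunk):

    from themis.evaluation.pipelines import (
        ComposableEvaluationPipeline,
        ComposableEvaluationReportPipeline,
    )

    composable = ComposableEvaluationPipeline()  # assumed no-arg constructor
    # ... add extraction/metric steps to `composable` here ...
    report_pipeline = ComposableEvaluationReportPipeline(composable)
    print(report_pipeline.evaluation_fingerprint())  # {"steps": [...]}
    # report = report_pipeline.evaluate(generation_records)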
themis/evaluation/pipelines/standard_pipeline.py CHANGED
@@ -309,6 +309,22 @@ class EvaluationPipeline:
  slices=self._compute_slice_aggregates(per_metric, slice_members),
  )

+ def evaluation_fingerprint(self) -> dict:
+ """Return a deterministic fingerprint for cache invalidation."""
+ config: dict[str, object] = {}
+ config["metrics"] = sorted(
+ [
+ f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+ for metric in self._metrics
+ ]
+ )
+ extractor = self._extractor
+ extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+ config["extractor"] = extractor_type
+ if hasattr(extractor, "field_name"):
+ config["extractor_field"] = extractor.field_name
+ return config
+
  def register_slice(
  self, name: str, fn: Callable[[core_entities.GenerationRecord], bool]
  ) -> None:
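
The fingerprint is a plain dict of metric and extractor identifiers, so a cache layer can decide whether stored evaluations are still valid by comparing dicts. A sketch of that check (the surrounding storage calls are illustrative, not a documented API):

    def cached_evaluations_are_valid(stored_fingerprint: dict, pipeline) -> bool:
        # Reuse cached evaluation records only when the evaluation configuration
        # (metrics + extractor) has not changed since they were written.
        return stored_fingerprint == pipeline.evaluation_fingerprint()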
themis/experiment/__init__.py CHANGED
@@ -1,5 +1,5 @@
  """Experiment orchestration layer."""

- from themis.experiment import builder, export, math, orchestrator, storage
+ from themis.experiment import definitions, export, math, orchestrator, storage

- __all__ = ["math", "orchestrator", "storage", "builder", "export"]
+ __all__ = ["math", "orchestrator", "storage", "definitions", "export"]
themis/experiment/cache_manager.py CHANGED
@@ -81,6 +81,17 @@ class CacheManager:
  return {}
  return self._storage.load_cached_evaluations(run_id, evaluation_config=evaluation_config)

+ def run_metadata_exists(self, run_id: str) -> bool:
+ """Check if run metadata exists in storage."""
+ if self._storage is None:
+ return False
+ return self._storage.run_metadata_exists(run_id)
+
+ def start_run(self, run_id: str, *, experiment_id: str = "default") -> None:
+ """Start a run in storage."""
+ if self._storage is not None:
+ self._storage.start_run(run_id, experiment_id=experiment_id)
+
  def save_generation_record(
  self,
  run_id: str,
@@ -128,7 +139,10 @@ class CacheManager:
  """
  if self._storage is None:
  return None
- return str(self._storage.get_run_path(run_id))
+ run_path = self._storage.get_run_path(run_id)
+ if run_path is None:
+ return None
+ return str(run_path)


  __all__ = ["CacheManager"]
themis/experiment/definitions.py CHANGED
@@ -1,4 +1,4 @@
- """Shared experiment definitions used by the builder."""
+ """Shared experiment definitions for orchestration assembly."""

  from __future__ import annotations