themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +5 -2
- themis/_version.py +14 -1
- themis/api.py +83 -145
- themis/backends/storage.py +5 -0
- themis/cli/commands/info.py +2 -11
- themis/cli/main.py +231 -40
- themis/comparison/engine.py +7 -13
- themis/core/entities.py +4 -0
- themis/evaluation/metric_pipeline.py +12 -0
- themis/evaluation/pipeline.py +22 -0
- themis/evaluation/pipelines/__init__.py +4 -0
- themis/evaluation/pipelines/composable_pipeline.py +55 -0
- themis/evaluation/pipelines/standard_pipeline.py +16 -0
- themis/experiment/__init__.py +2 -2
- themis/experiment/cache_manager.py +15 -1
- themis/experiment/definitions.py +1 -1
- themis/experiment/orchestrator.py +21 -11
- themis/experiment/share.py +264 -0
- themis/experiment/storage.py +345 -298
- themis/generation/router.py +22 -4
- themis/generation/runner.py +16 -1
- themis/presets/benchmarks.py +602 -17
- themis/server/app.py +38 -26
- themis/session.py +125 -0
- themis/specs/__init__.py +7 -0
- themis/specs/execution.py +26 -0
- themis/specs/experiment.py +33 -0
- themis/specs/storage.py +18 -0
- themis/storage/__init__.py +6 -0
- themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
- themis/experiment/builder.py +0 -151
- themis/experiment/export_csv.py +0 -159
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/cli/main.py
CHANGED
@@ -1,16 +1,19 @@
-"""Simplified CLI for Themis -
+"""Simplified CLI for Themis - seven focused commands.
 
-This is the
-It replaces 20+ commands with
+This is the unified CLI that leverages the themis.evaluate() API.
+It replaces 20+ commands with a smaller, task-oriented set.
 """
 
 from __future__ import annotations
 
+import os
 import sys
+from datetime import datetime, timedelta
 from pathlib import Path
 from typing import Annotated, Sequence
 
 from cyclopts import App, Parameter
+from themis._version import __version__
 
 # Import provider modules to ensure they register themselves
 try:
@@ -25,10 +28,24 @@ except ImportError:
 app = App(
     name="themis",
     help="Dead simple LLM evaluation platform",
-    version=
+    version=__version__,
 )
 
 
+@app.command
+def demo(
+    *,
+    model: Annotated[str, Parameter(help="Model identifier")] = "fake-math-llm",
+    limit: Annotated[int, Parameter(help="Maximum number of samples")] = 10,
+) -> int:
+    """Run the built-in demo benchmark."""
+    return eval(
+        "demo",
+        model=model,
+        limit=limit,
+    )
+
+
 @app.command
 def eval(
     benchmark_or_dataset: Annotated[str, Parameter(name="BENCHMARK_OR_DATASET", show_default=False)],
@@ -57,7 +74,6 @@ def eval(
         # Distributed execution
         themis eval gsm8k --model gpt-4 --distributed --workers 8
     """
-    import themis
    from themis.experiment import export as experiment_export
 
    print(f"Running evaluation: {benchmark_or_dataset}")
@@ -72,21 +88,47 @@ def eval(
         # TODO: Load dataset from file
         print("Error: Custom dataset files not yet implemented")
         return 1
-
+
    try:
-
-
-
+        if distributed:
+            print("Error: distributed execution is not supported in vNext CLI yet")
+            return 1
+
+        from themis.evaluation.pipeline import EvaluationPipeline
+        from themis.generation.templates import PromptTemplate
+        from themis.presets import get_benchmark_preset
+        from themis.session import ExperimentSession
+        from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec
+
+        # Resolve benchmark preset
+        preset = get_benchmark_preset(benchmark_or_dataset)
+
+        dataset = preset.load_dataset(limit=limit)
+
+        if prompt is None:
+            prompt_template = preset.prompt_template
+        else:
+            prompt_template = PromptTemplate(name="custom", template=prompt)
+
+        pipeline = EvaluationPipeline(
+            extractor=preset.extractor,
+            metrics=preset.metrics,
+        )
+
+        spec = ExperimentSpec(
+            dataset=dataset,
+            prompt=prompt_template.template,
            model=model,
-
-
-            temperature=temperature,
-            max_tokens=max_tokens,
-            storage=storage,
+            sampling={"temperature": temperature, "max_tokens": max_tokens},
+            pipeline=pipeline,
            run_id=run_id,
-
-
-
+        )
+
+        storage_root = _resolve_storage_root(storage)
+        report = ExperimentSession().run(
+            spec,
+            execution=ExecutionSpec(workers=workers),
+            storage=StorageSpec(path=storage_root, cache=resume),
        )
 
    # Print results
@@ -96,10 +138,18 @@ def eval(
 
    # Print metrics
    eval_report = report.evaluation_report
-    if eval_report
+    if eval_report:
        print("\nMetrics:")
-
-
+        if getattr(eval_report, "aggregates", None):
+            for agg in eval_report.aggregates:
+                std = getattr(agg, "std", None)
+                if std is None:
+                    print(f" {agg.metric_name}: {agg.mean:.4f}")
+                else:
+                    print(f" {agg.metric_name}: {agg.mean:.4f} (±{std:.4f})")
+        elif getattr(eval_report, "metrics", None):
+            for name, agg in sorted(eval_report.metrics.items()):
+                print(f" {name}: {agg.mean:.4f} (n={agg.count})")
 
    # Print sample counts
    total = len(report.generation_results)
@@ -113,13 +163,13 @@ def eval(
        suffix = output_path.suffix.lower()
 
        if suffix == ".csv":
-            experiment_export.
+            experiment_export.export_report_csv(report, output_path)
            print(f"\nExported to CSV: {output_path}")
        elif suffix == ".json":
-            experiment_export.
+            experiment_export.export_report_json(report, output_path)
            print(f"\nExported to JSON: {output_path}")
        elif suffix in [".html", ".htm"]:
-            experiment_export.
+            experiment_export.export_html_report(report, output_path)
            print(f"\nExported to HTML: {output_path}")
        else:
            print(f"\nWarning: Unknown output format: {suffix}")
@@ -138,6 +188,7 @@ def compare(
    run_ids: Annotated[list[str], Parameter(name="RUN_IDS", show_default=False)],
    *,
    metric: Annotated[str | None, Parameter(help="Metric to compare")] = None,
+    storage: Annotated[str | None, Parameter(help="Storage location (local path or s3://...)")] = None,
    output: Annotated[str | None, Parameter(help="Output file (HTML or Markdown)")] = None,
    show_diff: Annotated[bool, Parameter(help="Show examples where results differ")] = False,
 ) -> int:
@@ -162,7 +213,7 @@ def compare(
        return 1
 
    # Determine storage path (default to .cache/experiments)
-    storage_path =
+    storage_path = _resolve_storage_root(storage)
 
    if not storage_path.exists():
        print(f"Error: Storage path not found: {storage_path}", file=sys.stderr)
@@ -219,6 +270,62 @@ def compare(
        return 1
 
 
+@app.command
+def share(
+    run_id: Annotated[str, Parameter(name="RUN_ID", show_default=False)],
+    *,
+    storage: Annotated[str | None, Parameter(help="Storage location (defaults to .cache/experiments)")] = None,
+    metric: Annotated[str | None, Parameter(help="Metric to highlight (default: first available)")] = None,
+    output_dir: Annotated[Path, Parameter(help="Directory to write share assets")] = Path("."),
+) -> int:
+    """Generate a shareable results badge + Markdown snippet for a run.
+
+    Examples:
+        # Create share assets in current directory
+        themis share run-20260118-032014
+
+        # Highlight a specific metric
+        themis share run-20260118-032014 --metric accuracy
+
+        # Write to a dedicated folder
+        themis share run-20260118-032014 --output-dir share
+    """
+    from themis.experiment.share import create_share_pack
+
+    storage_root = Path(storage) if storage else Path(".cache/experiments")
+    if not storage_root.exists():
+        print(f"Error: Storage path not found: {storage_root}", file=sys.stderr)
+        return 1
+
+    try:
+        share_pack = create_share_pack(
+            run_id=run_id,
+            storage_root=storage_root,
+            output_dir=output_dir,
+            metric=metric,
+        )
+    except FileNotFoundError as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
+    except ValueError as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
+    except Exception as e:
+        print(f"Unexpected error: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        return 1
+
+    print("✓ Share assets created")
+    print(f" SVG: {share_pack.svg_path}")
+    print(f" Markdown: {share_pack.markdown_path}")
+    print("\nSnippet:")
+    print(share_pack.markdown_snippet)
+    if share_pack.event_log_path:
+        print(f"\nEvent logged to: {share_pack.event_log_path}")
+    return 0
+
+
 @app.command
 def serve(
    *,
@@ -254,8 +361,8 @@ def serve(
        print(" or: uv pip install themis[server]", file=sys.stderr)
        return 1
 
-
-
+    # Determine storage path
+    storage_path = _resolve_storage_root(storage)
 
    print(f"Starting Themis API server...")
    print(f" URL: http://{host}:{port}")
@@ -284,6 +391,7 @@ def list(
    *,
    storage: Annotated[str | None, Parameter(help="Storage path for runs")] = None,
    limit: Annotated[int | None, Parameter(help="Limit number of results")] = None,
+    verbose: Annotated[bool, Parameter(help="Show detailed information")] = False,
 ) -> int:
    """List runs, benchmarks, or available metrics.
 
@@ -306,28 +414,65 @@ def list(
        return 1
 
    if what == "benchmarks":
-        from themis.presets import list_benchmarks
+        from themis.presets import get_benchmark_preset, list_benchmarks
 
        benchmarks = list_benchmarks()
+        if limit is not None:
+            benchmarks = benchmarks[:limit]
        print("Available benchmarks:")
        for benchmark in benchmarks:
-
+            if verbose:
+                preset = get_benchmark_preset(benchmark)
+                description = preset.description or "No description"
+                print(f" - {benchmark}: {description}")
+            else:
+                print(f" - {benchmark}")
        return 0
 
    elif what == "metrics":
        print("Available metrics:")
+        print(" Core:")
+        print(" - exact_match (no extra dependencies)")
+        print(" - response_length (no extra dependencies)")
        print(" Math:")
-        print(" -
-        print("
-        print("
-        print(" -
-        print("
+        print(" - math_verify (requires: themis-eval[math], math-verify)")
+        print(" NLP (requires: themis-eval[nlp]):")
+        print(" - bleu (sacrebleu)")
+        print(" - rouge1 / rouge2 / rougeL (rouge-score)")
+        print(" - bertscore (bert-score)")
+        print(" - meteor (nltk)")
+        print(" Code:")
+        print(" - pass_at_k (no extra dependencies)")
+        print(" - execution_accuracy (no extra dependencies)")
+        print(" - codebleu (requires: themis-eval[code], codebleu)")
+        print("\nInstall extras: pip install themis-eval[math,nlp,code]")
        return 0
 
    elif what == "runs":
-
-
-
+        from themis.storage import ExperimentStorage
+
+        storage_root = _resolve_storage_root(storage)
+        if not storage_root.exists():
+            print(f"No storage found at {storage_root}")
+            return 1
+
+        storage_backend = ExperimentStorage(storage_root)
+        runs = storage_backend.list_runs(limit=limit)
+        if not runs:
+            print("No runs found.")
+            return 0
+
+        print("Runs:")
+        for run in runs:
+            status = run.status.value if hasattr(run.status, "value") else str(run.status)
+            if verbose:
+                print(
+                    f" - {run.run_id} [{status}] samples={run.total_samples} "
+                    f"created={run.created_at}"
+                )
+            else:
+                print(f" - {run.run_id}")
+        return 0
 
    return 0
 
@@ -348,10 +493,56 @@ def clean(
        # Remove runs older than 30 days
        themis clean --older-than 30
    """
-
-
-
-
+    from themis.storage import ExperimentStorage
+
+    storage_root = _resolve_storage_root(storage)
+    if not storage_root.exists():
+        print(f"No storage found at {storage_root}")
+        return 1
+
+    if older_than is None:
+        print("Error: --older-than is required to clean runs")
+        return 1
+
+    storage_backend = ExperimentStorage(storage_root)
+    runs = storage_backend.list_runs()
+    cutoff = datetime.now() - timedelta(days=older_than)
+
+    candidates = []
+    for run in runs:
+        try:
+            created_at = datetime.fromisoformat(run.created_at)
+        except ValueError:
+            continue
+        if created_at < cutoff:
+            candidates.append(run)
+
+    if not candidates:
+        print("No runs matched the cleanup criteria.")
+        return 0
+
+    if dry_run:
+        print("Runs to delete:")
+        for run in candidates:
+            print(f" - {run.run_id} (created {run.created_at})")
+        return 0
+
+    deleted = 0
+    for run in candidates:
+        storage_backend.delete_run(run.run_id)
+        deleted += 1
+
+    print(f"Deleted {deleted} run(s).")
+    return 0
+
+
+def _resolve_storage_root(storage: str | None) -> Path:
+    if storage:
+        return Path(storage).expanduser()
+    env_storage = os.getenv("THEMIS_STORAGE")
+    if env_storage:
+        return Path(env_storage).expanduser()
+    return Path(".cache/experiments")
 
 
 def _generate_comparison_html(report) -> str:
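The rewritten `eval` command is now a thin wrapper around the vNext spec objects. A minimal sketch of the same flow driven programmatically, reconstructed from the hunk above; the benchmark name, model, sampling values, and run id are illustrative, and defaults not visible in the diff are assumptions:

```python
# Sketch only: mirrors the calls added in themis/cli/main.py above.
from pathlib import Path

from themis.evaluation.pipeline import EvaluationPipeline
from themis.presets import get_benchmark_preset
from themis.session import ExperimentSession
from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec

preset = get_benchmark_preset("gsm8k")      # resolve a benchmark preset
dataset = preset.load_dataset(limit=100)    # cap the number of samples

spec = ExperimentSpec(
    dataset=dataset,
    prompt=preset.prompt_template.template,
    model="gpt-4",
    sampling={"temperature": 0.0, "max_tokens": 512},
    pipeline=EvaluationPipeline(extractor=preset.extractor, metrics=preset.metrics),
    run_id="run-example",
)

report = ExperimentSession().run(
    spec,
    execution=ExecutionSpec(workers=4),
    storage=StorageSpec(path=Path(".cache/experiments"), cache=True),
)
print(report.evaluation_report)
```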
themis/comparison/engine.py
CHANGED
@@ -11,7 +11,7 @@ from typing import Sequence
 
 from themis.comparison import reports, statistics
 from themis.comparison.statistics import StatisticalTest
-from themis.
+from themis.storage import ExperimentStorage
 
 
 class ComparisonEngine:
@@ -24,7 +24,7 @@ class ComparisonEngine:
    def __init__(
        self,
        *,
-        storage:
+        storage: ExperimentStorage | None = None,
        storage_path: str | Path | None = None,
        statistical_test: StatisticalTest = StatisticalTest.BOOTSTRAP,
        alpha: float = 0.05,
@@ -44,7 +44,7 @@ class ComparisonEngine:
        if storage is None and storage_path is None:
            raise ValueError("Either storage or storage_path must be provided")
 
-        self._storage = storage or
+        self._storage = storage or ExperimentStorage(storage_path)
        self._statistical_test = statistical_test
        self._alpha = alpha
        self._n_bootstrap = n_bootstrap
@@ -161,18 +161,12 @@ class ComparisonEngine:
 
        # eval_dict is a dict, so iterate over values
        for record in eval_dict.values():
-            for
+            for score_obj in record.scores:
+                metric_name = score_obj.metric_name
                if metric_name not in metric_scores:
                    metric_scores[metric_name] = []
-
-
-                if hasattr(score_obj, 'value'):
-                    score = score_obj.value
-                elif isinstance(score_obj, (int, float)):
-                    score = float(score_obj)
-                else:
-                    continue  # Skip non-numeric scores
-
+
+                score = score_obj.value
                metric_scores[metric_name].append(score)
 
        return metric_scores
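The comparison engine now resolves its backend from the unified `themis.storage` package. A minimal construction sketch using only the constructor arguments visible in the hunk above:

```python
# Sketch: arguments are those shown in the diff; compare/report calls are omitted.
from themis.comparison.engine import ComparisonEngine
from themis.comparison.statistics import StatisticalTest
from themis.storage import ExperimentStorage

# Either pass a ready-made storage backend ...
engine = ComparisonEngine(storage=ExperimentStorage(".cache/experiments"))

# ... or let the engine build one from a path.
engine = ComparisonEngine(
    storage_path=".cache/experiments",
    statistical_test=StatisticalTest.BOOTSTRAP,
    alpha=0.05,
)
```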
themis/core/entities.py
CHANGED
@@ -26,6 +26,10 @@ class ModelSpec:
    default_sampling: SamplingConfig | None = None
    metadata: Dict[str, Any] = field(default_factory=dict)
 
+    @property
+    def model_key(self) -> str:
+        return f"{self.provider}:{self.identifier}"
+
 
 @dataclass(frozen=True)
 class PromptSpec:
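A short usage note for the new property; this assumes `provider` and `identifier` are enough to construct a `ModelSpec`, which the hunk above does not confirm:

```python
# Hypothetical usage of ModelSpec.model_key; extra required fields, if any, are not shown in the diff.
from themis.core.entities import ModelSpec

spec = ModelSpec(provider="openai", identifier="gpt-4")
assert spec.model_key == "openai:gpt-4"
```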
themis/evaluation/metric_pipeline.py
ADDED
@@ -0,0 +1,12 @@
+"""Primary metric evaluation pipeline for vNext workflows."""
+
+from __future__ import annotations
+
+from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
+
+
+class MetricPipeline(EvaluationPipeline):
+    """Primary evaluation pipeline for vNext (alias of standard pipeline)."""
+
+
+__all__ = ["MetricPipeline"]
themis/evaluation/pipeline.py
CHANGED
@@ -25,22 +25,44 @@ Example (Composable):
 
 from __future__ import annotations
 
+# vNext: protocol definition for evaluation pipelines
+from typing import Protocol, Sequence, runtime_checkable
+
 # Re-export pipeline implementations for backward compatibility
 from themis.evaluation.pipelines.composable_pipeline import (
    ComposableEvaluationPipeline,
+    ComposableEvaluationReportPipeline,
    EvaluationResult,
    EvaluationStep,
 )
 from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
+from themis.evaluation.metric_pipeline import MetricPipeline
 from themis.evaluation.reports import (
    EvaluationFailure,
    EvaluationReport,
    MetricAggregate,
 )
+from themis.core import entities as core_entities
+
+
+@runtime_checkable
+class EvaluationPipelineContract(Protocol):
+    """Contract for evaluation pipelines."""
+
+    def evaluate(
+        self, records: Sequence[core_entities.GenerationRecord]
+    ) -> EvaluationReport:  # pragma: no cover - protocol
+        ...
+
+    def evaluation_fingerprint(self) -> dict:  # pragma: no cover - protocol
+        ...
 
 __all__ = [
    "EvaluationPipeline",
+    "EvaluationPipelineContract",
+    "MetricPipeline",
    "ComposableEvaluationPipeline",
+    "ComposableEvaluationReportPipeline",
    "EvaluationStep",
    "EvaluationResult",
    "MetricAggregate",
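Because `EvaluationPipelineContract` is a `runtime_checkable` `Protocol`, any object exposing `evaluate()` and `evaluation_fingerprint()` satisfies it structurally. A minimal sketch; the no-op report below is illustrative only, with the `EvaluationReport` fields taken from the adapter added later in this diff:

```python
from typing import Sequence

from themis.core import entities as core_entities
from themis.evaluation.pipeline import EvaluationPipelineContract
from themis.evaluation.reports import EvaluationReport


class NoOpPipeline:
    """Illustrative pipeline that returns an empty report."""

    def evaluate(
        self, records: Sequence[core_entities.GenerationRecord]
    ) -> EvaluationReport:
        return EvaluationReport(metrics={}, failures=[], records=[], slices={})

    def evaluation_fingerprint(self) -> dict:
        return {"pipeline": "noop"}


# isinstance only checks that both methods exist (structural typing).
assert isinstance(NoOpPipeline(), EvaluationPipelineContract)
```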
themis/evaluation/pipelines/__init__.py
CHANGED
@@ -2,14 +2,18 @@
 
 from themis.evaluation.pipelines.composable_pipeline import (
    ComposableEvaluationPipeline,
+    ComposableEvaluationReportPipeline,
    EvaluationResult,
    EvaluationStep,
 )
 from themis.evaluation.pipelines.standard_pipeline import EvaluationPipeline
+from themis.evaluation.metric_pipeline import MetricPipeline
 
 __all__ = [
    "EvaluationPipeline",
+    "MetricPipeline",
    "ComposableEvaluationPipeline",
+    "ComposableEvaluationReportPipeline",
    "EvaluationStep",
    "EvaluationResult",
 ]
themis/evaluation/pipelines/composable_pipeline.py
CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass, field
 from typing import Any, Callable, Generic, Sequence, TypeVar
 
 from themis.core import entities as core_entities
+from themis.evaluation.reports import EvaluationFailure, EvaluationReport, MetricAggregate
 from themis.interfaces import Metric as MetricInterface
 from themis.utils import tracing
 
@@ -355,3 +356,57 @@ class ComposableEvaluationPipeline:
        """
        self._steps.clear()
        return self
+
+    def evaluation_fingerprint(self) -> dict:
+        """Return a fingerprint based on the configured steps."""
+        return {"steps": self.get_step_names()}
+
+
+class ComposableEvaluationReportPipeline:
+    """Adapter that makes a ComposableEvaluationPipeline compatible with EvaluationPipeline."""
+
+    def __init__(self, pipeline: ComposableEvaluationPipeline):
+        self._pipeline = pipeline
+
+    def evaluate(
+        self, records: Sequence[core_entities.GenerationRecord]
+    ) -> EvaluationReport:
+        per_metric: dict[str, list[core_entities.MetricScore]] = {}
+        failures: list[EvaluationFailure] = []
+        per_record: list[core_entities.EvaluationRecord] = []
+
+        for record in records:
+            result = self._pipeline.evaluate(record)
+            sample_id = record.task.metadata.get("dataset_id") or record.task.metadata.get(
+                "sample_id"
+            )
+
+            if result.errors:
+                for error in result.errors:
+                    failures.append(EvaluationFailure(sample_id=sample_id, message=error))
+
+            for score in result.scores:
+                per_metric.setdefault(score.metric_name, []).append(score)
+
+            per_record.append(
+                core_entities.EvaluationRecord(
+                    sample_id=sample_id,
+                    scores=result.scores,
+                    failures=list(result.errors),
+                )
+            )
+
+        aggregates = {
+            name: MetricAggregate.from_scores(name, scores)
+            for name, scores in per_metric.items()
+        }
+
+        return EvaluationReport(
+            metrics=aggregates,
+            failures=failures,
+            records=per_record,
+            slices={},
+        )
+
+    def evaluation_fingerprint(self) -> dict:
+        return self._pipeline.evaluation_fingerprint()
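The adapter lets a step-based composable pipeline be used anywhere a report-producing pipeline is expected. A brief sketch; step registration itself is part of the existing (pre-1.0.0) API and is not shown in this diff, so the bare constructor call below is an assumption:

```python
from themis.evaluation.pipelines.composable_pipeline import (
    ComposableEvaluationPipeline,
    ComposableEvaluationReportPipeline,
)

# Assumed: the composable pipeline is built and steps are added via the
# existing API, which this diff does not modify.
composable = ComposableEvaluationPipeline()
report_pipeline = ComposableEvaluationReportPipeline(composable)

# report_pipeline.evaluate(records) returns an EvaluationReport, and
# report_pipeline.evaluation_fingerprint() delegates to the wrapped pipeline,
# i.e. {"steps": composable.get_step_names()}.
```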
themis/evaluation/pipelines/standard_pipeline.py
CHANGED
@@ -309,6 +309,22 @@ class EvaluationPipeline:
            slices=self._compute_slice_aggregates(per_metric, slice_members),
        )
 
+    def evaluation_fingerprint(self) -> dict:
+        """Return a deterministic fingerprint for cache invalidation."""
+        config: dict[str, object] = {}
+        config["metrics"] = sorted(
+            [
+                f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+                for metric in self._metrics
+            ]
+        )
+        extractor = self._extractor
+        extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+        config["extractor"] = extractor_type
+        if hasattr(extractor, "field_name"):
+            config["extractor_field"] = extractor.field_name
+        return config
+
    def register_slice(
        self, name: str, fn: Callable[[core_entities.GenerationRecord], bool]
    ) -> None:
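The fingerprint keys produced by the standard pipeline are `metrics`, `extractor`, and optionally `extractor_field`. An illustrative value; the class paths below are placeholders, not real themis classes:

```python
# Placeholder example of the dict returned by EvaluationPipeline.evaluation_fingerprint().
fingerprint = {
    "metrics": ["some_module.ExactMatchMetric:exact_match"],
    "extractor": "some_module.FieldExtractor",
    "extractor_field": "answer",  # present only when the extractor exposes field_name
}
```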
themis/experiment/__init__.py
CHANGED
@@ -1,5 +1,5 @@
 """Experiment orchestration layer."""
 
-from themis.experiment import
+from themis.experiment import definitions, export, math, orchestrator, storage
 
-__all__ = ["math", "orchestrator", "storage", "
+__all__ = ["math", "orchestrator", "storage", "definitions", "export"]
themis/experiment/cache_manager.py
CHANGED
@@ -81,6 +81,17 @@ class CacheManager:
            return {}
        return self._storage.load_cached_evaluations(run_id, evaluation_config=evaluation_config)
 
+    def run_metadata_exists(self, run_id: str) -> bool:
+        """Check if run metadata exists in storage."""
+        if self._storage is None:
+            return False
+        return self._storage.run_metadata_exists(run_id)
+
+    def start_run(self, run_id: str, *, experiment_id: str = "default") -> None:
+        """Start a run in storage."""
+        if self._storage is not None:
+            self._storage.start_run(run_id, experiment_id=experiment_id)
+
    def save_generation_record(
        self,
        run_id: str,
@@ -128,7 +139,10 @@ class CacheManager:
        """
        if self._storage is None:
            return None
-
+        run_path = self._storage.get_run_path(run_id)
+        if run_path is None:
+            return None
+        return str(run_path)
 
 
 __all__ = ["CacheManager"]
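A hedged sketch of how an orchestrator might use the two new hooks when preparing a run; the surrounding resume flow is an assumption, and only `run_metadata_exists` and `start_run` come from the diff above:

```python
from themis.experiment.cache_manager import CacheManager


def ensure_run_started(cache: CacheManager, run_id: str) -> None:
    # Create run metadata only if it is not already present in storage.
    if not cache.run_metadata_exists(run_id):
        cache.start_run(run_id, experiment_id="default")
```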
themis/experiment/definitions.py
CHANGED