themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. themis/__init__.py +5 -2
  2. themis/_version.py +14 -1
  3. themis/api.py +83 -145
  4. themis/backends/storage.py +5 -0
  5. themis/cli/commands/info.py +2 -11
  6. themis/cli/main.py +231 -40
  7. themis/comparison/engine.py +7 -13
  8. themis/core/entities.py +4 -0
  9. themis/evaluation/metric_pipeline.py +12 -0
  10. themis/evaluation/pipeline.py +22 -0
  11. themis/evaluation/pipelines/__init__.py +4 -0
  12. themis/evaluation/pipelines/composable_pipeline.py +55 -0
  13. themis/evaluation/pipelines/standard_pipeline.py +16 -0
  14. themis/experiment/__init__.py +2 -2
  15. themis/experiment/cache_manager.py +15 -1
  16. themis/experiment/definitions.py +1 -1
  17. themis/experiment/orchestrator.py +21 -11
  18. themis/experiment/share.py +264 -0
  19. themis/experiment/storage.py +345 -298
  20. themis/generation/router.py +22 -4
  21. themis/generation/runner.py +16 -1
  22. themis/presets/benchmarks.py +602 -17
  23. themis/server/app.py +38 -26
  24. themis/session.py +125 -0
  25. themis/specs/__init__.py +7 -0
  26. themis/specs/execution.py +26 -0
  27. themis/specs/experiment.py +33 -0
  28. themis/specs/storage.py +18 -0
  29. themis/storage/__init__.py +6 -0
  30. themis/storage/experiment_storage.py +7 -0
  31. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
  32. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
  33. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
  34. themis/experiment/builder.py +0 -151
  35. themis/experiment/export_csv.py +0 -159
  36. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
  37. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/server/app.py CHANGED
@@ -17,7 +17,8 @@ from pydantic import BaseModel, Field
 
 from themis.comparison import compare_runs
 from themis.comparison.statistics import StatisticalTest
-from themis.experiment.storage import ExperimentStorage
+from themis.storage import ExperimentStorage
+from themis._version import __version__
 
 
 class RunSummary(BaseModel):
@@ -71,7 +72,7 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     app = FastAPI(
         title="Themis API",
         description="REST API for Themis experiment management",
-        version="2.0.0",
+        version=__version__,
     )
 
     # Enable CORS for web dashboard
@@ -117,13 +118,16 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
         return {
             "status": "ok",
             "service": "themis-api",
-            "version": "2.0.0",
+            "version": __version__,
         }
 
     @app.get("/api/runs", response_model=List[RunSummary], tags=["runs"])
     async def list_runs():
         """List all experiment runs."""
-        run_ids = storage.list_runs()
+        run_entries = storage.list_runs()
+        run_ids = [
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        ]
 
         summaries = []
         for run_id in run_ids:
@@ -133,15 +137,12 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
             # Calculate average metrics
             metrics_dict: Dict[str, List[float]] = {}
             for record in eval_records.values():
-                for metric_name, score_obj in record.scores.items():
+                for score_obj in record.scores:
+                    metric_name = score_obj.metric_name
                     if metric_name not in metrics_dict:
                         metrics_dict[metric_name] = []
-
-                    # Extract numeric score
-                    if hasattr(score_obj, 'value'):
-                        metrics_dict[metric_name].append(score_obj.value)
-                    elif isinstance(score_obj, (int, float)):
-                        metrics_dict[metric_name].append(float(score_obj))
+
+                    metrics_dict[metric_name].append(score_obj.value)
 
             # Average metrics
             avg_metrics = {
@@ -162,7 +163,11 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     @app.get("/api/runs/{run_id}", response_model=RunDetail, tags=["runs"])
     async def get_run(run_id: str):
         """Get detailed information about a run."""
-        if run_id not in storage.list_runs():
+        run_entries = storage.list_runs()
+        run_ids = [
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        ]
+        if run_id not in run_ids:
             raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")
 
         # Load records
@@ -179,25 +184,25 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
 
             # Extract scores
             scores = {}
-            for metric_name, score_obj in eval_record.scores.items():
-                if hasattr(score_obj, 'value'):
-                    value = score_obj.value
-                elif isinstance(score_obj, (int, float)):
-                    value = float(score_obj)
-                else:
-                    continue
-
+            for score_obj in eval_record.scores:
+                metric_name = score_obj.metric_name
+                value = score_obj.value
+
                 scores[metric_name] = value
-
+
                 if metric_name not in metrics_dict:
                     metrics_dict[metric_name] = []
                 metrics_dict[metric_name].append(value)
 
             # Build sample
+            sample_id = eval_record.sample_id
+            if sample_id is None and gen_record is not None:
+                sample_id = gen_record.task.metadata.get("dataset_id")
+
             sample = {
-                "id": gen_record.id if gen_record else cache_key,
-                "prompt": gen_record.prompt if gen_record else "",
-                "response": gen_record.response if gen_record else "",
+                "id": sample_id or cache_key,
+                "prompt": gen_record.task.prompt.text if gen_record else "",
+                "response": gen_record.output.text if gen_record and gen_record.output else "",
                 "scores": scores,
             }
             samples.append(sample)
@@ -220,7 +225,11 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     @app.delete("/api/runs/{run_id}", tags=["runs"])
     async def delete_run(run_id: str):
         """Delete a run."""
-        if run_id not in storage.list_runs():
+        run_entries = storage.list_runs()
+        run_ids = [
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        ]
+        if run_id not in run_ids:
             raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")
 
         # Note: Current storage doesn't implement delete
@@ -234,7 +243,10 @@ def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
     async def compare_runs_api(request: ComparisonRequest):
         """Compare multiple runs."""
         # Validate runs exist
-        existing_runs = set(storage.list_runs())
+        run_entries = storage.list_runs()
+        existing_runs = set(
+            entry.run_id if hasattr(entry, "run_id") else entry for entry in run_entries
+        )
         for run_id in request.run_ids:
             if run_id not in existing_runs:
                 raise HTTPException(
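The endpoints above now normalize whatever `storage.list_runs()` returns (rich entries or plain run-id strings) and report the real package version from `themis._version`. A minimal smoke-test sketch, not part of the diff, assuming the health route is mounted at `/health` and using a hypothetical storage path:

```python
# Illustrative sketch only; the /health path and storage path are assumptions.
from fastapi.testclient import TestClient

from themis.server.app import create_app

app = create_app(storage_path=".cache/experiments")
client = TestClient(app)

response = client.get("/health")
assert response.status_code == 200
# "version" is now populated from themis._version.__version__ instead of a hard-coded "2.0.0".
print(response.json()["version"])
```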
themis/session.py ADDED
@@ -0,0 +1,125 @@
+"""Experiment session orchestration for vNext workflows."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, Sequence
+
+from themis.core.entities import ExperimentReport, ModelSpec, SamplingConfig
+from themis.evaluation.pipeline import EvaluationPipelineContract
+from themis.experiment.orchestrator import ExperimentOrchestrator
+from themis.generation.plan import GenerationPlan
+from themis.generation.router import ProviderRouter
+from themis.generation.runner import GenerationRunner
+from themis.generation.templates import PromptTemplate
+from themis.interfaces import DatasetAdapter
+from themis.presets import parse_model_name
+from themis.providers import create_provider
+from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec
+
+
+@dataclass
+class ExperimentSession:
+    """Main entry point for running experiments with vNext specs."""
+
+    def run(
+        self,
+        spec: ExperimentSpec,
+        *,
+        execution: ExecutionSpec | None = None,
+        storage: StorageSpec | None = None,
+    ) -> ExperimentReport:
+        execution = execution or ExecutionSpec()
+        storage = storage or StorageSpec()
+
+        pipeline = spec.pipeline
+        if not isinstance(pipeline, EvaluationPipelineContract):
+            raise TypeError(
+                "spec.pipeline must implement EvaluationPipelineContract."
+            )
+
+        dataset = _resolve_dataset(spec.dataset)
+
+        provider_name, model_id, provider_options = _parse_model(spec.model)
+        model_spec = ModelSpec(identifier=model_id, provider=provider_name)
+        sampling = _build_sampling(spec.sampling)
+
+        plan = GenerationPlan(
+            templates=[PromptTemplate(name="default", template=spec.prompt)],
+            models=[model_spec],
+            sampling_parameters=[sampling],
+            dataset_id_field="id",
+            reference_field="answer",
+        )
+
+        provider = create_provider(provider_name, **provider_options)
+        router = ProviderRouter({(provider_name, model_id): provider})
+
+        runner = GenerationRunner(
+            provider=router,
+            max_parallel=execution.workers,
+            max_retries=execution.max_retries,
+            retry_initial_delay=execution.retry_initial_delay,
+            retry_backoff_multiplier=execution.retry_backoff_multiplier,
+            retry_max_delay=execution.retry_max_delay,
+            execution_backend=execution.backend,
+        )
+
+        storage_backend = _resolve_storage(storage)
+
+        orchestrator = ExperimentOrchestrator(
+            generation_plan=plan,
+            generation_runner=runner,
+            evaluation_pipeline=pipeline,
+            storage=storage_backend,
+        )
+
+        return orchestrator.run(
+            dataset=dataset,
+            run_id=spec.run_id,
+            resume=storage.cache,
+            cache_results=storage.cache,
+        )
+
+
+def _parse_model(model: str) -> tuple[str, str, dict]:
+    if ":" in model:
+        provider_name, model_id = model.split(":", 1)
+        return provider_name, model_id, {}
+    return parse_model_name(model)
+
+
+def _build_sampling(data: dict) -> SamplingConfig:
+    return SamplingConfig(
+        temperature=float(data.get("temperature", 0.0)),
+        top_p=float(data.get("top_p", 0.95)),
+        max_tokens=int(data.get("max_tokens", 512)),
+    )
+
+
+def _resolve_dataset(dataset: object) -> list[dict]:
+    if isinstance(dataset, DatasetAdapter):
+        return list(dataset.iter_samples())
+    if isinstance(dataset, Iterable):
+        return list(dataset)  # type: ignore[arg-type]
+    raise TypeError("spec.dataset must be iterable or implement DatasetAdapter.")
+
+
+def _resolve_storage(storage: StorageSpec):
+    if storage.backend is not None:
+        backend = storage.backend
+        if hasattr(backend, "experiment_storage"):
+            return backend.experiment_storage
+        if not hasattr(backend, "start_run"):
+            raise TypeError(
+                "storage.backend must be ExperimentStorage-compatible."
+            )
+        return backend
+    root = Path(storage.path) if storage.path is not None else Path(".cache/experiments")
+    from themis.storage import ExperimentStorage
+
+    return ExperimentStorage(root)
+
+
+__all__ = ["ExperimentSession"]
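Together with the spec modules that follow, this module is the new single entry point for vNext runs. A usage sketch, not part of the diff: the dataset rows, prompt text, run id, and `"provider:model"` string are hypothetical, and `pipeline` is a placeholder for any object implementing `EvaluationPipelineContract`.

```python
# Illustrative sketch only; names marked below are assumptions, not package content.
from themis.session import ExperimentSession
from themis.specs import ExecutionSpec, ExperimentSpec, StorageSpec

pipeline = ...  # placeholder: an EvaluationPipelineContract implementation built elsewhere

spec = ExperimentSpec(
    dataset=[{"id": "1", "question": "What is 2 + 2?", "answer": "4"}],  # hypothetical rows
    prompt="Answer the question: {question}",  # hypothetical template text
    model="openai:gpt-4",  # "provider:model" strings are split on the first ":"
    sampling={"temperature": 0.0, "top_p": 0.95, "max_tokens": 512},
    pipeline=pipeline,
    run_id="demo-run",
)

report = ExperimentSession().run(
    spec,
    execution=ExecutionSpec(workers=4, max_retries=3),
    storage=StorageSpec(path=".cache/experiments", cache=True),
)
```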
themis/specs/__init__.py ADDED
@@ -0,0 +1,7 @@
+"""Specification models for vNext workflows."""
+
+from themis.specs.experiment import ExperimentSpec
+from themis.specs.execution import ExecutionSpec
+from themis.specs.storage import StorageSpec
+
+__all__ = ["ExperimentSpec", "ExecutionSpec", "StorageSpec"]
themis/specs/execution.py ADDED
@@ -0,0 +1,26 @@
+"""Execution specification for vNext workflows."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class ExecutionSpec:
+    """Execution configuration for running experiments."""
+
+    backend: object | None = None
+    workers: int = 4
+    max_retries: int = 3
+    retry_initial_delay: float = 0.5
+    retry_backoff_multiplier: float = 2.0
+    retry_max_delay: float | None = 2.0
+
+    def __post_init__(self) -> None:
+        if self.workers < 1:
+            raise ValueError("ExecutionSpec.workers must be >= 1.")
+        if self.max_retries < 1:
+            raise ValueError("ExecutionSpec.max_retries must be >= 1.")
+
+
+__all__ = ["ExecutionSpec"]
themis/specs/experiment.py ADDED
@@ -0,0 +1,33 @@
+"""Experiment specification for vNext workflows."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Mapping
+
+
+@dataclass(frozen=True)
+class ExperimentSpec:
+    """Canonical experiment specification.
+
+    This spec is the single source of truth for the experiment's
+    dataset, prompt, model, sampling config, and evaluation pipeline.
+    """
+
+    dataset: object
+    prompt: str
+    model: str
+    sampling: Mapping[str, Any] = field(default_factory=dict)
+    pipeline: object | None = None
+    run_id: str | None = None
+
+    def __post_init__(self) -> None:
+        if not self.prompt:
+            raise ValueError("ExperimentSpec.prompt must be a non-empty string.")
+        if not self.model:
+            raise ValueError("ExperimentSpec.model must be a non-empty string.")
+        if self.pipeline is None:
+            raise ValueError("ExperimentSpec.pipeline must be provided.")
+
+
+__all__ = ["ExperimentSpec"]
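Because the spec is a frozen dataclass with `__post_init__` validation, malformed specs fail fast at construction time. A small sketch of that behaviour, not part of the diff:

```python
# Illustrative sketch only, based on the validation rules shown above.
from themis.specs import ExperimentSpec

try:
    # Empty prompt: rejected before anything runs.
    ExperimentSpec(dataset=[], prompt="", model="gpt-4")
except ValueError as err:
    print(err)  # ExperimentSpec.prompt must be a non-empty string.
```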
themis/specs/storage.py ADDED
@@ -0,0 +1,18 @@
+"""Storage specification for vNext workflows."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class StorageSpec:
+    """Storage configuration for experiment persistence and caching."""
+
+    backend: object | None = None
+    path: str | Path | None = None
+    cache: bool = True
+
+
+__all__ = ["StorageSpec"]
themis/storage/__init__.py ADDED
@@ -0,0 +1,6 @@
+"""Storage backends and adapters for vNext workflows."""
+
+from themis.backends.storage import LocalFileStorageBackend, StorageBackend
+from themis.storage.experiment_storage import ExperimentStorage
+
+__all__ = ["StorageBackend", "LocalFileStorageBackend", "ExperimentStorage"]
themis/storage/experiment_storage.py ADDED
@@ -0,0 +1,7 @@
+"""Storage adapter module for vNext workflows."""
+
+from __future__ import annotations
+
+from themis.experiment.storage import ExperimentStorage as ExperimentStorage
+
+__all__ = ["ExperimentStorage"]
{themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.3
+Version: 1.0.0
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -100,13 +100,14 @@ pip install themis-eval[math,nlp,code,server]
 from themis import evaluate
 
 # Evaluate any model on any benchmark
-result = evaluate(
-    benchmark="gsm8k",
+report = evaluate(
+    "gsm8k",
     model="gpt-4",
-    limit=100
+    limit=100,
 )
 
-print(f"Accuracy: {result.metrics['exact_match']:.2%}")
+accuracy = report.evaluation_report.metrics["ExactMatch"].mean
+print(f"Accuracy: {accuracy:.2%}")
 ```
 
 ### CLI Usage
@@ -122,6 +123,9 @@ themis compare gpt4-run claude-run
 
 # Start web dashboard
 themis serve
+
+# Share a run
+themis share gpt4-run --output-dir share
 ```
 
 ---
@@ -130,20 +134,28 @@ themis serve
 
 ### 🎯 Built-in Benchmarks
 
-Themis includes 6 popular benchmarks out-of-the-box:
+Themis includes 19 built-in benchmarks out-of-the-box:
 
 ```python
 # Math reasoning
-evaluate(benchmark="gsm8k", model="gpt-4", limit=100)
-evaluate(benchmark="math500", model="gpt-4", limit=50)
-evaluate(benchmark="aime24", model="gpt-4")
+evaluate("gsm8k", model="gpt-4", limit=100)
+evaluate("math500", model="gpt-4", limit=50)
+evaluate("aime24", model="gpt-4")
 
 # General knowledge
-evaluate(benchmark="mmlu_pro", model="gpt-4", limit=1000)
-evaluate(benchmark="supergpqa", model="gpt-4")
+evaluate("mmlu-pro", model="gpt-4", limit=1000)
+evaluate("supergpqa", model="gpt-4")
+
+# Science & medical
+evaluate("gpqa", model="gpt-4", limit=200)
+evaluate("medmcqa", model="gpt-4", limit=200)
+
+# Commonsense & conversational
+evaluate("commonsense_qa", model="gpt-4", limit=200)
+evaluate("coqa", model="gpt-4", limit=200)
 
 # Quick testing
-evaluate(benchmark="demo", model="fake-math-llm", limit=10)
+evaluate("demo", model="fake-math-llm", limit=10)
 ```
 
 **See all available benchmarks:**
@@ -165,8 +177,7 @@ themis list benchmarks
 
 ```python
 # Use specific metrics
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     metrics=["exact_match", "bleu", "rouge1"],
 )
@@ -192,7 +203,7 @@ print(report.summary())
 
 **CLI:**
 ```bash
-themis compare run-1 run-2 --test bootstrap --output comparison.html
+themis compare run-1 run-2 --output comparison.html
 ```
 
 ### 🌐 Web Dashboard
@@ -218,19 +229,19 @@ Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider sup
 
 ```python
 # OpenAI
-evaluate(benchmark="gsm8k", model="gpt-4")
+evaluate("gsm8k", model="gpt-4")
 
 # Anthropic
-evaluate(benchmark="gsm8k", model="claude-3-opus-20240229")
+evaluate("gsm8k", model="claude-3-opus-20240229")
 
 # Azure OpenAI
-evaluate(benchmark="gsm8k", model="azure/gpt-4")
+evaluate("gsm8k", model="azure/gpt-4")
 
 # Local models (vLLM, Ollama, etc.)
-evaluate(benchmark="gsm8k", model="ollama/llama3")
+evaluate("gsm8k", model="ollama/llama3")
 
 # AWS Bedrock
-evaluate(benchmark="gsm8k", model="bedrock/anthropic.claude-3")
+evaluate("gsm8k", model="bedrock/anthropic.claude-3")
 ```
 
 ### 💾 Smart Caching
@@ -239,8 +250,7 @@ Themis automatically caches results and resumes failed runs:
 
 ```python
 # Run with caching
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     limit=1000,
     run_id="my-experiment",
@@ -275,14 +285,13 @@ result = evaluate(
     metrics=["exact_match"],
 )
 
-print(result.report)
+print(result.evaluation_report.metrics["ExactMatch"].mean)
 ```
 
 ### Advanced Configuration
 
 ```python
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     temperature=0.7,
     max_tokens=512,
@@ -335,7 +344,7 @@ Themis is built on a clean, modular architecture:
        │              │
   ┌────▼─────┐   ┌────▼─────┐
   │Benchmarks│   │Evaluation│
-  │(6 built- │   │ Pipeline │
+  │(19 built-│   │ Pipeline │
   │   in)    │   └────┬─────┘
   └──────────┘        │
                  ┌────▼─────┐
@@ -359,7 +368,7 @@
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[Backends API](docs/api/backends.md)** - Custom storage and execution
 - **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
 - **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
@@ -382,14 +391,13 @@ class S3StorageBackend(StorageBackend):
     # ... implement other methods
 
 # Use custom backend
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     storage_backend=S3StorageBackend(bucket="my-bucket")
 )
 ```
 
-See [docs/customization/backends.md](docs/customization/backends.md) for details.
+See [docs/api/backends.md](docs/api/backends.md) for details.
 
 ### Distributed Execution
 
@@ -401,8 +409,7 @@ class RayExecutionBackend(ExecutionBackend):
     """Distributed execution with Ray"""
     # ... implementation
 
-result = evaluate(
-    benchmark="math500",
+result = evaluate("math500",
     model="gpt-4",
     execution_backend=RayExecutionBackend(num_cpus=32)
 )
@@ -454,10 +461,10 @@ themis eval <benchmark> --model <model> [options]
 themis compare <run-id-1> <run-id-2> [run-id-3...] [options]
 
 # Options:
+#   --metric NAME    Restrict to one metric
 #   --storage PATH   Storage directory
-#   --test STR       Statistical test: t_test, bootstrap, permutation
-#   --alpha FLOAT    Significance level (default: 0.05)
 #   --output FILE    Export report (.json, .html, .md)
+#   --show-diff      Include detailed per-sample differences in summary
 ```
 
 ### Server
@@ -539,6 +546,12 @@ uv run python examples-simple/04_comparison.py
 
 # API server example
 uv run python examples-simple/05_api_server.py
+
+# Resume/cache example
+uv run python examples-simple/08_resume_cache.py
+
+# End-to-end research loop example
+uv run python examples-simple/09_research_loop.py
 ```
 
 ---