themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/backends/storage.py
@@ -0,0 +1,260 @@
"""Storage backend interface for custom storage implementations.

This module defines the abstract interface for storage backends, allowing
users to implement custom storage solutions (cloud storage, databases, etc.)
without modifying Themis core code.

Example implementations:
- S3Backend: Store results in AWS S3
- GCSBackend: Store results in Google Cloud Storage
- PostgresBackend: Store results in PostgreSQL
- RedisBackend: Use Redis for distributed caching
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List

from themis.core.entities import (
    EvaluationRecord,
    ExperimentReport,
    GenerationRecord,
)


class StorageBackend(ABC):
    """Abstract interface for storage backends.

    Implement this interface to create custom storage solutions.
    All methods should be thread-safe if used with concurrent workers.

    Example:
        >>> class S3StorageBackend(StorageBackend):
        ...     def __init__(self, bucket: str):
        ...         self.bucket = bucket
        ...         self.s3_client = boto3.client('s3')
        ...
        ...     def save_run_metadata(self, run_id: str, metadata: RunMetadata) -> None:
        ...         key = f"runs/{run_id}/metadata.json"
        ...         self.s3_client.put_object(
        ...             Bucket=self.bucket,
        ...             Key=key,
        ...             Body=metadata.to_json(),
        ...         )
        ...     # ... implement other methods
    """

    @abstractmethod
    def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
        """Save run metadata.

        Args:
            run_id: Unique identifier for the run
            metadata: Run metadata to save (as dictionary)
        """
        pass

    @abstractmethod
    def load_run_metadata(self, run_id: str) -> Dict[str, Any]:
        """Load run metadata.

        Args:
            run_id: Unique identifier for the run

        Returns:
            Run metadata as dictionary

        Raises:
            FileNotFoundError: If run metadata doesn't exist
        """
        pass

    @abstractmethod
    def save_generation_record(self, run_id: str, record: GenerationRecord) -> None:
        """Save a generation record.

        Args:
            run_id: Unique identifier for the run
            record: Generation record to save

        Note:
            This method should be atomic and thread-safe.
        """
        pass

    @abstractmethod
    def load_generation_records(self, run_id: str) -> List[GenerationRecord]:
        """Load all generation records for a run.

        Args:
            run_id: Unique identifier for the run

        Returns:
            List of generation records
        """
        pass

    @abstractmethod
    def save_evaluation_record(self, run_id: str, record: EvaluationRecord) -> None:
        """Save an evaluation record.

        Args:
            run_id: Unique identifier for the run
            record: Evaluation record to save

        Note:
            This method should be atomic and thread-safe.
        """
        pass

    @abstractmethod
    def load_evaluation_records(self, run_id: str) -> Dict[str, EvaluationRecord]:
        """Load all evaluation records for a run.

        Args:
            run_id: Unique identifier for the run

        Returns:
            Dictionary mapping cache_key to EvaluationRecord
        """
        pass

    @abstractmethod
    def save_report(self, run_id: str, report: ExperimentReport) -> None:
        """Save experiment report.

        Args:
            run_id: Unique identifier for the run
            report: Experiment report to save
        """
        pass

    @abstractmethod
    def load_report(self, run_id: str) -> ExperimentReport:
        """Load experiment report.

        Args:
            run_id: Unique identifier for the run

        Returns:
            Experiment report

        Raises:
            FileNotFoundError: If report doesn't exist
        """
        pass

    @abstractmethod
    def list_runs(self) -> List[str]:
        """List all run IDs in storage.

        Returns:
            List of run IDs
        """
        pass

    @abstractmethod
    def run_exists(self, run_id: str) -> bool:
        """Check if a run exists in storage.

        Args:
            run_id: Unique identifier for the run

        Returns:
            True if run exists, False otherwise
        """
        pass

    @abstractmethod
    def delete_run(self, run_id: str) -> None:
        """Delete all data for a run.

        Args:
            run_id: Unique identifier for the run
        """
        pass

    def close(self) -> None:
        """Close the storage backend and release resources.

        Optional method for cleanup. Called when storage is no longer needed.
        """
        pass


class LocalFileStorageBackend(StorageBackend):
    """Adapter for the existing ExperimentStorage implementation.

    This class wraps the current file-based storage implementation
    to conform to the StorageBackend interface.

    Note:
        This is a compatibility layer. New code should use the interface,
        but existing storage logic is preserved.
    """

    def __init__(self, storage_path: str | Path):
        """Initialize with path to storage directory.

        Args:
            storage_path: Path to storage directory
        """
        from themis.experiment.storage import ExperimentStorage
        self._storage = ExperimentStorage(storage_path)

    def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
        """Save run metadata."""
        experiment_id = metadata.get("experiment_id", "default")
        self._storage.start_run(run_id, experiment_id=experiment_id)

    def load_run_metadata(self, run_id: str) -> Dict[str, Any]:
        """Load run metadata."""
        # Note: Current storage doesn't have a direct method for this
        # This is a limitation of the adapter pattern
        raise NotImplementedError("Use ExperimentStorage directly for now")

    def save_generation_record(self, run_id: str, record: GenerationRecord) -> None:
        """Save generation record."""
        self._storage.append_record(run_id, record)

    def load_generation_records(self, run_id: str) -> List[GenerationRecord]:
        """Load generation records."""
        cached = self._storage.load_cached_records(run_id)
        return list(cached.values())

    def save_evaluation_record(self, run_id: str, record: EvaluationRecord) -> None:
        """Save evaluation record."""
        self._storage.append_evaluation(run_id, record)

    def load_evaluation_records(self, run_id: str) -> Dict[str, EvaluationRecord]:
        """Load evaluation records."""
        return self._storage.load_cached_evaluations(run_id)

    def save_report(self, run_id: str, report: ExperimentReport) -> None:
        """Save report."""
        self._storage.save_report(run_id, report)

    def load_report(self, run_id: str) -> ExperimentReport:
        """Load report."""
        return self._storage.load_report(run_id)

    def list_runs(self) -> List[str]:
        """List runs."""
        return self._storage.list_runs()

    def run_exists(self, run_id: str) -> bool:
        """Check if run exists."""
        return run_id in self._storage.list_runs()

    def delete_run(self, run_id: str) -> None:
        """Delete run."""
        # Note: Current storage doesn't have delete functionality
        raise NotImplementedError("Delete not implemented in current storage")


__all__ = [
    "StorageBackend",
    "LocalFileStorageBackend",
]
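To make the contract above concrete, here is a minimal sketch of a third-party backend that keeps everything in process memory. It is illustrative only and not part of the released wheel; it assumes the import paths shown in this diff (themis.backends.storage and themis.core.entities), and the cache_key attribute used to key evaluation records is an assumption taken from the load_evaluation_records docstring.

# Illustrative sketch (not part of the package): an in-memory StorageBackend.
from typing import Any, Dict, List

from themis.backends.storage import StorageBackend
from themis.core.entities import (
    EvaluationRecord,
    ExperimentReport,
    GenerationRecord,
)


class InMemoryStorageBackend(StorageBackend):
    """Keeps all run data in process memory; handy for tests and demos."""

    def __init__(self) -> None:
        self._metadata: Dict[str, Dict[str, Any]] = {}
        self._generations: Dict[str, List[GenerationRecord]] = {}
        self._evaluations: Dict[str, Dict[str, EvaluationRecord]] = {}
        self._reports: Dict[str, ExperimentReport] = {}

    def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
        self._metadata[run_id] = dict(metadata)

    def load_run_metadata(self, run_id: str) -> Dict[str, Any]:
        if run_id not in self._metadata:
            raise FileNotFoundError(f"No metadata for run {run_id!r}")
        return self._metadata[run_id]

    def save_generation_record(self, run_id: str, record: GenerationRecord) -> None:
        self._generations.setdefault(run_id, []).append(record)

    def load_generation_records(self, run_id: str) -> List[GenerationRecord]:
        return list(self._generations.get(run_id, []))

    def save_evaluation_record(self, run_id: str, record: EvaluationRecord) -> None:
        # cache_key is assumed from the interface docstring; fall back to a
        # running index if the attribute is named differently.
        bucket = self._evaluations.setdefault(run_id, {})
        key = getattr(record, "cache_key", str(len(bucket)))
        bucket[key] = record

    def load_evaluation_records(self, run_id: str) -> Dict[str, EvaluationRecord]:
        return dict(self._evaluations.get(run_id, {}))

    def save_report(self, run_id: str, report: ExperimentReport) -> None:
        self._reports[run_id] = report

    def load_report(self, run_id: str) -> ExperimentReport:
        if run_id not in self._reports:
            raise FileNotFoundError(f"No report for run {run_id!r}")
        return self._reports[run_id]

    def list_runs(self) -> List[str]:
        keys = (
            set(self._metadata)
            | set(self._generations)
            | set(self._evaluations)
            | set(self._reports)
        )
        return sorted(keys)

    def run_exists(self, run_id: str) -> bool:
        return run_id in self.list_runs()

    def delete_run(self, run_id: str) -> None:
        for store in (self._metadata, self._generations, self._evaluations, self._reports):
            store.pop(run_id, None)

A backend like this satisfies every abstract method, so it could stand in wherever a StorageBackend is accepted; the diff itself only ships the interface and the LocalFileStorageBackend adapter.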
themis/cli/__init__.py (ADDED)
themis/cli/__main__.py (ADDED)
themis/cli/commands/__init__.py
@@ -0,0 +1,19 @@
"""CLI command modules."""

from themis.cli.commands import (
    benchmarks,
    config_commands,
    demo,
    info,
    math_benchmarks,
    mcq_benchmarks,
)

__all__ = [
    "benchmarks",
    "config_commands",
    "demo",
    "info",
    "math_benchmarks",
    "mcq_benchmarks",
]
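For orientation, the re-exports above make the individual command modules reachable through the package as well as by direct import; a small check (illustrative only, names taken from this diff):

# Illustrative only: exercising the re-exports from themis.cli.commands.
from themis.cli import commands

print(commands.__all__)             # ['benchmarks', 'config_commands', 'demo', ...]
print(commands.benchmarks.__doc__)  # "Benchmark listing commands."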
themis/cli/commands/benchmarks.py
@@ -0,0 +1,221 @@
"""Benchmark listing commands."""

from __future__ import annotations

from typing import Annotated

from cyclopts import Parameter

from themis.providers.registry import _REGISTRY


def list_providers(
    *,
    verbose: Annotated[
        bool, Parameter(help="Show detailed provider information")
    ] = False,
) -> int:
    """List available LLM providers."""
    providers = sorted(_REGISTRY._factories.keys())

    if not providers:
        print("No providers registered.")
        return 0

    print("Available Providers:")
    print("=" * 60)

    provider_info = {
        "fake": "Built-in fake provider for testing (no API required)",
        "openai-compatible": "OpenAI-compatible API (LM Studio, Ollama, vLLM, OpenAI)",
        "vllm": "vLLM server provider for local model hosting",
    }

    for provider in providers:
        status = "✓" if provider in provider_info else "·"
        print(f"{status} {provider}")
        if verbose and provider in provider_info:
            print(f" {provider_info[provider]}")

    if not verbose:
        print("\nUse --verbose for more details")

    return 0


def list_benchmarks(
    *,
    verbose: Annotated[
        bool, Parameter(help="Show detailed benchmark information")
    ] = False,
) -> int:
    """List available datasets and benchmarks."""
    benchmarks = [
        {
            "name": "math500",
            "description": "MATH-500 dataset for mathematical reasoning",
            "source": "huggingface (default) or local",
            "subjects": [
                "algebra",
                "counting_and_probability",
                "geometry",
                "intermediate_algebra",
                "number_theory",
                "prealgebra",
                "precalculus",
            ],
            "command": "uv run python -m themis.cli math500",
        },
        {
            "name": "gsm8k",
            "description": "GSM8K dataset for grade school math word problems",
            "source": "huggingface (default) or local",
            "subjects": "math",
            "command": "uv run python -m themis.cli gsm8k",
        },
        {
            "name": "gpqa",
            "description": "GPQA dataset for graduate-level science questions",
            "source": "huggingface (default) or local",
            "subjects": "science",
            "command": "uv run python -m themis.cli gpqa",
        },
        {
            "name": "gsm-symbolic",
            "description": "GSM-Symbolic dataset for symbolic math reasoning",
            "source": "huggingface (default) or local",
            "subjects": "math",
            "command": "uv run python -m themis.cli gsm-symbolic",
        },
        {
            "name": "medmcqa",
            "description": "MedMCQA dataset for medical entrance exams",
            "source": "huggingface (default) or local",
            "subjects": "medicine",
            "command": "uv run python -m themis.cli medmcqa",
        },
        {
            "name": "med_qa",
            "description": "MedQA dataset for medical question answering",
            "source": "huggingface (default) or local",
            "subjects": "medicine",
            "command": "uv run python -m themis.cli med_qa",
        },
        {
            "name": "sciq",
            "description": "SciQ dataset for science questions",
            "source": "huggingface (default) or local",
            "subjects": "science",
            "command": "uv run python -m themis.cli sciq",
        },
        {
            "name": "commonsense_qa",
            "description": "CommonsenseQA dataset for commonsense reasoning",
            "source": "huggingface (default) or local",
            "subjects": "commonsense",
            "command": "uv run python -m themis.cli commonsense_qa",
        },
        {
            "name": "piqa",
            "description": "PIQA dataset for physical commonsense reasoning",
            "source": "huggingface (default) or local",
            "subjects": "commonsense",
            "command": "uv run python -m themis.cli piqa",
        },
        {
            "name": "social_i_qa",
            "description": "Social IQA dataset for social commonsense reasoning",
            "source": "huggingface (default) or local",
            "subjects": "commonsense",
            "command": "uv run python -m themis.cli social_i_qa",
        },
        {
            "name": "coqa",
            "description": "CoQA dataset for conversational question answering",
            "source": "huggingface (default) or local",
            "subjects": "conversational",
            "command": "uv run python -m themis.cli coqa",
        },
        {
            "name": "supergpqa",
            "description": "Graduate-level QA benchmark with multiple-choice questions",
            "source": "huggingface (default) or local",
            "subjects": "category filter via --subjects",
            "command": "uv run python -m themis.cli supergpqa",
        },
        {
            "name": "mmlu-pro",
            "description": "Professional-level MMLU benchmark with refined distractors",
            "source": "huggingface (default) or local",
            "subjects": "subject filter via --subjects",
            "command": "uv run python -m themis.cli mmlu-pro",
        },
        {
            "name": "aime24",
            "description": "AIME 2024 competition problems",
            "source": "huggingface (default) or local",
            "subjects": "problem set",
            "command": "uv run python -m themis.cli aime24",
        },
        {
            "name": "aime25",
            "description": "AIME 2025 competition problems",
            "source": "huggingface (default) or local",
            "subjects": "problem set",
            "command": "uv run python -m themis.cli aime25",
        },
        {
            "name": "amc23",
            "description": "AMC 2023 competition problems",
            "source": "huggingface (default) or local",
            "subjects": "problem set",
            "command": "uv run python -m themis.cli amc23",
        },
        {
            "name": "olympiadbench",
            "description": "Mixed Olympiad-style math benchmark",
            "source": "huggingface (default) or local",
            "subjects": "competition metadata",
            "command": "uv run python -m themis.cli olympiadbench",
        },
        {
            "name": "beyondaime",
            "description": "BeyondAIME advanced math competition set",
            "source": "huggingface (default) or local",
            "subjects": "problem set",
            "command": "uv run python -m themis.cli beyondaime",
        },
        {
            "name": "demo",
            "description": "Built-in demo with 2 math problems",
            "source": "inline",
            "subjects": ["precalculus", "arithmetic"],
            "command": "uv run python -m themis.cli demo",
        },
        {
            "name": "inline",
            "description": "Custom inline dataset (via config file)",
            "source": "config file",
            "subjects": "user-defined",
            "command": "uv run python -m themis.cli run-config --config your_config.yaml",
        },
    ]

    print("Available Datasets & Benchmarks:")
    print("=" * 60)

    for bench in benchmarks:
        print(f"\n📚 {bench['name']}")
        print(f" {bench['description']}")
        if verbose:
            print(f" Source: {bench['source']}")
            if isinstance(bench["subjects"], list):
                print(f" Subjects: {', '.join(bench['subjects'])}")
            else:
                print(f" Subjects: {bench['subjects']}")
            print(f" Command: {bench['command']}")

    if not verbose:
        print("\nUse --verbose for more details and example commands")

    return 0
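Since both listing commands are ordinary functions that print to stdout and return an exit code, they can be exercised directly as well as through the CLI; a minimal sketch (illustrative only, the CLI wiring in themis/cli/main.py is not shown in this section):

# Illustrative only: calling the listing commands as plain functions.
from themis.cli.commands.benchmarks import list_benchmarks, list_providers

list_providers(verbose=True)   # registered providers plus their descriptions
list_benchmarks(verbose=True)  # each dataset with source, subjects, and command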