themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/datasets/piqa.py
ADDED
@@ -0,0 +1,146 @@
+"""Helpers for working with the ybisk/piqa dataset."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Iterable, Iterator, List
+
+from pydantic import BaseModel, Field, field_validator
+
+_DATASET_NAME = "ybisk/piqa"
+
+
+class PiqaSample(BaseModel):
+    unique_id: str
+    goal: str
+    choices: list[str]
+    answer: str
+    metadata: dict[str, Any] = Field(default_factory=dict)
+
+    @field_validator("choices", mode="before")
+    @classmethod
+    def _ensure_choices(cls, value: Any) -> list[str]:
+        if value is None:
+            return []
+        if isinstance(value, (list, tuple)):
+            return [str(item) for item in value]
+        raise TypeError("choices must be a sequence")
+
+    def to_generation_example(self) -> dict[str, Any]:
+        return {
+            "unique_id": self.unique_id,
+            "goal": self.goal,
+            "choices": list(self.choices),
+            "answer": self.answer,
+            "metadata": dict(self.metadata),
+        }
+
+
+def load_piqa(
+    *,
+    split: str = "validation",  # The test split usually has no labels.
+    limit: int | None = None,
+    source: str = "huggingface",
+    data_dir: str | Path | None = None,
+) -> List[PiqaSample]:
+    """Load PIQA samples from Hugging Face or a local directory."""
+
+    if source not in {"huggingface", "local"}:
+        raise ValueError(
+            f"Unsupported source '{source}'. Expected one of: 'huggingface', 'local'."
+        )
+
+    if source == "huggingface":
+        rows = _load_from_huggingface(split=split)
+    else:
+        if data_dir is None:
+            raise ValueError(
+                "data_dir must be provided when source='local'. "
+                "Pass dataset.data_dir in configs or --data-dir on the CLI."
+            )
+        rows = _load_from_local(Path(data_dir))
+
+    samples: list[PiqaSample] = []
+    for index, row in enumerate(rows, start=1):
+        sample = _row_to_sample(row, index=index)
+        samples.append(sample)
+        if limit is not None and len(samples) >= limit:
+            break
+    return samples
+
+
+def _row_to_sample(row: dict[str, Any], *, index: int) -> PiqaSample:
+    unique_id = (
+        row.get("id")
+        or row.get("unique_id")
+        or f"piqa-{index:05d}"
+    )
+    goal = row.get("goal") or ""
+
+    # PIQA stores the two candidate solutions as 'sol1' and 'sol2'.
+    choices = [
+        str(row.get("sol1") or ""),
+        str(row.get("sol2") or ""),
+    ]
+
+    # The label is the integer 0 or 1 indexing the correct solution.
+    label = row.get("label")
+    answer = ""
+    if label is not None:
+        try:
+            label_int = int(label)
+            if 0 <= label_int < len(choices):
+                answer = choices[label_int]
+        except (ValueError, TypeError):
+            pass
+
+    metadata_keys = {
+        "goal", "sol1", "sol2", "label", "id"
+    }
+    metadata = {key: value for key, value in row.items() if key not in metadata_keys}
+
+    return PiqaSample(
+        unique_id=str(unique_id),
+        goal=str(goal),
+        choices=choices,
+        answer=answer,
+        metadata=metadata,
+    )
+
+
+def _load_from_huggingface(*, split: str) -> Iterable[dict[str, Any]]:
+    try:
+        from datasets import load_dataset
+    except ImportError as exc:  # pragma: no cover - optional dependency
+        raise RuntimeError(
+            "datasets is required to load PIQA from Hugging Face. Install it via `uv pip install '.[hf]'`."
+        ) from exc
+
+    dataset = load_dataset(_DATASET_NAME, split=split)
+    for row in dataset:
+        yield dict(row)
+
+
+def _load_from_local(root: Path) -> Iterator[dict[str, Any]]:
+    if not root.exists():
+        raise FileNotFoundError(f"Local dataset directory not found: {root}")
+
+    for path in root.rglob("*"):
+        if path.suffix.lower() == ".json":
+            with path.open("r", encoding="utf-8") as handle:
+                row = json.load(handle)
+                row.setdefault("id", path.stem)
+                yield row
+        elif path.suffix.lower() in {".jsonl", ".ndjson"}:
+            with path.open("r", encoding="utf-8") as handle:
+                for line_num, line in enumerate(handle, start=1):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    row = json.loads(line)
+                    row.setdefault("id", f"{path.stem}-{line_num}")
+                    yield row
+
+
+__all__ = ["PiqaSample", "load_piqa"]
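A minimal usage sketch of the new loader (assuming the optional `datasets` dependency is installed and Hugging Face is reachable; the printed fields come from `to_generation_example()`):

```python
from themis.datasets.piqa import load_piqa

# Load a few labeled samples; the default split is "validation"
# because the PIQA test split ships without labels.
samples = load_piqa(split="validation", limit=5)

for sample in samples:
    example = sample.to_generation_example()
    print(example["unique_id"], "|", example["goal"], "->", example["answer"])
```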
themis/datasets/registry.py
ADDED
@@ -0,0 +1,201 @@
+"""Registry for dataset loaders.
+
+This module provides a plugin-based registry system for datasets, allowing
+users to register custom datasets without modifying core Themis code.
+
+Example:
+    ```python
+    from themis.datasets import register_dataset
+
+    def create_my_dataset(options):
+        from my_module import MyDataset
+        return MyDataset(path=options.get('path'))
+
+    register_dataset('my-dataset', create_my_dataset)
+    ```
+"""
+
+from __future__ import annotations
+
+from typing import Any, Callable
+
+# Factory type: takes config options, returns list of samples
+DatasetFactory = Callable[[dict[str, Any]], list[dict[str, Any]]]
+
+
+class DatasetRegistry:
+    """Registry for dataset loaders.
+
+    Maintains a mapping from dataset names to factory functions that
+    load and return dataset samples.
+    """
+
+    def __init__(self):
+        self._datasets: dict[str, DatasetFactory] = {}
+
+    def register(self, name: str, factory: DatasetFactory) -> None:
+        """Register a dataset factory.
+
+        Args:
+            name: Unique identifier for the dataset (e.g., 'math500', 'my-dataset')
+            factory: Callable that takes config options and returns a list of samples
+
+        Raises:
+            ValueError: If the dataset name is already registered
+        """
+        if name in self._datasets:
+            raise ValueError(
+                f"Dataset '{name}' is already registered. "
+                f"Use a different name or unregister the existing dataset first."
+            )
+        self._datasets[name] = factory
+
+    def unregister(self, name: str) -> None:
+        """Unregister a dataset.
+
+        Args:
+            name: Dataset identifier to remove
+
+        Raises:
+            ValueError: If the dataset name is not registered
+        """
+        if name not in self._datasets:
+            raise ValueError(f"Dataset '{name}' is not registered")
+        del self._datasets[name]
+
+    def create(self, name: str, **options) -> list[dict[str, Any]]:
+        """Create a dataset instance by loading samples.
+
+        Args:
+            name: Registered dataset identifier
+            **options: Configuration options passed to the factory function.
+                Common options include:
+                - source: 'huggingface', 'local', or custom source
+                - data_dir: Path for local datasets
+                - split: Dataset split (e.g., 'train', 'test')
+                - limit: Maximum number of samples to load
+                - subjects: List of subjects to filter
+
+        Returns:
+            List of sample dictionaries ready for generation
+
+        Raises:
+            ValueError: If the dataset name is not registered
+        """
+        if name not in self._datasets:
+            available = list(self._datasets.keys())
+            raise ValueError(
+                f"Unknown dataset: '{name}'. "
+                f"Available datasets: {', '.join(sorted(available)) or 'none'}"
+            )
+        factory = self._datasets[name]
+        return factory(options)
+
+    def list_datasets(self) -> list[str]:
+        """List all registered dataset names.
+
+        Returns:
+            Sorted list of registered dataset identifiers
+        """
+        return sorted(self._datasets.keys())
+
+    def is_registered(self, name: str) -> bool:
+        """Check if a dataset is registered.
+
+        Args:
+            name: Dataset identifier to check
+
+        Returns:
+            True if the dataset is registered, False otherwise
+        """
+        return name in self._datasets
+
+
+# Global registry instance
+_REGISTRY = DatasetRegistry()
+
+
+def register_dataset(name: str, factory: DatasetFactory) -> None:
+    """Register a dataset factory in the global registry.
+
+    Args:
+        name: Unique identifier for the dataset
+        factory: Callable that takes config options and returns samples
+
+    Example:
+        ```python
+        def create_my_dataset(options):
+            from my_module import load_data
+            return load_data(
+                path=options.get('path'),
+                limit=options.get('limit')
+            )
+
+        register_dataset('my-dataset', create_my_dataset)
+        ```
+    """
+    _REGISTRY.register(name, factory)
+
+
+def unregister_dataset(name: str) -> None:
+    """Unregister a dataset from the global registry.
+
+    Args:
+        name: Dataset identifier to remove
+    """
+    _REGISTRY.unregister(name)
+
+
+def create_dataset(name: str, **options) -> list[dict[str, Any]]:
+    """Create a dataset by loading samples from a registered factory.
+
+    Args:
+        name: Registered dataset identifier
+        **options: Configuration options for the dataset
+
+    Returns:
+        List of sample dictionaries
+
+    Example:
+        ```python
+        samples = create_dataset(
+            'math500',
+            source='huggingface',
+            split='test',
+            limit=10
+        )
+        ```
+    """
+    return _REGISTRY.create(name, **options)
+
+
+def list_datasets() -> list[str]:
+    """List all registered datasets.
+
+    Returns:
+        Sorted list of dataset names
+    """
+    return _REGISTRY.list_datasets()
+
+
+def is_dataset_registered(name: str) -> bool:
+    """Check if a dataset is registered.
+
+    Args:
+        name: Dataset identifier
+
+    Returns:
+        True if registered, False otherwise
+    """
+    return _REGISTRY.is_registered(name)
+
+
+__all__ = [
+    "DatasetFactory",
+    "DatasetRegistry",
+    "register_dataset",
+    "unregister_dataset",
+    "create_dataset",
+    "list_datasets",
+    "is_dataset_registered",
+]
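A minimal sketch of the plugin contract this registry defines. `create_toy_dataset` is a hypothetical factory; note that `create_dataset(name, **options)` forwards the keyword options to the factory as a single dict, since `DatasetRegistry.create` calls `factory(options)`:

```python
from themis.datasets.registry import (
    create_dataset,
    is_dataset_registered,
    list_datasets,
    register_dataset,
)


def create_toy_dataset(options: dict) -> list[dict]:
    # Hypothetical factory: all keyword arguments arrive as one options dict.
    limit = options.get("limit", 3)
    return [{"unique_id": f"toy-{i}", "answer": str(i)} for i in range(limit)]


register_dataset("toy", create_toy_dataset)
assert is_dataset_registered("toy")
print(list_datasets())  # includes 'toy'

samples = create_dataset("toy", limit=2)  # calls create_toy_dataset({"limit": 2})
print(samples)  # [{'unique_id': 'toy-0', 'answer': '0'}, {'unique_id': 'toy-1', 'answer': '1'}]
```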
themis/datasets/schema.py
ADDED
@@ -0,0 +1,245 @@
+"""Dataset schema and metadata definitions.
+
+This module provides enhanced dataset abstractions with schema validation,
+metadata, and filtering capabilities while maintaining backward compatibility.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Callable, Iterable, Protocol, runtime_checkable
+
+
+@dataclass
+class DatasetSchema:
+    """Describes the structure and validation rules for dataset samples.
+
+    Examples:
+        # Basic schema
+        schema = DatasetSchema(
+            id_field="unique_id",
+            reference_field="answer",
+            required_fields={"unique_id", "problem", "answer"},
+        )
+
+        # Schema with validation
+        def validate_problem(sample: dict) -> None:
+            if len(sample.get("problem", "")) < 10:
+                raise ValueError("Problem text too short")
+
+        schema = DatasetSchema(
+            id_field="id",
+            reference_field="expected",
+            required_fields={"id", "problem", "expected"},
+            validators=[validate_problem],
+        )
+    """
+
+    id_field: str
+    reference_field: str | None
+    required_fields: set[str] = field(default_factory=set)
+    optional_fields: set[str] = field(default_factory=set)
+    metadata_fields: set[str] = field(default_factory=set)
+    validators: list[Callable[[dict], None]] = field(default_factory=list)
+
+    def validate_sample(self, sample: dict[str, Any]) -> None:
+        """Validate a single sample against this schema.
+
+        Args:
+            sample: Sample to validate
+
+        Raises:
+            ValueError: If validation fails
+        """
+        # Check required fields
+        for field_name in self.required_fields:
+            if field_name not in sample:
+                raise ValueError(
+                    f"Missing required field '{field_name}' in sample {sample.get(self.id_field)}"
+                )
+
+        # Run custom validators
+        for validator in self.validators:
+            validator(sample)
+
+    def get_all_fields(self) -> set[str]:
+        """Get all known fields (required + optional + metadata)."""
+        return self.required_fields | self.optional_fields | self.metadata_fields
+
+
+@dataclass
+class DatasetMetadata:
+    """Metadata about the entire dataset.
+
+    This provides information useful for experiment planning, reporting,
+    and understanding dataset characteristics.
+
+    Examples:
+        metadata = DatasetMetadata(
+            name="MATH-500",
+            version="1.0",
+            total_samples=500,
+            categories={
+                "subject": ["algebra", "geometry", "number_theory"],
+                "difficulty": ["easy", "medium", "hard"],
+            },
+            difficulty_distribution={
+                "easy": 100,
+                "medium": 250,
+                "hard": 150,
+            },
+            description="Math problems from competition mathematics",
+        )
+    """
+
+    name: str
+    version: str = "1.0"
+    total_samples: int | None = None
+    categories: dict[str, list[str]] = field(default_factory=dict)
+    difficulty_distribution: dict[str, int] | None = None
+    description: str = ""
+    source_url: str | None = None
+    license: str | None = None
+    citation: str | None = None
+    custom_metadata: dict[str, Any] = field(default_factory=dict)
+
+    def get_category_values(self, category: str) -> list[str]:
+        """Get all possible values for a category."""
+        return self.categories.get(category, [])
+
+    def has_category(self, category: str) -> bool:
+        """Check if dataset has a specific category."""
+        return category in self.categories
+
+
+@runtime_checkable
+class EnhancedDatasetAdapter(Protocol):
+    """Extended dataset interface with schema and metadata support.
+
+    This protocol extends the basic DatasetAdapter with additional
+    capabilities for schema validation, filtering, and stratification.
+    """
+
+    def iter_samples(self) -> Iterable[dict[str, Any]]:
+        """Iterate over dataset samples."""
+        ...
+
+    def get_schema(self) -> DatasetSchema:
+        """Get the dataset schema."""
+        ...
+
+    def get_metadata(self) -> DatasetMetadata:
+        """Get dataset metadata."""
+        ...
+
+    def filter(
+        self, predicate: Callable[[dict[str, Any]], bool]
+    ) -> EnhancedDatasetAdapter:
+        """Return filtered view of dataset.
+
+        Args:
+            predicate: Function that returns True for samples to keep
+
+        Returns:
+            New dataset adapter with filtered samples
+        """
+        ...
+
+    def limit(self, n: int) -> EnhancedDatasetAdapter:
+        """Return dataset limited to first n samples.
+
+        Args:
+            n: Maximum number of samples
+
+        Returns:
+            New dataset adapter with limited samples
+        """
+        ...
+
+    def stratify(
+        self, field: str, distribution: dict[str, float]
+    ) -> EnhancedDatasetAdapter:
+        """Return stratified sample of dataset.
+
+        Args:
+            field: Field to stratify by
+            distribution: Desired distribution (values should sum to 1.0)
+
+        Returns:
+            New dataset adapter with stratified samples
+        """
+        ...
+
+
+# Common validators
+def validate_non_empty_field(field_name: str) -> Callable[[dict], None]:
+    """Create validator that ensures field is non-empty.
+
+    Args:
+        field_name: Name of field to validate
+
+    Returns:
+        Validator function
+    """
+
+    def validator(sample: dict) -> None:
+        value = sample.get(field_name)
+        if not value:
+            raise ValueError(f"Field '{field_name}' cannot be empty")
+
+    return validator
+
+
+def validate_field_type(field_name: str, expected_type: type) -> Callable[[dict], None]:
+    """Create validator that ensures field has correct type.
+
+    Args:
+        field_name: Name of field to validate
+        expected_type: Expected type
+
+    Returns:
+        Validator function
+    """
+
+    def validator(sample: dict) -> None:
+        value = sample.get(field_name)
+        if value is not None and not isinstance(value, expected_type):
+            raise ValueError(
+                f"Field '{field_name}' expected type {expected_type.__name__}, "
+                f"got {type(value).__name__}"
+            )
+
+    return validator
+
+
+def validate_field_in_choices(
+    field_name: str, choices: set[str]
+) -> Callable[[dict], None]:
+    """Create validator that ensures field value is in allowed choices.
+
+    Args:
+        field_name: Name of field to validate
+        choices: Set of allowed values
+
+    Returns:
+        Validator function
+    """
+
+    def validator(sample: dict) -> None:
+        value = sample.get(field_name)
+        if value is not None and value not in choices:
+            raise ValueError(
+                f"Field '{field_name}' value '{value}' not in allowed choices: {choices}"
+            )
+
+    return validator
+
+
+__all__ = [
+    "DatasetSchema",
+    "DatasetMetadata",
+    "EnhancedDatasetAdapter",
+    "validate_non_empty_field",
+    "validate_field_type",
+    "validate_field_in_choices",
+]
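A minimal sketch of the schema API in use, combining the built-in validator factories with `validate_sample` (the field names are illustrative):

```python
from themis.datasets.schema import (
    DatasetSchema,
    validate_field_in_choices,
    validate_non_empty_field,
)

schema = DatasetSchema(
    id_field="unique_id",
    reference_field="answer",
    required_fields={"unique_id", "goal", "answer"},
    validators=[
        validate_non_empty_field("goal"),
        validate_field_in_choices("split", {"train", "validation", "test"}),
    ],
)

# Passes: all required fields present, validators satisfied.
schema.validate_sample(
    {"unique_id": "piqa-00001", "goal": "Boil water", "answer": "Use a kettle", "split": "validation"}
)

# Fails: 'goal' is present but empty, so the non-empty validator raises.
try:
    schema.validate_sample({"unique_id": "piqa-00002", "goal": "", "answer": "?"})
except ValueError as exc:
    print(exc)  # Field 'goal' cannot be empty
```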