themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,273 @@
1
+ """Dataset helpers for Themis experiments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from . import (
8
+ competition_math,
9
+ commonsense_qa,
10
+ coqa,
11
+ gpqa,
12
+ gsm_symbolic,
13
+ gsm8k,
14
+ math500,
15
+ med_qa,
16
+ medmcqa,
17
+ mmlu_pro,
18
+ piqa,
19
+ sciq,
20
+ social_i_qa,
21
+ super_gpqa,
22
+ )
23
+ from .registry import (
24
+ create_dataset,
25
+ is_dataset_registered,
26
+ list_datasets,
27
+ register_dataset,
28
+ unregister_dataset,
29
+ )
30
+
31
+ # Factory functions for built-in datasets
32
+
33
+
34
def _create_math500(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load MATH-500 and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit, subjects.
    """
    opt = options.get
    records = math500.load_math500(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "test"),
        limit=opt("limit"),
        subjects=opt("subjects"),
    )
    return [record.to_generation_example() for record in records]
44
+
45
+
46
+ def _create_competition_math(options: dict[str, Any]) -> list[dict[str, Any]]:
47
+ """Factory for competition math datasets (AIME, AMC, etc.)."""
48
+ # Get dataset and subset from options
49
+ dataset = options.get("dataset")
50
+ if not dataset:
51
+ raise ValueError(
52
+ "Competition math requires 'dataset' option "
53
+ "(e.g., 'math-ai/aime24', 'math-ai/amc23')"
54
+ )
55
+
56
+ samples = competition_math.load_competition_math(
57
+ dataset=dataset,
58
+ subset=options.get("subset"),
59
+ source=options.get("source", "huggingface"),
60
+ data_dir=options.get("data_dir"),
61
+ split=options.get("split", "test"),
62
+ limit=options.get("limit"),
63
+ subjects=options.get("subjects"),
64
+ )
65
+ return [sample.to_generation_example() for sample in samples]
66
+
67
+
68
def _create_super_gpqa(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load SuperGPQA and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit, subjects.
    """
    opt = options.get
    records = super_gpqa.load_super_gpqa(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "test"),
        limit=opt("limit"),
        subjects=opt("subjects"),
    )
    return [record.to_generation_example() for record in records]
78
+
79
+
80
def _create_mmlu_pro(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load MMLU-Pro and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit, subjects.
    """
    opt = options.get
    records = mmlu_pro.load_mmlu_pro(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "test"),
        limit=opt("limit"),
        subjects=opt("subjects"),
    )
    return [record.to_generation_example() for record in records]
90
+
91
+
92
def _create_gsm8k(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load GSM8K and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit, subset
    (subset defaults to "main").
    """
    opt = options.get
    records = gsm8k.load_gsm8k(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "test"),
        limit=opt("limit"),
        subset=opt("subset", "main"),
    )
    return [record.to_generation_example() for record in records]
102
+
103
+
104
def _create_gpqa(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load GPQA and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit, subset
    (subset defaults to "gpqa_diamond").
    """
    opt = options.get
    records = gpqa.load_gpqa(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "test"),
        limit=opt("limit"),
        subset=opt("subset", "gpqa_diamond"),
    )
    return [record.to_generation_example() for record in records]
114
+
115
+
116
def _create_gsm_symbolic(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load GSM-Symbolic and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit, subset
    (subset defaults to "main").
    """
    opt = options.get
    records = gsm_symbolic.load_gsm_symbolic(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "test"),
        limit=opt("limit"),
        subset=opt("subset", "main"),
    )
    return [record.to_generation_example() for record in records]
126
+
127
+
128
def _create_medmcqa(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load MedMCQA and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit, subset
    (subset has no default).
    """
    opt = options.get
    records = medmcqa.load_medmcqa(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "test"),
        limit=opt("limit"),
        subset=opt("subset"),
    )
    return [record.to_generation_example() for record in records]
138
+
139
+
140
def _create_med_qa(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load MedQA and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit, subset
    (subset defaults to "med_qa_en_bigbio_qa").
    """
    opt = options.get
    records = med_qa.load_med_qa(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "test"),
        limit=opt("limit"),
        subset=opt("subset", "med_qa_en_bigbio_qa"),
    )
    return [record.to_generation_example() for record in records]
150
+
151
+
152
def _create_sciq(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load SciQ and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit.
    """
    opt = options.get
    records = sciq.load_sciq(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "test"),
        limit=opt("limit"),
    )
    return [record.to_generation_example() for record in records]
161
+
162
+
163
def _create_commonsense_qa(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load CommonsenseQA and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit
    (split defaults to "validation").
    """
    opt = options.get
    records = commonsense_qa.load_commonsense_qa(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "validation"),
        limit=opt("limit"),
    )
    return [record.to_generation_example() for record in records]
172
+
173
+
174
def _create_piqa(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load PIQA and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit
    (split defaults to "validation").
    """
    opt = options.get
    records = piqa.load_piqa(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "validation"),
        limit=opt("limit"),
    )
    return [record.to_generation_example() for record in records]
183
+
184
+
185
def _create_social_i_qa(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load Social IQA and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit
    (split defaults to "validation").
    """
    opt = options.get
    records = social_i_qa.load_social_i_qa(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "validation"),
        limit=opt("limit"),
    )
    return [record.to_generation_example() for record in records]
194
+
195
+
196
def _create_coqa(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Load CoQA and convert each sample into a generation example.

    Recognized options: source, data_dir, split, limit
    (split defaults to "validation").
    """
    opt = options.get
    records = coqa.load_coqa(
        source=opt("source", "huggingface"),
        data_dir=opt("data_dir"),
        split=opt("split", "validation"),
        limit=opt("limit"),
    )
    return [record.to_generation_example() for record in records]
205
+
206
+
207
# Auto-register built-in datasets at import time so that create_dataset("<name>")
# works as soon as `themis.datasets` is imported.
# NOTE(review): registry key style is mixed — "mmlu-pro"/"gsm-symbolic" use
# hyphens while "med_qa"/"commonsense_qa" use underscores. Kept as-is because
# renaming the keys would break existing callers.
register_dataset("math500", _create_math500)
register_dataset("competition_math", _create_competition_math)
register_dataset("supergpqa", _create_super_gpqa)
register_dataset("mmlu-pro", _create_mmlu_pro)
register_dataset("gsm8k", _create_gsm8k)
register_dataset("gpqa", _create_gpqa)
register_dataset("gsm-symbolic", _create_gsm_symbolic)
register_dataset("medmcqa", _create_medmcqa)
register_dataset("med_qa", _create_med_qa)
register_dataset("sciq", _create_sciq)
register_dataset("commonsense_qa", _create_commonsense_qa)
register_dataset("piqa", _create_piqa)
register_dataset("social_i_qa", _create_social_i_qa)
register_dataset("coqa", _create_coqa)
222
+
223
+
224
+ # Also register specific competition datasets as aliases
225
def _create_aime24(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Competition-math factory pinned to the AIME 2024 dataset."""
    pinned = dict(options)
    pinned["dataset"] = "math-ai/aime24"
    return _create_competition_math(pinned)
227
+
228
+
229
def _create_aime25(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Competition-math factory pinned to the AIME 2025 dataset."""
    pinned = dict(options)
    pinned["dataset"] = "math-ai/aime25"
    return _create_competition_math(pinned)
231
+
232
+
233
def _create_amc23(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Competition-math factory pinned to the AMC 2023 dataset."""
    pinned = dict(options)
    pinned["dataset"] = "math-ai/amc23"
    return _create_competition_math(pinned)
235
+
236
+
237
def _create_olympiadbench(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Competition-math factory pinned to the OlympiadBench dataset."""
    pinned = dict(options)
    pinned["dataset"] = "math-ai/olympiadbench"
    return _create_competition_math(pinned)
239
+
240
+
241
def _create_beyondaime(options: dict[str, Any]) -> list[dict[str, Any]]:
    """Competition-math factory pinned to the BeyondAIME dataset."""
    pinned = dict(options)
    pinned["dataset"] = "ByteDance-Seed/BeyondAIME"
    return _create_competition_math(pinned)
243
+
244
+
245
+ register_dataset("aime24", _create_aime24)
246
+ register_dataset("aime25", _create_aime25)
247
+ register_dataset("amc23", _create_amc23)
248
+ register_dataset("olympiadbench", _create_olympiadbench)
249
+ register_dataset("beyondaime", _create_beyondaime)
250
+
251
# Public API of themis.datasets: the per-dataset loader modules plus the
# registry helper functions. Factory functions (_create_*) stay private —
# consumers go through create_dataset() instead.
__all__ = [
    # Legacy module exports
    "competition_math",
    "commonsense_qa",
    "coqa",
    "gpqa",
    "gsm_symbolic",
    "gsm8k",
    "math500",
    "med_qa",
    "medmcqa",
    "mmlu_pro",
    "piqa",
    "sciq",
    "social_i_qa",
    "super_gpqa",
    # Registry functions
    "register_dataset",
    "unregister_dataset",
    "create_dataset",
    "list_datasets",
    "is_dataset_registered",
]
@@ -0,0 +1,264 @@
1
+ """Base dataset implementation with schema support.
2
+
3
+ This module provides a base class that implements common dataset operations
4
+ like filtering, limiting, and stratification.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import random
11
+ from collections import defaultdict
12
+ from typing import Any, Callable, Iterable
13
+
14
+ from themis.datasets import schema as dataset_schema
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class BaseDataset:
    """Concrete base class for datasets satisfying the DatasetAdapter protocol.

    Holds samples in memory and layers reusable operations on top of them:
    schema validation, filtering, limiting, stratified sampling, and shuffling.
    Protocol compliance is structural (duck typing): the class implements
    ``iter_samples()`` without inheriting from DatasetAdapter, so instances
    still pass ``isinstance(obj, DatasetAdapter)`` checks at runtime.

    Subclasses supply the initial samples, schema, and metadata, typically via
    ``super().__init__(samples, schema, metadata)`` from their constructor.

    Examples:
        class MyDataset(BaseDataset):
            def __init__(self):
                samples = [
                    {"id": "1", "problem": "What is 2+2?", "answer": "4"},
                    {"id": "2", "problem": "What is 3+3?", "answer": "6"},
                ]
                schema = DatasetSchema(
                    id_field="id",
                    reference_field="answer",
                    required_fields={"id", "problem", "answer"},
                )
                metadata = DatasetMetadata(
                    name="SimpleArithmetic",
                    version="1.0",
                    total_samples=2,
                )
                super().__init__(samples, schema, metadata)
    """

    def __init__(
        self,
        samples: Iterable[dict[str, Any]],
        schema: dataset_schema.DatasetSchema,
        metadata: dataset_schema.DatasetMetadata,
        validate: bool = True,
    ):
        """Materialize *samples* and optionally validate them against *schema*.

        Args:
            samples: Iterable of sample dictionaries (consumed eagerly).
            schema: Dataset schema used for per-sample validation.
            metadata: Dataset metadata; ``total_samples`` is backfilled when unset.
            validate: Whether to validate samples against the schema (default: True).

        Raises:
            ValueError: If validation is enabled and a sample violates the schema.
        """
        self._samples = list(samples)
        self._schema = schema
        self._metadata = metadata

        if validate:
            self._validate_all()

        # Backfill the sample count when the caller left it unset.
        if self._metadata.total_samples is None:
            fields = {**self._metadata.__dict__, "total_samples": len(self._samples)}
            self._metadata = dataset_schema.DatasetMetadata(**fields)

    def _validate_all(self) -> None:
        """Check every sample against the schema; raise on the first failure."""
        logger.debug(
            "Validating %d samples for dataset %s",
            len(self._samples),
            self._metadata.name,
        )

        for idx, record in enumerate(self._samples):
            try:
                self._schema.validate_sample(record)
            except ValueError as err:
                logger.error("Validation failed for sample %d: %s", idx, err)
                raise ValueError(f"Sample {idx} validation failed: {err}") from err

        logger.debug("All samples validated successfully")

    def _spawn(self, samples: list[dict[str, Any]]) -> BaseDataset:
        """Build a sibling dataset sharing this schema/metadata.

        Validation is skipped because the samples come from an
        already-validated dataset.
        """
        return BaseDataset(
            samples=samples,
            schema=self._schema,
            metadata=self._metadata,
            validate=False,
        )

    def iter_samples(self) -> Iterable[dict[str, Any]]:
        """Iterate over dataset samples."""
        return iter(self._samples)

    def get_schema(self) -> dataset_schema.DatasetSchema:
        """Get the dataset schema."""
        return self._schema

    def get_metadata(self) -> dataset_schema.DatasetMetadata:
        """Get dataset metadata."""
        return self._metadata

    def filter(self, predicate: Callable[[dict[str, Any]], bool]) -> BaseDataset:
        """Return a new dataset keeping only samples for which *predicate* is true.

        Args:
            predicate: Function that returns True for samples to keep.

        Returns:
            New BaseDataset with the matching samples.
        """
        kept = [record for record in self._samples if predicate(record)]
        logger.debug(
            "Filtered dataset from %d to %d samples",
            len(self._samples),
            len(kept),
        )
        return self._spawn(kept)

    def limit(self, n: int) -> BaseDataset:
        """Return a new dataset truncated to the first *n* samples.

        Args:
            n: Maximum number of samples.

        Returns:
            New BaseDataset with at most *n* samples.
        """
        head = self._samples[:n]
        logger.debug(
            "Limited dataset from %d to %d samples",
            len(self._samples),
            len(head),
        )
        return self._spawn(head)

    def stratify(
        self, field: str, distribution: dict[str, float], seed: int | None = None
    ) -> BaseDataset:
        """Return a stratified subsample of the dataset.

        Args:
            field: Sample key to stratify by; must exist in every sample.
            distribution: Desired ratio per field value (values should sum to ~1.0).
            seed: Random seed for reproducibility.

        Returns:
            New BaseDataset whose per-value counts approximate *distribution*.

        Raises:
            ValueError: If *field* is missing from any sample.
        """
        # Bucket samples by the value they carry in *field*.
        buckets: dict[Any, list[dict[str, Any]]] = defaultdict(list)
        for record in self._samples:
            if field not in record:
                raise ValueError(f"Field '{field}' not found in sample")
            buckets[record[field]].append(record)

        # A distribution that does not sum to ~1.0 is suspicious but tolerated.
        ratio_sum = sum(distribution.values())
        if not (0.99 <= ratio_sum <= 1.01):
            logger.warning("Distribution values sum to %f, expected ~1.0", ratio_sum)

        population = len(self._samples)
        chosen: list[dict[str, Any]] = []
        # Random(None) self-seeds from the OS, matching unseeded construction.
        rng = random.Random(seed)

        for value, ratio in distribution.items():
            if value not in buckets:
                logger.warning(
                    "Value '%s' specified in distribution but not found in dataset",
                    value,
                )
                continue

            pool = buckets[value]
            # Target count, capped by what the bucket actually holds.
            take = min(int(population * ratio), len(pool))
            chosen.extend(rng.sample(pool, take))

        logger.debug(
            "Stratified dataset by field '%s' from %d to %d samples",
            field,
            len(self._samples),
            len(chosen),
        )
        return self._spawn(chosen)

    def shuffle(self, seed: int | None = None) -> BaseDataset:
        """Return a new dataset with the samples in random order.

        Args:
            seed: Random seed for reproducibility. When None, the module-level
                RNG is used (its global state is consumed, as before).

        Returns:
            New BaseDataset with shuffled samples.
        """
        reordered = list(self._samples)
        if seed is None:
            random.shuffle(reordered)
        else:
            random.Random(seed).shuffle(reordered)
        return self._spawn(reordered)

    def __len__(self) -> int:
        """Return number of samples in dataset."""
        return len(self._samples)

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Get sample by index."""
        return self._samples[idx]
262
+
263
+
264
# Public API of this module.
__all__ = ["BaseDataset"]