PyPI - themis-eval - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

themis/cli/__init__.py +5 -0
themis/cli/__main__.py +6 -0
themis/cli/commands/__init__.py +19 -0
themis/cli/commands/benchmarks.py +221 -0
themis/cli/commands/comparison.py +394 -0
themis/cli/commands/config_commands.py +244 -0
themis/cli/commands/cost.py +214 -0
themis/cli/commands/demo.py +68 -0
themis/cli/commands/info.py +90 -0
themis/cli/commands/leaderboard.py +362 -0
themis/cli/commands/math_benchmarks.py +318 -0
themis/cli/commands/mcq_benchmarks.py +207 -0
themis/cli/commands/sample_run.py +244 -0
themis/cli/commands/visualize.py +299 -0
themis/cli/main.py +93 -0
themis/cli/new_project.py +33 -0
themis/cli/utils.py +51 -0
themis/config/__init__.py +19 -0
themis/config/loader.py +27 -0
themis/config/registry.py +34 -0
themis/config/runtime.py +214 -0
themis/config/schema.py +112 -0
themis/core/__init__.py +5 -0
themis/core/conversation.py +354 -0
themis/core/entities.py +164 -0
themis/core/serialization.py +231 -0
themis/core/tools.py +393 -0
themis/core/types.py +141 -0
themis/datasets/__init__.py +273 -0
themis/datasets/base.py +264 -0
themis/datasets/commonsense_qa.py +174 -0
themis/datasets/competition_math.py +265 -0
themis/datasets/coqa.py +133 -0
themis/datasets/gpqa.py +190 -0
themis/datasets/gsm8k.py +123 -0
themis/datasets/gsm_symbolic.py +124 -0
themis/datasets/math500.py +122 -0
themis/datasets/med_qa.py +179 -0
themis/datasets/medmcqa.py +169 -0
themis/datasets/mmlu_pro.py +262 -0
themis/datasets/piqa.py +146 -0
themis/datasets/registry.py +201 -0
themis/datasets/schema.py +245 -0
themis/datasets/sciq.py +150 -0
themis/datasets/social_i_qa.py +151 -0
themis/datasets/super_gpqa.py +263 -0
themis/evaluation/__init__.py +1 -0
themis/evaluation/conditional.py +410 -0
themis/evaluation/extractors/__init__.py +19 -0
themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
themis/evaluation/extractors/exceptions.py +7 -0
themis/evaluation/extractors/identity_extractor.py +29 -0
themis/evaluation/extractors/json_field_extractor.py +45 -0
themis/evaluation/extractors/math_verify_extractor.py +37 -0
themis/evaluation/extractors/regex_extractor.py +43 -0
themis/evaluation/math_verify_utils.py +87 -0
themis/evaluation/metrics/__init__.py +21 -0
themis/evaluation/metrics/composite_metric.py +47 -0
themis/evaluation/metrics/consistency_metric.py +80 -0
themis/evaluation/metrics/exact_match.py +51 -0
themis/evaluation/metrics/length_difference_tolerance.py +33 -0
themis/evaluation/metrics/math_verify_accuracy.py +40 -0
themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
themis/evaluation/metrics/response_length.py +33 -0
themis/evaluation/metrics/rubric_judge_metric.py +134 -0
themis/evaluation/pipeline.py +49 -0
themis/evaluation/pipelines/__init__.py +15 -0
themis/evaluation/pipelines/composable_pipeline.py +357 -0
themis/evaluation/pipelines/standard_pipeline.py +288 -0
themis/evaluation/reports.py +293 -0
themis/evaluation/statistics/__init__.py +53 -0
themis/evaluation/statistics/bootstrap.py +79 -0
themis/evaluation/statistics/confidence_intervals.py +121 -0
themis/evaluation/statistics/distributions.py +207 -0
themis/evaluation/statistics/effect_sizes.py +124 -0
themis/evaluation/statistics/hypothesis_tests.py +305 -0
themis/evaluation/statistics/types.py +139 -0
themis/evaluation/strategies/__init__.py +13 -0
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
themis/evaluation/strategies/evaluation_strategy.py +24 -0
themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
themis/experiment/__init__.py +5 -0
themis/experiment/builder.py +151 -0
themis/experiment/cache_manager.py +129 -0
themis/experiment/comparison.py +631 -0
themis/experiment/cost.py +310 -0
themis/experiment/definitions.py +62 -0
themis/experiment/export.py +690 -0
themis/experiment/export_csv.py +159 -0
themis/experiment/integration_manager.py +104 -0
themis/experiment/math.py +192 -0
themis/experiment/mcq.py +169 -0
themis/experiment/orchestrator.py +373 -0
themis/experiment/pricing.py +317 -0
themis/experiment/storage.py +255 -0
themis/experiment/visualization.py +588 -0
themis/generation/__init__.py +1 -0
themis/generation/agentic_runner.py +420 -0
themis/generation/batching.py +254 -0
themis/generation/clients.py +143 -0
themis/generation/conversation_runner.py +236 -0
themis/generation/plan.py +456 -0
themis/generation/providers/litellm_provider.py +221 -0
themis/generation/providers/vllm_provider.py +135 -0
themis/generation/router.py +34 -0
themis/generation/runner.py +207 -0
themis/generation/strategies.py +98 -0
themis/generation/templates.py +71 -0
themis/generation/turn_strategies.py +393 -0
themis/generation/types.py +9 -0
themis/integrations/__init__.py +0 -0
themis/integrations/huggingface.py +61 -0
themis/integrations/wandb.py +65 -0
themis/interfaces/__init__.py +83 -0
themis/project/__init__.py +20 -0
themis/project/definitions.py +98 -0
themis/project/patterns.py +230 -0
themis/providers/__init__.py +5 -0
themis/providers/registry.py +39 -0
themis/utils/api_generator.py +379 -0
themis/utils/cost_tracking.py +376 -0
themis/utils/dashboard.py +452 -0
themis/utils/logging_utils.py +41 -0
themis/utils/progress.py +58 -0
themis/utils/tracing.py +320 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
themis_eval-0.1.1.dist-info/RECORD +134 -0
themis_eval-0.1.0.dist-info/RECORD +0 -8
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0

themis/generation/turn_strategies.py ADDED Viewed

@@ -0,0 +1,393 @@
+"""Turn strategies for multi-turn conversations.
+This module provides strategies for determining the next turn in a conversation.
+Strategies can be fixed (predefined sequences), dynamic (generated based on context),
+or interactive.
+Examples:
+    # Fixed sequence
+    strategy = FixedSequenceTurnStrategy([
+        "What is 2+2?",
+        "What about 3+3?",
+        "And 5+5?"
+    ])
+    # Dynamic strategy
+    def planner(context, last_record):
+        if len(context) < 2:
+            return "Can you explain more?"
+        return None  # Stop
+    strategy = DynamicTurnStrategy(planner)
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Callable, Protocol
+from themis.core import conversation as conv
+from themis.core import entities as core_entities
+class TurnStrategy(Protocol):
+    """Strategy for determining the next turn in a conversation.
+    A turn strategy decides what the user's next message should be
+    based on the current conversation state and the last model response.
+    This is a structural ``Protocol``: any object exposing a compatible
+    ``next_turn`` method satisfies it without inheriting from this class.
+    """
+    def next_turn(
+        self,
+        context: conv.ConversationContext,
+        last_record: core_entities.GenerationRecord,
+    ) -> str | None:
+        """Determine the next user message.
+        Args:
+            context: Current conversation context
+            last_record: Last generation record
+        Returns:
+            Next user message, or None to end conversation
+        """
+        # Protocol stub: implementations supply the body.
+        ...
+@dataclass
+class FixedSequenceTurnStrategy:
+    """Scripted strategy that replays a fixed list of user messages in order.
+    Each call to ``next_turn`` hands out the next unread message and advances
+    an internal cursor; once the list is used up it returns None.
+    Examples:
+        strategy = FixedSequenceTurnStrategy([
+            "Hello!",
+            "How are you?",
+            "Goodbye!"
+        ])
+    """
+    messages: list[str]
+    # Position of the next message to hand out.
+    _index: int = 0
+    def next_turn(
+        self,
+        context: conv.ConversationContext,
+        last_record: core_entities.GenerationRecord,
+    ) -> str | None:
+        """Hand out the next scripted message.
+        Args:
+            context: Current conversation context (not consulted)
+            last_record: Last generation record (not consulted)
+        Returns:
+            The next message, or None once every message has been returned
+        """
+        cursor = self._index
+        if cursor < len(self.messages):
+            upcoming = self.messages[cursor]
+            self._index = cursor + 1
+            return upcoming
+        return None
+    def reset(self) -> None:
+        """Rewind the cursor so the sequence starts over."""
+        self._index = 0
+@dataclass
+class DynamicTurnStrategy:
+    """Delegate the choice of the next user message to a callable.
+    The ``planner`` receives the conversation context and the last generation
+    record and returns the next message, or None to end the conversation.
+    Examples:
+        def planner(context, record):
+            outputs = [msg.content for msg in context.get_messages_by_role("assistant")]
+            if "error" in outputs[-1].lower():
+                return "Can you try again?"
+            elif len(context) >= 10:
+                return None  # Stop after 10 messages
+            else:
+                return "Please continue."
+        strategy = DynamicTurnStrategy(planner)
+    """
+    planner: Callable[
+        [conv.ConversationContext, core_entities.GenerationRecord], str | None
+    ]
+    def next_turn(
+        self,
+        context: conv.ConversationContext,
+        last_record: core_entities.GenerationRecord,
+    ) -> str | None:
+        """Ask the planner for the next message.
+        Args:
+            context: Current conversation context
+            last_record: Last generation record
+        Returns:
+            Whatever the planner returns: a message, or None to stop
+        """
+        decide = self.planner
+        return decide(context, last_record)
+@dataclass
+class RepeatUntilSuccessTurnStrategy:
+    """Ask one question repeatedly until the answer passes a check.
+    The question is always asked once. After that it is re-asked until
+    ``success_checker`` accepts the last response text or the question has
+    been asked ``max_attempts`` times.
+    Examples:
+        strategy = RepeatUntilSuccessTurnStrategy(
+            question="What is 2+2?",
+            success_checker=lambda output: "4" in output,
+            max_attempts=5
+        )
+    """
+    question: str
+    success_checker: Callable[[str], bool]
+    max_attempts: int = 5
+    # Number of times the question has been handed out so far.
+    _attempts: int = 0
+    def next_turn(
+        self,
+        context: conv.ConversationContext,
+        last_record: core_entities.GenerationRecord,
+    ) -> str | None:
+        """Return the question again, or None when it is time to stop.
+        Args:
+            context: Current conversation context (not consulted)
+            last_record: Last generation record; ignored on the first call
+        Returns:
+            The question, or None after a successful response or once
+            the attempt budget is used up
+        """
+        already_asked = self._attempts != 0
+        if already_asked:
+            # Only a record that has an output can count as a success.
+            output = last_record.output
+            if output and self.success_checker(output.text):
+                return None
+            if self._attempts >= self.max_attempts:
+                return None
+        self._attempts += 1
+        return self.question
+    def reset(self) -> None:
+        """Forget previous attempts so the strategy can be reused."""
+        self._attempts = 0
+@dataclass
+class ConditionalTurnStrategy:
+    """Choose next message based on conditions.
+    Conditions are evaluated in order; the message paired with the first
+    condition that returns a truthy value is used. A message of None ends
+    the conversation. If no condition matches, ``default`` is returned.
+    A condition that raises is treated as "did not match" so that simple
+    lambdas (for example ones reading ``rec.output.text`` when there is no
+    output) do not abort the conversation; the failure is logged at DEBUG
+    level instead of being silently discarded.
+    Examples:
+        strategy = ConditionalTurnStrategy(
+            conditions=[
+                (lambda ctx, rec: "error" in rec.output.text.lower(), "Please try again."),
+                (lambda ctx, rec: len(ctx) >= 5, None),  # Stop after 5 turns
+            ],
+            default="Continue."
+        )
+    """
+    conditions: list[
+        tuple[
+            Callable[[conv.ConversationContext, core_entities.GenerationRecord], bool],
+            str | None,
+        ]
+    ]
+    default: str | None = None
+    def next_turn(
+        self,
+        context: conv.ConversationContext,
+        last_record: core_entities.GenerationRecord,
+    ) -> str | None:
+        """Evaluate conditions and return matching message.
+        Args:
+            context: Current conversation context
+            last_record: Last generation record
+        Returns:
+            Message from first matching condition, or default
+        """
+        # Imported here so the block does not depend on a file-level import.
+        import logging
+        for position, (condition, message) in enumerate(self.conditions):
+            try:
+                # bool() stays inside the try so a failing __bool__ is skipped too.
+                matched = bool(condition(context, last_record))
+            except Exception:
+                # Deliberate best-effort: skip the broken condition, but leave a trace.
+                logging.getLogger(__name__).debug(
+                    "Turn condition %d raised; skipping it", position, exc_info=True
+                )
+                continue
+            if matched:
+                return message
+        return self.default
+@dataclass
+class ChainedTurnStrategy:
+    """Consult several strategies in order and use the first answer.
+    Strategies are asked one after another; the first one that returns
+    something other than None wins and later strategies are not asked.
+    Examples:
+        strategy = ChainedTurnStrategy([
+            FixedSequenceTurnStrategy(["Hello", "How are you?"]),
+            DynamicTurnStrategy(lambda ctx, rec: "Goodbye" if len(ctx) > 5 else None)
+        ])
+    """
+    strategies: list[TurnStrategy]
+    def next_turn(
+        self,
+        context: conv.ConversationContext,
+        last_record: core_entities.GenerationRecord,
+    ) -> str | None:
+        """Return the first non-None message produced by the chain.
+        Args:
+            context: Current conversation context
+            last_record: Last generation record
+        Returns:
+            First non-None message, or None if every strategy returns None
+        """
+        # Lazy generator: strategies after the first hit are never invoked.
+        replies = (
+            candidate.next_turn(context, last_record) for candidate in self.strategies
+        )
+        return next((reply for reply in replies if reply is not None), None)
+# Helper functions for creating common strategies
+def create_qa_strategy(questions: list[str]) -> FixedSequenceTurnStrategy:
+    """Build a scripted strategy that asks the given questions in order.
+    Args:
+        questions: Questions to ask, in order (the list is used as-is, not copied)
+    Returns:
+        FixedSequenceTurnStrategy positioned at the first question
+    """
+    scripted = FixedSequenceTurnStrategy(messages=questions)
+    return scripted
+def create_max_turns_strategy(
+    max_turns: int, message: str = "Continue."
+) -> DynamicTurnStrategy:
+    """Build a strategy that keeps sending one message until a size limit.
+    Args:
+        max_turns: Stop once ``len(context)`` reaches this value
+        message: Message to send on every turn before the limit
+    Returns:
+        DynamicTurnStrategy that returns None once the limit is reached
+    """
+    def _until_limit(
+        context: conv.ConversationContext, record: core_entities.GenerationRecord
+    ) -> str | None:
+        # The limit is measured with len(context), whatever that counts.
+        return None if len(context) >= max_turns else message
+    return DynamicTurnStrategy(planner=_until_limit)
+def create_keyword_stop_strategy(
+    keywords: list[str], message: str = "Continue."
+) -> DynamicTurnStrategy:
+    """Build a strategy that ends the conversation on a keyword hit.
+    Matching is a case-insensitive substring test against the text of the
+    last response. A record without output never triggers a stop.
+    Args:
+        keywords: Keywords that end the conversation when found
+        message: Message to send on every turn that does not stop
+    Returns:
+        DynamicTurnStrategy that returns None when a keyword is found
+    """
+    def _stop_on_keyword(
+        context: conv.ConversationContext, record: core_entities.GenerationRecord
+    ) -> str | None:
+        output = record.output
+        if not output:
+            return message
+        haystack = output.text.lower()
+        for keyword in keywords:
+            if keyword.lower() in haystack:
+                return None
+        return message
+    return DynamicTurnStrategy(planner=_stop_on_keyword)
+# Prompt perturbation and seed helpers for robustness sweeps
+import random
+def set_sampling_seed(task_metadata: dict[str, object], seed: int) -> dict[str, object]:
+    """Return a copy of the metadata carrying a 'sampling_seed' entry.
+    The input mapping is left untouched. The seed is coerced with ``int``.
+    Storing the seed is only a convention; nothing here makes a provider
+    honor it.
+    """
+    enriched = dict(task_metadata)
+    enriched.update(sampling_seed=int(seed))
+    return enriched
+def perturb_prompt(text: str, *, seed: int | None = None, max_changes: int = 2) -> str:
+    """Apply small, semantics-preserving perturbations to a prompt.
+    Up to ``max_changes`` of the following are applied, each with
+    probability 0.5: doubling the first "?" and inserting one polite filler
+    ("please", "kindly", "if possible") before a randomly chosen word.
+    Existing whitespace (newlines, indentation, repeated spaces) is kept
+    exactly as written, so multi-line prompts keep their layout.
+    Args:
+        text: Prompt to perturb
+        seed: Seed for the private random generator; same seed, same result
+        max_changes: Upper bound on the number of perturbations applied
+    Returns:
+        The perturbed prompt (possibly identical to ``text``)
+    """
+    # Imported here so the block does not depend on a file-level import.
+    import re
+    rng = random.Random(seed)
+    t = text
+    changes = 0
+    # Optional punctuation swap
+    if "?" in t and changes < max_changes and rng.random() < 0.5:
+        t = t.replace("?", "??", 1)
+        changes += 1
+    # Optional polite filler insertion
+    fillers = ["please", "kindly", "if possible"]
+    if changes < max_changes and rng.random() < 0.5:
+        # The capturing group keeps whitespace runs as tokens, so joining the
+        # tokens back reproduces the original spacing exactly.
+        tokens = re.split(r"(\s+)", t)
+        word_slots = [
+            slot for slot, token in enumerate(tokens) if token and not token.isspace()
+        ]
+        if word_slots:
+            # Same RNG calls, in the same order, as choosing an index into
+            # the word list and then picking a filler.
+            idx = rng.randint(0, len(word_slots) - 1)
+            tokens.insert(word_slots[idx], rng.choice(fillers) + " ")
+            t = "".join(tokens)
+            changes += 1
+    return t
+def create_prompt_variants(base_text: str, *, count: int, seed: int) -> list[str]:
+    """Produce perturbed variants of a prompt, reproducibly for a given seed.
+    A generator seeded with ``seed`` draws one sub-seed per variant, and
+    each sub-seed drives one ``perturb_prompt`` call. At least one variant
+    is always returned, even when ``count`` is zero or negative.
+    """
+    rng = random.Random(seed)
+    variants: list[str] = []
+    for _ in range(max(1, count)):
+        sub_seed = rng.randint(0, 1_000_000)
+        variants.append(perturb_prompt(base_text, seed=sub_seed))
+    return variants

themis/generation/types.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Backwards-compatible aliases for core entities."""
+from themis.core import entities as core_entities
+# Legacy names kept for backwards compatibility; each is bound to the
+# corresponding object in themis.core.entities, not to a copy or subclass.
+SamplingParameters = core_entities.SamplingConfig
+ModelOutput = core_entities.ModelOutput
+GenerationError = core_entities.ModelError
+GenerationRequest = core_entities.GenerationTask
+GenerationResult = core_entities.GenerationRecord

themis/integrations/__init__.py ADDED Viewed

File without changes

themis/integrations/huggingface.py ADDED Viewed

@@ -0,0 +1,61 @@
+from __future__ import annotations
+import json
+from dataclasses import asdict, is_dataclass
+from pathlib import Path
+from huggingface_hub import HfApi
+from themis.config.schema import HuggingFaceHubConfig
+from themis.core.entities import ExperimentReport
+def to_dict(obj):
+    """Recursively convert an object into plain dicts and lists.
+    Dataclass instances go through ``dataclasses.asdict``; other instances
+    exposing a ``to_dict`` method use that method; lists and tuples become
+    lists and dict values are converted recursively. Anything else,
+    including classes themselves, is returned unchanged.
+    Args:
+        obj: Value to convert
+    Returns:
+        The converted value
+    """
+    # is_dataclass() and hasattr() are also true for classes, but asdict()
+    # and an unbound to_dict() both need an instance.
+    is_instance = not isinstance(obj, type)
+    if is_instance and is_dataclass(obj):
+        return asdict(obj)
+    if is_instance and hasattr(obj, "to_dict"):
+        return obj.to_dict()
+    if isinstance(obj, (list, tuple)):
+        return [to_dict(item) for item in obj]
+    if isinstance(obj, dict):
+        return {key: to_dict(value) for key, value in obj.items()}
+    return obj
+class HuggingFaceHubUploader:
+    """Push an experiment report and its generation records to a Hub dataset repo.
+    Files are first written as JSON under the given storage path and then
+    uploaded under a folder named after the report's ``run_id`` metadata.
+    """
+    def __init__(self, config: HuggingFaceHubConfig):
+        self.config = config
+        self.api = HfApi()
+    def _write_and_upload(self, payload, local_path: Path, path_in_repo: str) -> None:
+        """Write ``payload`` as JSON to ``local_path`` and upload that file."""
+        with local_path.open("w") as handle:
+            json.dump(payload, handle, indent=4)
+        self.api.upload_file(
+            path_or_fileobj=str(local_path),
+            path_in_repo=path_in_repo,
+            repo_id=self.config.repository,
+            repo_type="dataset",
+        )
+    def upload_results(self, report: ExperimentReport, storage_path: Path) -> None:
+        """Upload the full report, then one JSON file per generation record.
+        Does nothing unless uploading is enabled and a repository is set.
+        Args:
+            report: Experiment report to serialize and upload
+            storage_path: Local directory where the JSON files are written
+        """
+        if not (self.config.enable and self.config.repository):
+            return
+        # Full report first.
+        self._write_and_upload(
+            to_dict(report),
+            storage_path / "report.json",
+            f"{report.metadata.get('run_id')}/report.json",
+        )
+        # Then each generation record, named after its dataset_id metadata.
+        for record in report.generation_results:
+            payload = to_dict(record)
+            dataset_id = record.task.metadata.get("dataset_id")
+            self._write_and_upload(
+                payload,
+                storage_path / f"{dataset_id}.json",
+                f"{report.metadata.get('run_id')}/generations/{dataset_id}.json",
+            )

themis/integrations/wandb.py ADDED Viewed

@@ -0,0 +1,65 @@
+from __future__ import annotations
+import wandb
+from themis.config.schema import WandbConfig
+from themis.core.entities import ExperimentReport
+class WandbTracker:
+    """Report experiment configuration and results to Weights & Biases.
+    Every method is a no-op unless ``config.enable`` is truthy.
+    """
+    def __init__(self, config: WandbConfig):
+        self.config = config
+    def init(self, experiment_config: dict) -> None:
+        """Start a W&B run using the project, entity and tags from the config.
+        Args:
+            experiment_config: Passed through as the run's ``config``
+        """
+        if not self.config.enable:
+            return
+        wandb.init(
+            project=self.config.project,
+            entity=self.config.entity,
+            tags=self.config.tags,
+            config=experiment_config,
+        )
+    def log_results(self, report: ExperimentReport) -> None:
+        """Log summary counters, metric means and a per-sample results table.
+        Args:
+            report: Experiment report whose metadata, metrics and records are logged
+        """
+        if not self.config.enable:
+            return
+        summary = {
+            "total_samples": report.metadata.get("total_samples"),
+            "successful_generations": report.metadata.get("successful_generations"),
+            "failed_generations": report.metadata.get("failed_generations"),
+            "evaluation_failures": report.metadata.get("evaluation_failures"),
+        }
+        for name, aggregate in report.evaluation_report.metrics.items():
+            summary[f"{name}_mean"] = aggregate.mean
+        wandb.summary.update(summary)
+        records_table = wandb.Table(
+            columns=[
+                "sample_id",
+                "prompt",
+                "raw_response",
+                "parsed_response",
+                "error",
+                "metric_scores",
+            ]
+        )
+        # Index evaluation records once instead of scanning them for every
+        # generation record. setdefault keeps the first record per sample_id.
+        # NOTE(review): assumes sample_id values are hashable - confirm.
+        eval_by_sample_id = {}
+        for candidate in report.evaluation_report.records:
+            eval_by_sample_id.setdefault(candidate.sample_id, candidate)
+        for record in report.generation_results:
+            dataset_id = record.task.metadata.get("dataset_id")
+            eval_record = eval_by_sample_id.get(dataset_id)
+            records_table.add_data(
+                dataset_id,
+                record.task.prompt,
+                [resp.text for resp in record.responses],
+                eval_record.parsed_response if eval_record else None,
+                record.error.message if record.error else None,
+                {s.metric_name: s.value for s in eval_record.scores}
+                if eval_record
+                else None,
+            )
+        wandb.log({"generation_results": records_table})

themis/interfaces/__init__.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""Interfaces (ports) that external adapters must implement."""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import Any, Iterable, Protocol, Sequence, runtime_checkable
+from themis.core import entities
+class ModelProvider(ABC):
+    """Abstract interface for anything capable of fulfilling generation tasks.
+    Subclasses must implement ``generate``; the class cannot be instantiated
+    until they do.
+    """
+    @abstractmethod
+    def generate(
+        self, task: entities.GenerationTask
+    ) -> entities.GenerationRecord:  # pragma: no cover - abstract
+        """Fulfil one generation task.
+        Args:
+            task: The generation task to run
+        Returns:
+            The generation record produced for ``task``
+        """
+        raise NotImplementedError
+@runtime_checkable
+class DatasetAdapter(Protocol):
+    """Protocol for dataset adapters that produce raw samples for experiments.
+    This is a structural protocol that can be satisfied by any class implementing
+    the required methods, without explicit inheritance. The @runtime_checkable
+    decorator allows isinstance() checks at runtime.
+    Required Methods:
+        iter_samples: Returns an iterable of sample dictionaries
+    Example:
+        >>> class MyDataset:
+        ...     def iter_samples(self):
+        ...         return iter([{"id": "1", "text": "sample"}])
+        ...
+        >>> isinstance(MyDataset(), DatasetAdapter)  # True at runtime
+    Note:
+        Classes do not need to explicitly inherit from this protocol.
+        Duck typing is sufficient - any class with an iter_samples() method
+        will be recognized as a DatasetAdapter at runtime.
+        The runtime isinstance() check only looks for the presence of the
+        method; it does not verify its signature or return type.
+    """
+    def iter_samples(self) -> Iterable[dict[str, Any]]:  # pragma: no cover - protocol
+        """Iterate over dataset samples.
+        Returns:
+            Iterable of dictionaries, each representing a dataset sample
+        Example:
+            >>> for sample in dataset.iter_samples():
+            ...     print(sample["id"])
+        """
+        # Protocol stub: implementations supply the body.
+        ...
+class Extractor(Protocol):
+    """Structural protocol for objects that derive a value from raw output text."""
+    def extract(self, raw_output: str) -> Any:  # pragma: no cover - protocol
+        """Return the value extracted from ``raw_output``.
+        Args:
+            raw_output: Raw text to extract from
+        Returns:
+            The extracted value; its type is up to the implementation
+        """
+        # Protocol stub: implementations supply the body.
+        ...
+class Metric(ABC):
+    """Abstract base class for metrics that score a prediction.
+    Subclasses must implement ``compute``.
+    """
+    # Declared without a value here; subclasses are expected to set it.
+    name: str
+    # Defaults to True. Presumably tells callers whether ``references`` must
+    # be supplied - TODO confirm against the evaluation pipeline.
+    requires_reference: bool = True
+    @abstractmethod
+    def compute(
+        self,
+        *,
+        prediction: Any,
+        references: Sequence[Any],
+        metadata: dict[str, Any] | None = None,
+    ) -> entities.MetricScore:  # pragma: no cover - abstract
+        """Score one prediction. All arguments are keyword-only.
+        Args:
+            prediction: The value being scored
+            references: Reference values to score against
+            metadata: Optional extra information for the metric
+        Returns:
+            The resulting metric score
+        """
+        raise NotImplementedError
+# Names exported by ``from themis.interfaces import *``.
+__all__ = [
+    "ModelProvider",
+    "DatasetAdapter",
+    "Extractor",
+    "Metric",
+]

themis/project/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""Project helpers for managing experiment collections."""
+from themis.project.definitions import Project, ProjectExperiment
+from themis.project.patterns import (
+    AblationChart,
+    AblationChartPoint,
+    AblationVariant,
+    XAbationPattern,
+    XAbationPatternApplication,
+)
+# Names re-exported from themis.project.definitions and themis.project.patterns.
+# NOTE(review): "XAbation..." looks like a misspelling of "XAblation..."; these
+# names must match themis.project.patterns, so confirm there before renaming.
+__all__ = [
+    "Project",
+    "ProjectExperiment",
+    "AblationChart",
+    "AblationChartPoint",
+    "AblationVariant",
+    "XAbationPattern",
+    "XAbationPatternApplication",
+]

themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl