themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/config/runtime.py
@@ -0,0 +1,214 @@
+ """Runtime helpers for executing experiments from Hydra configs."""
+
+ from __future__ import annotations
+
+ from dataclasses import asdict
+ from pathlib import Path
+
+ from themis.core import entities as core_entities
+ from themis.datasets import create_dataset
+ from themis.experiment import math as math_experiment
+ from themis.experiment import mcq as mcq_experiment
+ from themis.experiment import orchestrator as experiment_orchestrator
+ from themis.experiment import storage as experiment_storage
+ from themis.providers import registry as provider_registry
+
+ from . import registry, schema
+
+
+ def run_experiment_from_config(
+     config: schema.ExperimentConfig,
+     *,
+     dataset: list[dict[str, object]] | None = None,
+     on_result=None,
+ ) -> experiment_orchestrator.ExperimentReport:
+     dataset_to_use = (
+         dataset
+         if dataset is not None
+         else _load_dataset(config.dataset, experiment_name=config.name)
+     )
+     experiment = _build_experiment(config)
+     return experiment.run(
+         dataset_to_use,
+         max_samples=config.max_samples,
+         run_id=config.run_id,
+         resume=config.resume,
+         on_result=on_result,
+     )
+
+
+ def summarize_report_for_config(
+     config: schema.ExperimentConfig,
+     report: experiment_orchestrator.ExperimentReport,
+ ) -> str:
+     if config.task in {
+         "math500",
+         "aime24",
+         "aime25",
+         "amc23",
+         "olympiadbench",
+         "beyondaime",
+     }:
+         return math_experiment.summarize_report(report)
+     if config.task in {"supergpqa", "mmlu_pro"}:
+         return mcq_experiment.summarize_report(report)
+     raise ValueError(f"Unsupported task '{config.task}' for summarization.")
+
+
+ def load_dataset_from_config(
+     config: schema.ExperimentConfig,
+ ) -> list[dict[str, object]]:
+     return _load_dataset(config.dataset, experiment_name=config.name)
+
+
+ def _build_experiment(
+     config: schema.ExperimentConfig,
+ ) -> experiment_orchestrator.ExperimentOrchestrator:
+     if config.task:
+         builder = registry.get_experiment_builder(config.task)
+         return builder(config)
+
+     raise ValueError(
+         "Experiment configuration must specify a 'task'. "
+         f"Available tasks: {', '.join(sorted(registry._EXPERIMENT_BUILDERS.keys()))}"
+     )
+
+
+ @registry.register_experiment_builder("math500")
+ @registry.register_experiment_builder("aime24")
+ @registry.register_experiment_builder("aime25")
+ @registry.register_experiment_builder("amc23")
+ @registry.register_experiment_builder("olympiadbench")
+ @registry.register_experiment_builder("beyondaime")
+ def _build_math_experiment(
+     config: schema.ExperimentConfig,
+ ) -> experiment_orchestrator.ExperimentOrchestrator:
+     # Use the specific path if provided, otherwise fall back to the default path
+     storage_path = config.storage.path or config.storage.default_path
+     storage = (
+         experiment_storage.ExperimentStorage(Path(storage_path))
+         if storage_path
+         else None
+     )
+     sampling_cfg = core_entities.SamplingConfig(
+         temperature=config.generation.sampling.temperature,
+         top_p=config.generation.sampling.top_p,
+         max_tokens=config.generation.sampling.max_tokens,
+     )
+     provider = provider_registry.create_provider(
+         config.generation.provider.name, **config.generation.provider.options
+     )
+     runner_options = asdict(config.generation.runner)
+
+     # Default to the configured task name; task_options may override it.
+     task_name = config.task or "math500"
+     if config.task_options and "task_name" in config.task_options:
+         task_name = config.task_options["task_name"]
+
+     return math_experiment.build_math500_zero_shot_experiment(
+         model_client=provider,
+         model_name=config.generation.model_identifier,
+         storage=storage,
+         sampling=sampling_cfg,
+         provider_name=config.generation.provider.name,
+         runner_options=runner_options,
+         task_name=task_name,
+     )
+
+
+ @registry.register_experiment_builder("supergpqa")
+ def _build_supergpqa_experiment(
+     config: schema.ExperimentConfig,
+ ) -> experiment_orchestrator.ExperimentOrchestrator:
+     return _build_mcq_experiment(config, "supergpqa", "supergpqa")
+
+
+ @registry.register_experiment_builder("mmlu_pro")
+ def _build_mmlu_pro_experiment(
+     config: schema.ExperimentConfig,
+ ) -> experiment_orchestrator.ExperimentOrchestrator:
+     return _build_mcq_experiment(config, "mmlu-pro", "mmlu_pro")
+
+
+ def _build_mcq_experiment(
+     config: schema.ExperimentConfig, dataset_name: str, task_id: str
+ ) -> experiment_orchestrator.ExperimentOrchestrator:
+     # Use the specific path if provided, otherwise fall back to the default path
+     storage_path = config.storage.path or config.storage.default_path
+     storage = (
+         experiment_storage.ExperimentStorage(Path(storage_path))
+         if storage_path
+         else None
+     )
+     sampling_cfg = core_entities.SamplingConfig(
+         temperature=config.generation.sampling.temperature,
+         top_p=config.generation.sampling.top_p,
+         max_tokens=config.generation.sampling.max_tokens,
+     )
+     provider = provider_registry.create_provider(
+         config.generation.provider.name, **config.generation.provider.options
+     )
+     runner_options = asdict(config.generation.runner)
+
+     return mcq_experiment.build_multiple_choice_json_experiment(
+         dataset_name=dataset_name,
+         task_id=task_id,
+         model_client=provider,
+         model_name=config.generation.model_identifier,
+         storage=storage,
+         sampling=sampling_cfg,
+         provider_name=config.generation.provider.name,
+         runner_options=runner_options,
+     )
+
+
+ def _load_dataset(
+     config: schema.DatasetConfig, *, experiment_name: str
+ ) -> list[dict[str, object]]:
+     """Load dataset samples using the dataset registry.
+
+     Args:
+         config: Dataset configuration
+         experiment_name: Name of the experiment (currently unused)
+
+     Returns:
+         List of sample dictionaries ready for generation
+     """
+     # Handle inline datasets (not in registry)
+     if config.source == "inline":
+         if not config.inline_samples:
+             raise ValueError(
+                 "dataset.inline_samples must contain at least one row when"
+                 " dataset.source='inline'."
+             )
+         return list(config.inline_samples)
+
+     # Require an explicit dataset_id: only the DatasetConfig is available
+     # here, so a missing dataset_id cannot be inferred from the task.
+     dataset_name = config.dataset_id
+     if not dataset_name:
+         raise ValueError(
+             "dataset.dataset_id must be provided when source is not 'inline'."
+         )
+
+     # Prepare options for dataset factory
+     options = {
+         "source": config.source,
+         "data_dir": config.data_dir,
+         "split": config.split,
+         "limit": config.limit,
+         "subjects": list(config.subjects) if config.subjects else None,
+     }
+
+     # Load samples via registry
+     return create_dataset(dataset_name, **options)
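
For orientation, here is a minimal, hypothetical usage sketch of the new runtime entry points. It assumes the inline-dataset path and the default fake provider defined in schema.py below; the sample field names ("problem", "answer") are illustrative only and are not confirmed by this diff.

from themis.config import runtime, schema

config = schema.ExperimentConfig(
    task="math500",
    dataset=schema.DatasetConfig(
        source="inline",
        # Hypothetical sample fields; the expected schema is defined by the
        # math500 pipeline, which this diff does not show.
        inline_samples=[{"problem": "What is 2 + 2?", "answer": "4"}],
    ),
)

# Builds the registered "math500" experiment and runs it end to end.
report = runtime.run_experiment_from_config(config)
print(runtime.summarize_report_for_config(config, report))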
themis/config/schema.py
@@ -0,0 +1,112 @@
+ """Structured configuration definitions for Hydra/OmegaConf."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any
+
+
+ @dataclass
+ class ProviderConfig:
+     name: str = "fake"
+     options: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class RunnerConfig:
+     max_parallel: int = 1
+     max_retries: int = 3
+     retry_initial_delay: float = 0.5
+     retry_backoff_multiplier: float = 2.0
+     retry_max_delay: float | None = 2.0
+
+
+ @dataclass
+ class SamplingConfig:
+     temperature: float = 0.0
+     top_p: float = 0.95
+     max_tokens: int = 512
+
+
+ @dataclass
+ class GenerationConfig:
+     model_identifier: str = "fake-math-llm"
+     provider: ProviderConfig = field(default_factory=ProviderConfig)
+     sampling: SamplingConfig = field(default_factory=SamplingConfig)
+     runner: RunnerConfig = field(default_factory=RunnerConfig)
+
+
+ @dataclass
+ class DatasetConfig:
+     source: str = "huggingface"
+     dataset_id: str | None = None
+     data_dir: str | None = None
+     limit: int | None = None
+     split: str = "test"
+     subjects: list[str] = field(default_factory=list)
+     inline_samples: list[dict[str, Any]] = field(default_factory=list)
+
+
+ @dataclass
+ class StorageConfig:
+     path: str | None = None
+     default_path: str | None = None  # New field for default storage path
+
+
+ @dataclass
+ class WandbConfig:
+     enable: bool = False
+     project: str | None = None
+     entity: str | None = None
+     tags: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class HuggingFaceHubConfig:
+     enable: bool = False
+     repository: str | None = None
+
+
+ @dataclass
+ class IntegrationsConfig:
+     wandb: WandbConfig = field(default_factory=WandbConfig)
+     huggingface_hub: HuggingFaceHubConfig = field(default_factory=HuggingFaceHubConfig)
+
+
+ @dataclass
+ class ExperimentConfig:
+     name: str = "math500_zero_shot"
+     dataset: DatasetConfig = field(default_factory=DatasetConfig)
+     generation: GenerationConfig = field(default_factory=GenerationConfig)
+     storage: StorageConfig = field(default_factory=StorageConfig)
+     integrations: IntegrationsConfig = field(default_factory=IntegrationsConfig)
+     max_samples: int | None = None
+     run_id: str | None = None
+     resume: bool = True
+     task: str | None = None
+     task_options: dict[str, Any] = field(default_factory=dict)
+
+     @classmethod
+     def from_file(cls, path: str | Path) -> ExperimentConfig:
+         """Load configuration from a file."""
+         from .loader import load_experiment_config
+
+         return load_experiment_config(Path(path))
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> ExperimentConfig:
+         """Create configuration from a dictionary."""
+         from omegaconf import OmegaConf
+
+         base = OmegaConf.structured(cls)
+         merged = OmegaConf.merge(base, OmegaConf.create(data))
+         return OmegaConf.to_object(merged)  # type: ignore
+
+     def to_file(self, path: str | Path) -> None:
+         """Save configuration to a file."""
+         from omegaconf import OmegaConf
+
+         conf = OmegaConf.structured(self)
+         Path(path).parent.mkdir(parents=True, exist_ok=True)
+         OmegaConf.save(conf, Path(path))
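
A short sketch of the round-trip these dataclasses support, assuming omegaconf is installed (both from_dict and to_file import it lazily). The provider key "litellm" and the model identifier are assumptions suggested by themis/generation/providers/litellm_provider.py in the file list, not values confirmed by this diff.

from themis.config.schema import ExperimentConfig

cfg = ExperimentConfig.from_dict(
    {
        "task": "mmlu_pro",
        "dataset": {"dataset_id": "mmlu_pro", "split": "test", "limit": 100},
        "generation": {
            "model_identifier": "gpt-4o-mini",  # hypothetical model name
            "provider": {"name": "litellm"},  # assumed provider registry key
        },
    }
)
cfg.to_file("configs/mmlu_pro.yaml")

# from_file delegates to themis.config.loader.load_experiment_config.
reloaded = ExperimentConfig.from_file("configs/mmlu_pro.yaml")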
themis/core/__init__.py
@@ -0,0 +1,5 @@
+ """Core datamodel for Themis."""
+
+ from . import entities, serialization
+
+ __all__ = ["entities", "serialization"]
themis/core/conversation.py
@@ -0,0 +1,354 @@
+ """Conversation primitives for multi-turn interactions.
+
+ This module provides abstractions for managing multi-turn conversations,
+ enabling research on dialogue systems, debugging interactions, and
+ agentic workflows.
+
+ Examples:
+     # Create a conversation
+     context = ConversationContext()
+     context.add_message("user", "What is 2+2?")
+     context.add_message("assistant", "2+2 equals 4.")
+     context.add_message("user", "What about 3+3?")
+
+     # Convert to prompt
+     prompt = context.to_prompt()
+
+     # Get conversation history
+     history = context.get_history(max_turns=2)
+ """
+
+ from __future__ import annotations
+
+ import time
+ from dataclasses import dataclass, field
+ from typing import Any, Callable, Literal
+
+ from themis.core import entities as core_entities
+ from themis.generation import templates
+
+ MessageRole = Literal["system", "user", "assistant", "tool"]
+
+
+ @dataclass
+ class Message:
+     """Single message in a conversation.
+
+     Attributes:
+         role: Message role (system/user/assistant/tool)
+         content: Message text content
+         metadata: Additional metadata (tool calls, timestamps, etc.)
+         timestamp: Unix timestamp when message was created
+     """
+
+     role: MessageRole
+     content: str
+     metadata: dict[str, Any] = field(default_factory=dict)
+     timestamp: float = field(default_factory=time.time)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert message to dictionary."""
+         return {
+             "role": self.role,
+             "content": self.content,
+             "metadata": self.metadata,
+             "timestamp": self.timestamp,
+         }
+
+
+ @dataclass
+ class ConversationContext:
+     """Maintains conversation state across turns.
+
+     This class manages the conversation history and provides utilities
+     for rendering conversations as prompts.
+
+     Examples:
+         context = ConversationContext()
+         context.add_message("system", "You are a helpful assistant.")
+         context.add_message("user", "Hello!")
+         context.add_message("assistant", "Hi! How can I help you?")
+
+         # Get history
+         messages = context.get_history()
+
+         # Render to prompt
+         prompt = context.to_prompt()
+     """
+
+     messages: list[Message] = field(default_factory=list)
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def add_message(self, role: MessageRole, content: str, **metadata: Any) -> None:
+         """Add a message to the conversation.
+
+         Args:
+             role: Message role (system/user/assistant/tool)
+             content: Message text content
+             **metadata: Additional metadata to attach to message
+         """
+         self.messages.append(Message(role=role, content=content, metadata=metadata))
+
+     def get_history(self, max_turns: int | None = None) -> list[Message]:
+         """Get conversation history.
+
+         Args:
+             max_turns: Maximum number of messages to return (from end)
+
+         Returns:
+             List of messages (most recent if limited)
+         """
+         if max_turns is None:
+             return list(self.messages)
+         return self.messages[-max_turns:]
+
+     def get_messages_by_role(self, role: MessageRole) -> list[Message]:
+         """Get all messages with a specific role.
+
+         Args:
+             role: Role to filter by
+
+         Returns:
+             List of messages with matching role
+         """
+         return [msg for msg in self.messages if msg.role == role]
+
+     def to_prompt(self, template: templates.PromptTemplate | None = None) -> str:
+         """Render conversation to prompt string.
+
+         Args:
+             template: Optional template for custom formatting
+
+         Returns:
+             Formatted prompt string
+         """
+         if template is not None:
+             return template.render(messages=self.messages)
+
+         # Default format: role-prefixed messages
+         lines = []
+         for msg in self.messages:
+             lines.append(f"{msg.role}: {msg.content}")
+
+         return "\n\n".join(lines)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert conversation to dictionary.
+
+         Returns:
+             Dictionary representation
+         """
+         return {
+             "messages": [msg.to_dict() for msg in self.messages],
+             "metadata": self.metadata,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> ConversationContext:
+         """Create conversation from dictionary.
+
+         Args:
+             data: Dictionary with messages and metadata
+
+         Returns:
+             ConversationContext instance
+         """
+         context = cls(metadata=data.get("metadata", {}))
+         for msg_data in data.get("messages", []):
+             context.messages.append(
+                 Message(
+                     role=msg_data["role"],
+                     content=msg_data["content"],
+                     metadata=msg_data.get("metadata", {}),
+                     timestamp=msg_data.get("timestamp", time.time()),
+                 )
+             )
+         return context
+
+     def __len__(self) -> int:
+         """Return number of messages in conversation."""
+         return len(self.messages)
+
+
+ @dataclass
+ class ConversationTask:
+     """Task for multi-turn conversation execution.
+
+     This extends the basic GenerationTask concept to support
+     multi-turn conversations with configurable stopping conditions.
+
+     Attributes:
+         context: Conversation context with message history
+         model: Model to use for generation
+         sampling: Sampling configuration
+         metadata: Additional metadata
+         reference: Optional reference for evaluation
+         max_turns: Maximum number of conversation turns
+         stop_condition: Optional function to determine when to stop
+     """
+
+     context: ConversationContext
+     model: core_entities.ModelSpec
+     sampling: core_entities.SamplingConfig
+     metadata: dict[str, Any] = field(default_factory=dict)
+     reference: core_entities.Reference | None = None
+     max_turns: int = 10
+     stop_condition: Callable[[ConversationContext], bool] | None = None
+
+     def should_stop(self) -> bool:
+         """Check if conversation should stop.
+
+         Returns:
+             True if stop condition is met or max turns reached
+         """
+         if len(self.context) >= self.max_turns:
+             return True
+
+         if self.stop_condition is not None:
+             return self.stop_condition(self.context)
+
+         return False
+
+
+ @dataclass
+ class ConversationTurn:
+     """Single turn in a conversation.
+
+     Attributes:
+         turn_number: Turn index (0-based)
+         user_message: User message for this turn (if any)
+         generation_record: Generation result for this turn
+         context_snapshot: Conversation context at this turn
+     """
+
+     turn_number: int
+     user_message: Message | None
+     generation_record: core_entities.GenerationRecord
+     context_snapshot: ConversationContext
+
+
+ @dataclass
+ class ConversationRecord:
+     """Complete record of a multi-turn conversation.
+
+     This is the result of running a ConversationTask through
+     a ConversationRunner.
+
+     Attributes:
+         task: Original conversation task
+         context: Final conversation context
+         turns: List of turns executed
+         metadata: Additional metadata (e.g., total turns, stop reason)
+     """
+
+     task: ConversationTask
+     context: ConversationContext
+     turns: list[ConversationTurn] = field(default_factory=list)
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def get_final_output(self) -> core_entities.ModelOutput | None:
+         """Get the final model output.
+
+         Returns:
+             Last turn's output, or None if no turns
+         """
+         if not self.turns:
+             return None
+         return self.turns[-1].generation_record.output
+
+     def get_all_outputs(self) -> list[core_entities.ModelOutput | None]:
+         """Get all model outputs from all turns.
+
+         Returns:
+             List of outputs (may contain None for failed turns)
+         """
+         return [turn.generation_record.output for turn in self.turns]
+
+     def total_turns(self) -> int:
+         """Get total number of turns executed.
+
+         Returns:
+             Number of turns
+         """
+         return len(self.turns)
+
+
+ # Common stop conditions
+
+
+ def stop_on_keyword(keyword: str) -> Callable[[ConversationContext], bool]:
+     """Create stop condition that triggers when keyword appears.
+
+     Args:
+         keyword: Keyword to look for in assistant messages
+
+     Returns:
+         Stop condition function
+     """
+
+     def condition(context: ConversationContext) -> bool:
+         if not context.messages:
+             return False
+         last_msg = context.messages[-1]
+         if last_msg.role == "assistant":
+             return keyword.lower() in last_msg.content.lower()
+         return False
+
+     return condition
+
+
+ def stop_on_pattern(
+     pattern: str,
+ ) -> Callable[[ConversationContext], bool]:
+     """Create stop condition that triggers when regex pattern matches.
+
+     Args:
+         pattern: Regex pattern to match
+
+     Returns:
+         Stop condition function
+     """
+     import re
+
+     compiled = re.compile(pattern, re.IGNORECASE)
+
+     def condition(context: ConversationContext) -> bool:
+         if not context.messages:
+             return False
+         last_msg = context.messages[-1]
+         if last_msg.role == "assistant":
+             return compiled.search(last_msg.content) is not None
+         return False
+
+     return condition
+
+
+ def stop_on_empty_response() -> Callable[[ConversationContext], bool]:
+     """Create stop condition that triggers on empty assistant response.
+
+     Returns:
+         Stop condition function
+     """
+
+     def condition(context: ConversationContext) -> bool:
+         if not context.messages:
+             return False
+         last_msg = context.messages[-1]
+         if last_msg.role == "assistant":
+             return not last_msg.content.strip()
+         return False
+
+     return condition
+
+
+ __all__ = [
+     "MessageRole",
+     "Message",
+     "ConversationContext",
+     "ConversationTask",
+     "ConversationTurn",
+     "ConversationRecord",
+     "stop_on_keyword",
+     "stop_on_pattern",
+     "stop_on_empty_response",
+ ]
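
To make the stop-condition plumbing concrete, here is a self-contained sketch that exercises only the classes and helpers added above, with no model calls involved:

from themis.core.conversation import ConversationContext, stop_on_pattern

context = ConversationContext()
context.add_message("system", "You are a math tutor.")
context.add_message("user", "Compute 12 * 7 and end with FINAL: <answer>.")
context.add_message("assistant", "12 * 7 = 84. FINAL: 84")

# Stop conditions only ever inspect the last message, and only when it
# came from the assistant.
done = stop_on_pattern(r"FINAL:\s*\d+")
assert done(context)

# Round-trip through the dict form used for persistence.
restored = ConversationContext.from_dict(context.to_dict())
assert len(restored) == len(context) == 3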