PyPI - themis-eval - Versions diffs - 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (158) hide show

themis/__init__.py +12 -1
themis/_version.py +2 -2
themis/api.py +343 -0
themis/backends/__init__.py +17 -0
themis/backends/execution.py +197 -0
themis/backends/storage.py +260 -0
themis/cli/__init__.py +5 -0
themis/cli/__main__.py +6 -0
themis/cli/commands/__init__.py +19 -0
themis/cli/commands/benchmarks.py +221 -0
themis/cli/commands/comparison.py +394 -0
themis/cli/commands/config_commands.py +244 -0
themis/cli/commands/cost.py +214 -0
themis/cli/commands/demo.py +68 -0
themis/cli/commands/info.py +90 -0
themis/cli/commands/leaderboard.py +362 -0
themis/cli/commands/math_benchmarks.py +318 -0
themis/cli/commands/mcq_benchmarks.py +207 -0
themis/cli/commands/results.py +252 -0
themis/cli/commands/sample_run.py +244 -0
themis/cli/commands/visualize.py +299 -0
themis/cli/main.py +463 -0
themis/cli/new_project.py +33 -0
themis/cli/utils.py +51 -0
themis/comparison/__init__.py +25 -0
themis/comparison/engine.py +348 -0
themis/comparison/reports.py +283 -0
themis/comparison/statistics.py +402 -0
themis/config/__init__.py +19 -0
themis/config/loader.py +27 -0
themis/config/registry.py +34 -0
themis/config/runtime.py +214 -0
themis/config/schema.py +112 -0
themis/core/__init__.py +5 -0
themis/core/conversation.py +354 -0
themis/core/entities.py +184 -0
themis/core/serialization.py +231 -0
themis/core/tools.py +393 -0
themis/core/types.py +141 -0
themis/datasets/__init__.py +273 -0
themis/datasets/base.py +264 -0
themis/datasets/commonsense_qa.py +174 -0
themis/datasets/competition_math.py +265 -0
themis/datasets/coqa.py +133 -0
themis/datasets/gpqa.py +190 -0
themis/datasets/gsm8k.py +123 -0
themis/datasets/gsm_symbolic.py +124 -0
themis/datasets/math500.py +122 -0
themis/datasets/med_qa.py +179 -0
themis/datasets/medmcqa.py +169 -0
themis/datasets/mmlu_pro.py +262 -0
themis/datasets/piqa.py +146 -0
themis/datasets/registry.py +201 -0
themis/datasets/schema.py +245 -0
themis/datasets/sciq.py +150 -0
themis/datasets/social_i_qa.py +151 -0
themis/datasets/super_gpqa.py +263 -0
themis/evaluation/__init__.py +1 -0
themis/evaluation/conditional.py +410 -0
themis/evaluation/extractors/__init__.py +19 -0
themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
themis/evaluation/extractors/exceptions.py +7 -0
themis/evaluation/extractors/identity_extractor.py +29 -0
themis/evaluation/extractors/json_field_extractor.py +45 -0
themis/evaluation/extractors/math_verify_extractor.py +37 -0
themis/evaluation/extractors/regex_extractor.py +43 -0
themis/evaluation/math_verify_utils.py +87 -0
themis/evaluation/metrics/__init__.py +21 -0
themis/evaluation/metrics/code/__init__.py +19 -0
themis/evaluation/metrics/code/codebleu.py +144 -0
themis/evaluation/metrics/code/execution.py +280 -0
themis/evaluation/metrics/code/pass_at_k.py +181 -0
themis/evaluation/metrics/composite_metric.py +47 -0
themis/evaluation/metrics/consistency_metric.py +80 -0
themis/evaluation/metrics/exact_match.py +51 -0
themis/evaluation/metrics/length_difference_tolerance.py +33 -0
themis/evaluation/metrics/math_verify_accuracy.py +40 -0
themis/evaluation/metrics/nlp/__init__.py +21 -0
themis/evaluation/metrics/nlp/bertscore.py +138 -0
themis/evaluation/metrics/nlp/bleu.py +129 -0
themis/evaluation/metrics/nlp/meteor.py +153 -0
themis/evaluation/metrics/nlp/rouge.py +136 -0
themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
themis/evaluation/metrics/response_length.py +33 -0
themis/evaluation/metrics/rubric_judge_metric.py +134 -0
themis/evaluation/pipeline.py +49 -0
themis/evaluation/pipelines/__init__.py +15 -0
themis/evaluation/pipelines/composable_pipeline.py +357 -0
themis/evaluation/pipelines/standard_pipeline.py +348 -0
themis/evaluation/reports.py +293 -0
themis/evaluation/statistics/__init__.py +53 -0
themis/evaluation/statistics/bootstrap.py +79 -0
themis/evaluation/statistics/confidence_intervals.py +121 -0
themis/evaluation/statistics/distributions.py +207 -0
themis/evaluation/statistics/effect_sizes.py +124 -0
themis/evaluation/statistics/hypothesis_tests.py +305 -0
themis/evaluation/statistics/types.py +139 -0
themis/evaluation/strategies/__init__.py +13 -0
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
themis/evaluation/strategies/evaluation_strategy.py +24 -0
themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
themis/experiment/__init__.py +5 -0
themis/experiment/builder.py +151 -0
themis/experiment/cache_manager.py +134 -0
themis/experiment/comparison.py +631 -0
themis/experiment/cost.py +310 -0
themis/experiment/definitions.py +62 -0
themis/experiment/export.py +798 -0
themis/experiment/export_csv.py +159 -0
themis/experiment/integration_manager.py +104 -0
themis/experiment/math.py +192 -0
themis/experiment/mcq.py +169 -0
themis/experiment/orchestrator.py +415 -0
themis/experiment/pricing.py +317 -0
themis/experiment/storage.py +1458 -0
themis/experiment/visualization.py +588 -0
themis/generation/__init__.py +1 -0
themis/generation/agentic_runner.py +420 -0
themis/generation/batching.py +254 -0
themis/generation/clients.py +143 -0
themis/generation/conversation_runner.py +236 -0
themis/generation/plan.py +456 -0
themis/generation/providers/litellm_provider.py +221 -0
themis/generation/providers/vllm_provider.py +135 -0
themis/generation/router.py +34 -0
themis/generation/runner.py +207 -0
themis/generation/strategies.py +98 -0
themis/generation/templates.py +71 -0
themis/generation/turn_strategies.py +393 -0
themis/generation/types.py +9 -0
themis/integrations/__init__.py +0 -0
themis/integrations/huggingface.py +72 -0
themis/integrations/wandb.py +77 -0
themis/interfaces/__init__.py +169 -0
themis/presets/__init__.py +10 -0
themis/presets/benchmarks.py +354 -0
themis/presets/models.py +190 -0
themis/project/__init__.py +20 -0
themis/project/definitions.py +98 -0
themis/project/patterns.py +230 -0
themis/providers/__init__.py +5 -0
themis/providers/registry.py +39 -0
themis/server/__init__.py +28 -0
themis/server/app.py +337 -0
themis/utils/api_generator.py +379 -0
themis/utils/cost_tracking.py +376 -0
themis/utils/dashboard.py +452 -0
themis/utils/logging_utils.py +41 -0
themis/utils/progress.py +58 -0
themis/utils/tracing.py +320 -0
themis_eval-0.2.0.dist-info/METADATA +596 -0
themis_eval-0.2.0.dist-info/RECORD +157 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
themis_eval-0.1.0.dist-info/METADATA +0 -758
themis_eval-0.1.0.dist-info/RECORD +0 -8
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0

themis/generation/clients.py ADDED Viewed

@@ -0,0 +1,143 @@
+"""Model provider implementations used for experiments."""
+from __future__ import annotations
+import json
+import math
+import random
+import re
+from typing import Tuple
+from themis.core import entities as core_entities
+from themis.interfaces import ModelProvider
+from themis.providers import register_provider
+class FakeMathModelClient(ModelProvider):
+    """A lightweight heuristic provider used for math experiments."""
+    _POINT_PATTERN = re.compile(
+        r"point\s*\(\s*(-?\d+)\s*,\s*(-?\d+)\s*\)", re.IGNORECASE
+    )
+    _ARITHMETIC_PATTERN = re.compile(
+        r"(-?\d+(?:\.\d+)?)\s*([+\-*/])\s*(-?\d+(?:\.\d+)?)"
+    )
+    def __init__(
+        self, *, seed: int | None = None, default_answer: str = "unknown"
+    ) -> None:
+        self._rng = random.Random(seed)
+        self._default_answer = default_answer
+    def generate(
+        self, task: core_entities.GenerationTask
+    ) -> core_entities.GenerationRecord:  # type: ignore[override]
+        prompt_text = task.prompt.text
+        answer, reason = self._solve(prompt_text)
+        expect_boxed = bool(task.metadata.get("template_expect_boxed"))
+        if expect_boxed and "\\boxed{" not in answer:
+            answer = f"\\boxed{{{answer}}}"
+        payload = {
+            "answer": answer,
+            "reasoning": reason,
+            "model": task.model.identifier,
+        }
+        latency = self._rng.randint(8, 18)
+        return core_entities.GenerationRecord(
+            task=task,
+            output=core_entities.ModelOutput(text=json.dumps(payload), raw=payload),
+            error=None,
+            metrics={"latency_ms": latency},
+        )
+    def _solve(self, prompt: str) -> Tuple[str, str]:
+        prompt_lower = prompt.lower()
+        polar = self._solve_polar_coordinates(prompt_lower)
+        if polar is not None:
+            return polar
+        arithmetic = self._solve_arithmetic(prompt_lower)
+        if arithmetic is not None:
+            return arithmetic
+        return self._default_answer, "Unable to derive answer with heuristic solver."
+    def _solve_polar_coordinates(self, prompt_lower: str) -> Tuple[str, str] | None:
+        if "polar" not in prompt_lower:
+            return None
+        match = self._POINT_PATTERN.search(prompt_lower)
+        if not match:
+            return None
+        x = int(match.group(1))
+        y = int(match.group(2))
+        radius_squared = x * x + y * y
+        radius = math.sqrt(radius_squared)
+        if math.isclose(radius, round(radius)):
+            radius_str = str(int(round(radius)))
+        else:
+            radius_str = f"\\sqrt{{{radius_squared}}}"
+        theta = math.atan2(y, x)
+        theta_str = self._format_theta(theta)
+        answer = f"\\left( {radius_str}, {theta_str} \\right)"
+        reasoning = f"Converted rectangular point ({x}, {y}) into polar coordinates."
+        return answer, reasoning
+    def _format_theta(self, theta: float) -> str:
+        tau = 2 * math.pi
+        theta = theta % tau
+        multiples = {
+            0: "0",
+            math.pi / 6: "\\frac{\\pi}{6}",
+            math.pi / 4: "\\frac{\\pi}{4}",
+            math.pi / 3: "\\frac{\\pi}{3}",
+            math.pi / 2: "\\frac{\\pi}{2}",
+            math.pi: "\\pi",
+            3 * math.pi / 2: "\\frac{3\\pi}{2}",
+        }
+        for value, label in multiples.items():
+            if math.isclose(theta, value, abs_tol=1e-6):
+                return label
+        if math.isclose(theta, 5 * math.pi / 6, abs_tol=1e-6):
+            return "\\frac{5\\pi}{6}"
+        if math.isclose(theta, 7 * math.pi / 6, abs_tol=1e-6):
+            return "\\frac{7\\pi}{6}"
+        if math.isclose(theta, 4 * math.pi / 3, abs_tol=1e-6):
+            return "\\frac{4\\pi}{3}"
+        return f"{theta:.3f}"
+    def _solve_arithmetic(self, prompt_lower: str) -> Tuple[str, str] | None:
+        if "what is" not in prompt_lower and "compute" not in prompt_lower:
+            return None
+        match = self._ARITHMETIC_PATTERN.search(prompt_lower)
+        if not match:
+            return None
+        left = float(match.group(1))
+        op = match.group(2)
+        right = float(match.group(3))
+        if op == "+":
+            result = left + right
+        elif op == "-":
+            result = left - right
+        elif op == "*":
+            result = left * right
+        elif op == "/":
+            if right == 0:
+                return "undefined", "Division by zero encountered."
+            result = left / right
+        else:
+            return None
+        if result.is_integer():
+            answer = str(int(result))
+        else:
+            answer = f"{result:.3f}"
+        reasoning = f"Evaluated {left} {op} {right} using arithmetic solver."
+        return answer, reasoning
+    def count_tokens(self, text: str) -> int:
+        return len(text.split())
+# Names exported by this module.
+__all__ = ["FakeMathModelClient"]
+# Import-time side effect: hands the client to register_provider under the
+# name "fake" (registry semantics live in themis.providers).
+register_provider("fake", FakeMathModelClient)

themis/generation/conversation_runner.py ADDED Viewed

@@ -0,0 +1,236 @@
+"""Conversation runner for multi-turn interactions.
+This module provides a runner that executes multi-turn conversations
+using turn strategies to determine the flow of the conversation.
+Examples:
+    from themis.generation import conversation_runner, turn_strategies
+    from themis.core import conversation, entities
+    # Create provider and strategy
+    provider = FakeProvider()
+    strategy = turn_strategies.FixedSequenceTurnStrategy([
+        "What is 2+2?",
+        "What about 3+3?"
+    ])
+    # Create runner
+    runner = conversation_runner.ConversationRunner(
+        provider=provider,
+        turn_strategy=strategy,
+        max_turns=5
+    )
+    # Create conversation task
+    context = conversation.ConversationContext()
+    context.add_message("system", "You are a math tutor.")
+    task = conversation.ConversationTask(
+        context=context,
+        model=model_spec,
+        sampling=sampling_config
+    )
+    # Run conversation
+    record = runner.run_conversation(task)
+    print(f"Conversation had {record.total_turns()} turns")
+"""
+from __future__ import annotations
+import logging
+from typing import Any
+from themis.core import conversation as conv
+from themis.core import entities as core_entities
+from themis.generation import turn_strategies
+from themis.interfaces import ModelProvider
+from themis.utils import tracing
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+class ConversationRunner:
+    """Runner for executing multi-turn conversations.
+    This runner manages the conversation loop, generating responses
+    and determining next turns using a TurnStrategy.
+    Attributes:
+        provider: Model provider for generation
+        turn_strategy: Strategy for determining next turns
+        max_turns: Maximum number of conversation turns
+        prompt_template: Optional template for formatting context
+    """
+    def __init__(
+        self,
+        *,
+        provider: ModelProvider,
+        turn_strategy: turn_strategies.TurnStrategy,
+        max_turns: int = 10,
+        prompt_template: Any | None = None,
+    ):
+        """Initialize conversation runner.
+        Args:
+            provider: Model provider for generation
+            turn_strategy: Strategy for determining next turns
+            max_turns: Maximum number of conversation turns
+            prompt_template: Optional template for formatting context
+        """
+        self._provider = provider
+        self._turn_strategy = turn_strategy
+        self._max_turns = max_turns
+        self._prompt_template = prompt_template
+    def run_conversation(self, task: conv.ConversationTask) -> conv.ConversationRecord:
+        """Execute a multi-turn conversation.
+        Args:
+            task: Conversation task to execute
+        Returns:
+            ConversationRecord with full conversation history
+        """
+        with tracing.span(
+            "run_conversation",
+            model=task.model.identifier,
+            max_turns=task.max_turns,
+        ):
+            turns: list[conv.ConversationTurn] = []
+            context = task.context
+            max_turns = min(task.max_turns, self._max_turns)
+            for turn_num in range(max_turns):
+                with tracing.span("conversation_turn", turn=turn_num):
+                    logger.debug(
+                        "Starting conversation turn %d/%d", turn_num + 1, max_turns
+                    )
+                    # Generate response for current context
+                    with tracing.span("generate_response"):
+                        prompt_text = context.to_prompt(self._prompt_template)
+                        generation_task = self._create_generation_task(
+                            task, prompt_text, turn_num
+                        )
+                        record = self._provider.generate(generation_task)
+                    # Add assistant response to context
+                    if record.output:
+                        context.add_message("assistant", record.output.text)
+                    else:
+                        # Generation failed
+                        logger.warning(
+                            "Generation failed at turn %d: %s",
+                            turn_num,
+                            record.error.message if record.error else "unknown error",
+                        )
+                    # Create turn record (no user message yet)
+                    turn = conv.ConversationTurn(
+                        turn_number=turn_num,
+                        user_message=None,
+                        generation_record=record,
+                        context_snapshot=self._snapshot_context(context),
+                    )
+                    turns.append(turn)
+                    # Check stop conditions
+                    if task.should_stop():
+                        logger.debug("Task stop condition met at turn %d", turn_num)
+                        break
+                    # Determine next turn
+                    with tracing.span("plan_next_turn"):
+                        next_message = self._turn_strategy.next_turn(context, record)
+                    if next_message is None:
+                        logger.debug(
+                            "Turn strategy ended conversation at turn %d", turn_num
+                        )
+                        break
+                    # Add user message for next turn
+                    user_msg = conv.Message(role="user", content=next_message)
+                    context.add_message("user", next_message)
+                    turn.user_message = user_msg
+                    logger.debug(
+                        "Planned next turn: %s",
+                        next_message[:50] + ("..." if len(next_message) > 50 else ""),
+                    )
+            # Create conversation record
+            record = conv.ConversationRecord(
+                task=task,
+                context=context,
+                turns=turns,
+                metadata={
+                    "total_turns": len(turns),
+                    "max_turns_reached": len(turns) >= max_turns,
+                    "stop_condition_met": task.should_stop(),
+                },
+            )
+            logger.info(
+                "Conversation completed: %d turns, stop_reason=%s",
+                len(turns),
+                "max_turns" if record.metadata["max_turns_reached"] else "strategy",
+            )
+            return record
+    def _create_generation_task(
+        self, conv_task: conv.ConversationTask, prompt_text: str, turn_num: int
+    ) -> core_entities.GenerationTask:
+        """Create a generation task from conversation state.
+        Args:
+            conv_task: Conversation task
+            prompt_text: Rendered prompt text
+            turn_num: Current turn number
+        Returns:
+            GenerationTask for this turn
+        """
+        from themis.core.entities import PromptRender, PromptSpec
+        prompt_render = PromptRender(
+            spec=PromptSpec(
+                name=f"conversation_turn_{turn_num}",
+                template="",
+                metadata={"turn": turn_num},
+            ),
+            text=prompt_text,
+            context={"turn": turn_num},
+            metadata={"turn": turn_num},
+        )
+        metadata = dict(conv_task.metadata)
+        metadata["turn"] = turn_num
+        metadata["conversation"] = True
+        return core_entities.GenerationTask(
+            prompt=prompt_render,
+            model=conv_task.model,
+            sampling=conv_task.sampling,
+            metadata=metadata,
+            reference=conv_task.reference,
+        )
+    def _snapshot_context(
+        self, context: conv.ConversationContext
+    ) -> conv.ConversationContext:
+        """Create a snapshot of conversation context.
+        Args:
+            context: Context to snapshot
+        Returns:
+            Copy of context
+        """
+        return conv.ConversationContext.from_dict(context.to_dict())
+# Names exported by this module.
+__all__ = ["ConversationRunner"]

themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl