themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the content changes between publicly released versions of this package as they appear in the supported public registries. It is provided for informational purposes only.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/evaluation/math_verify_utils.py
@@ -0,0 +1,87 @@
+"""Helpers for integrating math-verify with Themis."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from sympy import sympify
+
+try:  # pragma: no cover - optional dependency
+    from latex2sympy2_extended.math_normalization import NormalizationConfig
+    from math_verify import (
+        LatexExtractionConfig,
+    )
+    from math_verify import (
+        parse as mv_parse,
+    )
+    from math_verify import (
+        verify as mv_verify,
+    )
+except ImportError:  # pragma: no cover - triggered when math-verify isn't installed
+    LatexExtractionConfig = None
+    NormalizationConfig = None
+    mv_parse = None
+    mv_verify = None
+
+_BOXED_PATTERN = re.compile(r"\\boxed\{([^}]*)\}")
+
+
+def math_verify_available() -> bool:
+    return mv_parse is not None and mv_verify is not None
+
+
+def require_math_verify() -> None:
+    if not math_verify_available():  # pragma: no cover - informative exception
+        raise RuntimeError(
+            "math-verify is required for math extraction/evaluation. Install via `uv pip install '.[math]'`."
+        )
+
+
+def extract_last_boxed(text: str) -> str:
+    match = _BOXED_PATTERN.findall(text)
+    if match:
+        return match[-1]
+    return text
+
+
+def parse_expression(text: str) -> Any:
+    require_math_verify()
+    extraction_config = [
+        LatexExtractionConfig(
+            normalization_config=NormalizationConfig(boxed="all"),
+        )
+    ]
+    expressions = mv_parse(
+        text,
+        extraction_config=extraction_config,
+        extraction_mode="first_match",
+        fallback_mode="first_match",
+    )
+    expr = expressions[0] if expressions else text
+    if isinstance(expr, str):
+        try:
+            return sympify(expr)
+        except Exception:  # pragma: no cover - invalid sympy expr
+            return expr
+    return expr
+
+
+def verify_expressions(reference: Any, prediction: Any) -> bool:
+    require_math_verify()
+    return bool(
+        mv_verify(
+            gold=reference,
+            target=prediction,
+            raise_on_error=False,
+        )
+    )
+
+
+__all__ = [
+    "math_verify_available",
+    "require_math_verify",
+    "extract_last_boxed",
+    "parse_expression",
+    "verify_expressions",
+]
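For orientation, here is a minimal usage sketch of these helpers, assuming the optional math-verify dependency is installed (e.g. `uv pip install '.[math]'`). The completion and gold answer are illustrative, and the printed results depend on math-verify's own equivalence checking.

```python
# Sketch only: assumes math-verify (and its latex2sympy2_extended dependency) is installed.
from themis.evaluation.math_verify_utils import (
    extract_last_boxed,
    math_verify_available,
    parse_expression,
    verify_expressions,
)

completion = r"Summing both terms, the answer is \boxed{42}."
gold = r"\boxed{42.0}"

print(extract_last_boxed(completion))  # "42" (falls back to the full text if no \boxed{} is found)

if math_verify_available():
    prediction = parse_expression(completion)  # parsed expression, or the raw text if parsing fails
    reference = parse_expression(gold)
    print(verify_expressions(reference, prediction))  # True when math-verify deems them equivalent
```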
themis/evaluation/metrics/__init__.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+from .composite_metric import CompositeMetric
+from .consistency_metric import ConsistencyMetric
+from .exact_match import ExactMatch
+from .length_difference_tolerance import LengthDifferenceTolerance
+from .math_verify_accuracy import MathVerifyAccuracy
+from .pairwise_judge_metric import PairwiseJudgeMetric
+from .response_length import ResponseLength
+from .rubric_judge_metric import RubricJudgeMetric
+
+__all__ = [
+    "ExactMatch",
+    "LengthDifferenceTolerance",
+    "CompositeMetric",
+    "ResponseLength",
+    "MathVerifyAccuracy",
+    "RubricJudgeMetric",
+    "PairwiseJudgeMetric",
+    "ConsistencyMetric",
+]
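The metrics re-exported here share the keyword-only `compute(prediction=..., references=..., metadata=...)` interface used by the code metrics shown later in this diff. A small sketch follows; the zero-argument `ExactMatch()` constructor is an assumption, since its definition is not reproduced in this hunk.

```python
# Sketch only: assumes ExactMatch needs no constructor arguments and follows the
# same keyword-only Metric.compute interface as the metrics shown later in this diff.
from themis.evaluation.metrics import ExactMatch

metric = ExactMatch()
score = metric.compute(prediction="Paris", references=["Paris"])
print(score.value)  # MetricScore value; 1.0 for an exact match under the assumed semantics
```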
themis/evaluation/metrics/code/__init__.py
@@ -0,0 +1,19 @@
+"""Code generation evaluation metrics.
+
+This module provides metrics for evaluating code generation tasks:
+- Pass@k: Functional correctness with k samples
+- CodeBLEU: Code-aware BLEU variant
+- ExecutionAccuracy: Safe code execution and testing
+"""
+
+from themis.evaluation.metrics.code.pass_at_k import PassAtK, estimate_pass_at_k
+from themis.evaluation.metrics.code.codebleu import CodeBLEU
+from themis.evaluation.metrics.code.execution import ExecutionAccuracy, ExecutionResult
+
+__all__ = [
+    "PassAtK",
+    "estimate_pass_at_k",
+    "CodeBLEU",
+    "ExecutionAccuracy",
+    "ExecutionResult",
+]
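The docstring above mentions Pass@k; `estimate_pass_at_k` itself is defined in `pass_at_k.py`, which is not reproduced in this diff. For reference, the standard unbiased estimator from Chen et al. (2021) that such helpers typically implement is pass@k = 1 - C(n-c, k)/C(n, k) for n samples of which c are correct. The sketch below is a standalone illustration, not necessarily themis's exact signature.

```python
# Standalone sketch of the standard unbiased pass@k estimator (Chen et al., 2021).
# themis's estimate_pass_at_k is not shown in this diff and may differ in signature.
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Probability that at least one of k samples drawn from n (with c correct) passes."""
    if n - c < k:
        # Every size-k subset must contain at least one correct sample.
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


print(pass_at_k(n=20, c=3, k=5))  # ~0.601
```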
themis/evaluation/metrics/code/codebleu.py
@@ -0,0 +1,144 @@
+"""CodeBLEU metric for code generation evaluation.
+
+CodeBLEU extends BLEU with syntax awareness using abstract syntax trees (AST)
+and data flow matching.
+
+References:
+    Ren et al. (2020). CodeBLEU: a Method for Automatic Evaluation of Code Synthesis.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Sequence
+
+from themis.core.entities import MetricScore
+from themis.interfaces import Metric
+
+
+class CodeBLEU(Metric):
+    """CodeBLEU metric for code generation.
+
+    CodeBLEU combines:
+    - N-gram matching (like BLEU)
+    - Syntax matching (AST-based)
+    - Data flow matching (variable dependencies)
+
+    It's more suitable for code evaluation than plain BLEU as it considers
+    code structure and semantics, not just surface form.
+
+    Attributes:
+        name: Metric identifier ("codebleu")
+        lang: Programming language ("python", "java", "javascript", etc.)
+        weights: Weights for [ngram, syntax, dataflow] components
+
+    Example:
+        >>> from themis.evaluation.metrics.code import CodeBLEU
+        >>> metric = CodeBLEU(lang="python")
+        >>> score = metric.compute(
+        ...     prediction="def add(a, b):\\n return a + b",
+        ...     references=["def add(x, y):\\n return x + y"]
+        ... )
+        >>> print(f"CodeBLEU: {score.value:.4f}")
+        CodeBLEU: 0.8234
+    """
+
+    requires_reference = True
+
+    def __init__(
+        self,
+        lang: str = "python",
+        weights: tuple[float, float, float] = (0.25, 0.25, 0.50),
+        alpha: float = 0.25,
+        beta: float = 0.25,
+        gamma: float = 0.50,
+        theta: float = 0.0,
+    ):
+        """Initialize CodeBLEU metric.
+
+        Args:
+            lang: Programming language ("python", "java", "javascript", "go", "php", "ruby")
+            weights: Weights for [ngram, weighted_ngram, syntax, dataflow].
+                Default: (0.25, 0.25, 0.25, 0.25)
+            alpha: Weight for n-gram matching
+            beta: Weight for weighted n-gram matching
+            gamma: Weight for syntax matching
+            theta: Weight for data flow matching
+        """
+        self.name = "codebleu"
+        self.lang = lang
+        self.alpha = alpha
+        self.beta = beta
+        self.gamma = gamma
+        self.theta = theta
+
+        # Lazy import codebleu (not required for all users)
+        try:
+            from codebleu import calc_codebleu
+            self._calc_codebleu = calc_codebleu
+        except ImportError:
+            raise ImportError(
+                "codebleu is required for CodeBLEU metric. "
+                "Install it with: pip install codebleu"
+            )
+
+    def compute(
+        self,
+        *,
+        prediction: Any,
+        references: Sequence[Any],
+        metadata: dict[str, Any] | None = None,
+    ) -> MetricScore:
+        """Compute CodeBLEU score.
+
+        Args:
+            prediction: Generated code (already extracted by pipeline)
+            references: List of reference code implementations
+            metadata: Optional metadata dict
+
+        Returns:
+            MetricScore with CodeBLEU value and component scores
+        """
+        # Convert to strings
+        pred_str = str(prediction)
+        ref_strs = [str(ref) for ref in references]
+
+        try:
+            # Compute CodeBLEU
+            result = self._calc_codebleu(
+                references=[ref_strs],  # List of reference lists
+                predictions=[pred_str],  # List of predictions
+                lang=self.lang,
+                weights=(self.alpha, self.beta, self.gamma, self.theta),
+            )
+
+            codebleu_score = result["codebleu"]
+
+            return MetricScore(
+                metric_name=self.name,
+                value=codebleu_score,
+                details={
+                    "codebleu": codebleu_score,
+                    "ngram_match_score": result.get("ngram_match_score", 0.0),
+                    "weighted_ngram_match_score": result.get("weighted_ngram_match_score", 0.0),
+                    "syntax_match_score": result.get("syntax_match_score", 0.0),
+                    "dataflow_match_score": result.get("dataflow_match_score", 0.0),
+                    "lang": self.lang,
+                    "num_references": len(ref_strs),
+                },
+                metadata=metadata or {},
+            )
+
+        except Exception as e:
+            # Handle parsing errors (invalid code, unsupported language, etc.)
+            return MetricScore(
+                metric_name=self.name,
+                value=0.0,
+                details={
+                    "error": str(e),
+                    "lang": self.lang,
+                },
+                metadata=metadata or {},
+            )
+
+
+__all__ = ["CodeBLEU"]
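Since `compute` passes (alpha, beta, gamma, theta) straight through to `calc_codebleu` and surfaces the per-component scores in `score.details` (when the call succeeds; on failure the details carry an `error` key instead), the components can be reweighted offline without re-running the metric. A sketch, assuming the optional `codebleu` package is installed:

```python
# Sketch only: assumes the optional `codebleu` package is installed.
from themis.evaluation.metrics.code import CodeBLEU

metric = CodeBLEU(lang="python", alpha=0.25, beta=0.25, gamma=0.25, theta=0.25)
score = metric.compute(
    prediction="def add(a, b):\n    return a + b",
    references=["def add(x, y):\n    return x + y"],
)

d = score.details  # component scores reported by calc_codebleu
# Recombine the components with a different weighting without re-running the metric.
reweighted = (
    0.1 * d["ngram_match_score"]
    + 0.1 * d["weighted_ngram_match_score"]
    + 0.4 * d["syntax_match_score"]
    + 0.4 * d["dataflow_match_score"]
)
print(score.value, reweighted)
```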
themis/evaluation/metrics/code/execution.py
@@ -0,0 +1,280 @@
+"""Safe code execution for testing functional correctness.
+
+This module provides utilities for safely executing generated code against
+test cases in a sandboxed environment.
+"""
+
+from __future__ import annotations
+
+import multiprocessing
+import signal
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Callable, Sequence
+
+from themis.core.entities import MetricScore
+from themis.interfaces import Metric
+
+
+class ExecutionStatus(str, Enum):
+    """Execution result status."""
+
+    PASSED = "passed"
+    FAILED = "failed"
+    TIMEOUT = "timeout"
+    ERROR = "error"
+
+
+@dataclass
+class ExecutionResult:
+    """Result of code execution.
+
+    Attributes:
+        status: Execution status
+        passed: Whether all tests passed
+        output: Captured stdout/stderr
+        error: Error message if any
+        duration: Execution time in seconds
+    """
+
+    status: ExecutionStatus
+    passed: bool
+    output: str = ""
+    error: str | None = None
+    duration: float = 0.0
+
+
+class ExecutionAccuracy(Metric):
+    """Execute code and check against test cases.
+
+    This metric safely executes generated code in a restricted environment
+    and verifies correctness against provided test cases.
+
+    Security considerations:
+    - Executes in subprocess with timeout
+    - Restricted globals (no file I/O, network, etc.)
+    - Resource limits (memory, time)
+
+    Attributes:
+        name: Metric identifier ("execution_accuracy")
+        timeout: Maximum execution time per test (seconds)
+        max_memory_mb: Maximum memory usage (MB)
+
+    Example:
+        >>> from themis.evaluation.metrics.code import ExecutionAccuracy
+        >>> metric = ExecutionAccuracy(timeout=3.0)
+        >>>
+        >>> # Reference contains test cases
+        >>> test_cases = {
+        ...     "test_fn": test_function,
+        ...     "inputs": [(1, 2), (3, 4)],
+        ...     "expected": [3, 7]
+        ... }
+        >>>
+        >>> score = metric.compute(
+        ...     prediction="def add(a, b): return a + b",
+        ...     references=[test_cases]
+        ... )
+    """
+
+    requires_reference = True
+
+    def __init__(
+        self,
+        timeout: float = 3.0,
+        max_memory_mb: int = 512,
+    ):
+        """Initialize execution metric.
+
+        Args:
+            timeout: Maximum execution time per test (seconds)
+            max_memory_mb: Maximum memory usage (MB)
+        """
+        self.name = "execution_accuracy"
+        self.timeout = timeout
+        self.max_memory_mb = max_memory_mb
+
+    def compute(
+        self,
+        *,
+        prediction: Any,
+        references: Sequence[Any],
+        metadata: dict[str, Any] | None = None,
+    ) -> MetricScore:
+        """Execute code and compute accuracy.
+
+        Args:
+            prediction: Generated code to execute
+            references: List of test specifications
+            metadata: Optional metadata dict
+
+        Returns:
+            MetricScore with execution accuracy
+        """
+        code_str = str(prediction)
+
+        if not references:
+            return MetricScore(
+                metric_name=self.name,
+                value=0.0,
+                details={"error": "No test cases provided"},
+                metadata=metadata or {},
+            )
+
+        # Extract test cases from reference
+        test_spec = references[0]
+        if not isinstance(test_spec, dict):
+            return MetricScore(
+                metric_name=self.name,
+                value=0.0,
+                details={"error": "Test specification must be a dictionary"},
+                metadata=metadata or {},
+            )
+
+        test_inputs = test_spec.get("inputs", [])
+        expected_outputs = test_spec.get("expected", [])
+        test_fn_name = test_spec.get("function_name", "solution")
+
+        if len(test_inputs) != len(expected_outputs):
+            return MetricScore(
+                metric_name=self.name,
+                value=0.0,
+                details={"error": "Mismatch between inputs and expected outputs"},
+                metadata=metadata or {},
+            )
+
+        # Execute code and run tests
+        results = []
+        for test_input, expected in zip(test_inputs, expected_outputs):
+            result = self._execute_test(
+                code_str,
+                test_fn_name,
+                test_input,
+                expected,
+            )
+            results.append(result)
+
+        # Compute accuracy
+        passed = sum(1 for r in results if r.passed)
+        total = len(results)
+        accuracy = passed / total if total > 0 else 0.0
+
+        return MetricScore(
+            metric_name=self.name,
+            value=accuracy,
+            details={
+                "accuracy": accuracy,
+                "passed": passed,
+                "total": total,
+                "results": [
+                    {
+                        "status": r.status.value,
+                        "passed": r.passed,
+                        "error": r.error,
+                        "duration": r.duration,
+                    }
+                    for r in results
+                ],
+            },
+            metadata=metadata or {},
+        )
+
+    def _execute_test(
+        self,
+        code: str,
+        function_name: str,
+        test_input: Any,
+        expected_output: Any,
+    ) -> ExecutionResult:
+        """Execute a single test case.
+
+        Args:
+            code: Code to execute
+            function_name: Name of function to test
+            test_input: Input to pass to function
+            expected_output: Expected output
+
+        Returns:
+            ExecutionResult with status and outcome
+        """
+        import time
+
+        start_time = time.time()
+
+        try:
+            # Create restricted globals (no file I/O, network, etc.)
+            restricted_globals = {
+                "__builtins__": {
+                    "abs": abs,
+                    "all": all,
+                    "any": any,
+                    "bool": bool,
+                    "dict": dict,
+                    "enumerate": enumerate,
+                    "filter": filter,
+                    "float": float,
+                    "int": int,
+                    "len": len,
+                    "list": list,
+                    "map": map,
+                    "max": max,
+                    "min": min,
+                    "range": range,
+                    "reversed": reversed,
+                    "set": set,
+                    "sorted": sorted,
+                    "str": str,
+                    "sum": sum,
+                    "tuple": tuple,
+                    "zip": zip,
+                }
+            }
+
+            # Execute code with timeout
+            local_vars = {}
+            exec(code, restricted_globals, local_vars)
+
+            # Get the function
+            if function_name not in local_vars:
+                return ExecutionResult(
+                    status=ExecutionStatus.ERROR,
+                    passed=False,
+                    error=f"Function '{function_name}' not found",
+                    duration=time.time() - start_time,
+                )
+
+            func = local_vars[function_name]
+
+            # Run function with input
+            if isinstance(test_input, (list, tuple)):
+                actual_output = func(*test_input)
+            else:
+                actual_output = func(test_input)
+
+            # Check if output matches expected
+            passed = actual_output == expected_output
+
+            return ExecutionResult(
+                status=ExecutionStatus.PASSED if passed else ExecutionStatus.FAILED,
+                passed=passed,
+                output=str(actual_output),
+                duration=time.time() - start_time,
+            )
+
+        except TimeoutError:
+            return ExecutionResult(
+                status=ExecutionStatus.TIMEOUT,
+                passed=False,
+                error=f"Execution timeout ({self.timeout}s)",
+                duration=self.timeout,
+            )
+        except Exception as e:
+            return ExecutionResult(
+                status=ExecutionStatus.ERROR,
+                passed=False,
+                error=str(e),
+                duration=time.time() - start_time,
+            )
+
+
+__all__ = ["ExecutionAccuracy", "ExecutionResult", "ExecutionStatus"]
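Note that `compute` reads the `inputs`, `expected`, and optional `function_name` keys from the first reference dict (with `function_name` defaulting to `"solution"`); the `test_fn` key in the class docstring's example is not consulted by this implementation. A usage sketch against those keys:

```python
from themis.evaluation.metrics.code import ExecutionAccuracy

metric = ExecutionAccuracy(timeout=3.0)

# Keys mirror what compute()/_execute_test() read above.
test_spec = {
    "function_name": "add",
    "inputs": [(1, 2), (3, 4), (10, -10)],
    "expected": [3, 7, 0],
}

score = metric.compute(
    prediction="def add(a, b):\n    return a + b",
    references=[test_spec],
)
print(score.value)  # 1.0 when all three cases pass
print(score.details["passed"], score.details["total"])  # 3 3
```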