PyPI - themis-eval - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

themis/cli/__init__.py +5 -0
themis/cli/__main__.py +6 -0
themis/cli/commands/__init__.py +19 -0
themis/cli/commands/benchmarks.py +221 -0
themis/cli/commands/comparison.py +394 -0
themis/cli/commands/config_commands.py +244 -0
themis/cli/commands/cost.py +214 -0
themis/cli/commands/demo.py +68 -0
themis/cli/commands/info.py +90 -0
themis/cli/commands/leaderboard.py +362 -0
themis/cli/commands/math_benchmarks.py +318 -0
themis/cli/commands/mcq_benchmarks.py +207 -0
themis/cli/commands/sample_run.py +244 -0
themis/cli/commands/visualize.py +299 -0
themis/cli/main.py +93 -0
themis/cli/new_project.py +33 -0
themis/cli/utils.py +51 -0
themis/config/__init__.py +19 -0
themis/config/loader.py +27 -0
themis/config/registry.py +34 -0
themis/config/runtime.py +214 -0
themis/config/schema.py +112 -0
themis/core/__init__.py +5 -0
themis/core/conversation.py +354 -0
themis/core/entities.py +164 -0
themis/core/serialization.py +231 -0
themis/core/tools.py +393 -0
themis/core/types.py +141 -0
themis/datasets/__init__.py +273 -0
themis/datasets/base.py +264 -0
themis/datasets/commonsense_qa.py +174 -0
themis/datasets/competition_math.py +265 -0
themis/datasets/coqa.py +133 -0
themis/datasets/gpqa.py +190 -0
themis/datasets/gsm8k.py +123 -0
themis/datasets/gsm_symbolic.py +124 -0
themis/datasets/math500.py +122 -0
themis/datasets/med_qa.py +179 -0
themis/datasets/medmcqa.py +169 -0
themis/datasets/mmlu_pro.py +262 -0
themis/datasets/piqa.py +146 -0
themis/datasets/registry.py +201 -0
themis/datasets/schema.py +245 -0
themis/datasets/sciq.py +150 -0
themis/datasets/social_i_qa.py +151 -0
themis/datasets/super_gpqa.py +263 -0
themis/evaluation/__init__.py +1 -0
themis/evaluation/conditional.py +410 -0
themis/evaluation/extractors/__init__.py +19 -0
themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
themis/evaluation/extractors/exceptions.py +7 -0
themis/evaluation/extractors/identity_extractor.py +29 -0
themis/evaluation/extractors/json_field_extractor.py +45 -0
themis/evaluation/extractors/math_verify_extractor.py +37 -0
themis/evaluation/extractors/regex_extractor.py +43 -0
themis/evaluation/math_verify_utils.py +87 -0
themis/evaluation/metrics/__init__.py +21 -0
themis/evaluation/metrics/composite_metric.py +47 -0
themis/evaluation/metrics/consistency_metric.py +80 -0
themis/evaluation/metrics/exact_match.py +51 -0
themis/evaluation/metrics/length_difference_tolerance.py +33 -0
themis/evaluation/metrics/math_verify_accuracy.py +40 -0
themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
themis/evaluation/metrics/response_length.py +33 -0
themis/evaluation/metrics/rubric_judge_metric.py +134 -0
themis/evaluation/pipeline.py +49 -0
themis/evaluation/pipelines/__init__.py +15 -0
themis/evaluation/pipelines/composable_pipeline.py +357 -0
themis/evaluation/pipelines/standard_pipeline.py +288 -0
themis/evaluation/reports.py +293 -0
themis/evaluation/statistics/__init__.py +53 -0
themis/evaluation/statistics/bootstrap.py +79 -0
themis/evaluation/statistics/confidence_intervals.py +121 -0
themis/evaluation/statistics/distributions.py +207 -0
themis/evaluation/statistics/effect_sizes.py +124 -0
themis/evaluation/statistics/hypothesis_tests.py +305 -0
themis/evaluation/statistics/types.py +139 -0
themis/evaluation/strategies/__init__.py +13 -0
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
themis/evaluation/strategies/evaluation_strategy.py +24 -0
themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
themis/experiment/__init__.py +5 -0
themis/experiment/builder.py +151 -0
themis/experiment/cache_manager.py +129 -0
themis/experiment/comparison.py +631 -0
themis/experiment/cost.py +310 -0
themis/experiment/definitions.py +62 -0
themis/experiment/export.py +690 -0
themis/experiment/export_csv.py +159 -0
themis/experiment/integration_manager.py +104 -0
themis/experiment/math.py +192 -0
themis/experiment/mcq.py +169 -0
themis/experiment/orchestrator.py +373 -0
themis/experiment/pricing.py +317 -0
themis/experiment/storage.py +255 -0
themis/experiment/visualization.py +588 -0
themis/generation/__init__.py +1 -0
themis/generation/agentic_runner.py +420 -0
themis/generation/batching.py +254 -0
themis/generation/clients.py +143 -0
themis/generation/conversation_runner.py +236 -0
themis/generation/plan.py +456 -0
themis/generation/providers/litellm_provider.py +221 -0
themis/generation/providers/vllm_provider.py +135 -0
themis/generation/router.py +34 -0
themis/generation/runner.py +207 -0
themis/generation/strategies.py +98 -0
themis/generation/templates.py +71 -0
themis/generation/turn_strategies.py +393 -0
themis/generation/types.py +9 -0
themis/integrations/__init__.py +0 -0
themis/integrations/huggingface.py +61 -0
themis/integrations/wandb.py +65 -0
themis/interfaces/__init__.py +83 -0
themis/project/__init__.py +20 -0
themis/project/definitions.py +98 -0
themis/project/patterns.py +230 -0
themis/providers/__init__.py +5 -0
themis/providers/registry.py +39 -0
themis/utils/api_generator.py +379 -0
themis/utils/cost_tracking.py +376 -0
themis/utils/dashboard.py +452 -0
themis/utils/logging_utils.py +41 -0
themis/utils/progress.py +58 -0
themis/utils/tracing.py +320 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
themis_eval-0.1.1.dist-info/RECORD +134 -0
themis_eval-0.1.0.dist-info/RECORD +0 -8
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0

themis/core/tools.py ADDED Viewed

@@ -0,0 +1,393 @@
+"""Tool use primitives for agentic workflows.
+This module provides abstractions for defining and executing tools
+(functions) that models can call during generation. This enables
+agentic workflows, function calling, and tool-augmented generation.
+Examples:
+    # Define a tool
+    def calculator(operation: str, a: float, b: float) -> float:
+        if operation == "add":
+            return a + b
+        elif operation == "multiply":
+            return a * b
+        raise ValueError(f"Unknown operation: {operation}")
+    tool = ToolDefinition(
+        name="calculator",
+        description="Perform arithmetic operations",
+        parameters={
+            "type": "object",
+            "properties": {
+                "operation": {"type": "string", "enum": ["add", "multiply"]},
+                "a": {"type": "number"},
+                "b": {"type": "number"},
+            },
+            "required": ["operation", "a", "b"],
+        },
+        handler=calculator
+    )
+    # Register tool
+    registry = ToolRegistry()
+    registry.register(tool)
+    # Execute tool
+    call = ToolCall(tool_name="calculator", arguments={"operation": "add", "a": 2, "b": 3})
+    result = registry.execute(call)
+    print(result.result)  # 5.0
+"""
+from __future__ import annotations
+import time
+import uuid
+from dataclasses import dataclass, field
+from typing import Any, Callable
+@dataclass
+class ToolDefinition:
+    """Defines a tool/function available to the model.
+    Attributes:
+        name: Tool name (should be unique)
+        description: Human-readable description of what tool does
+        parameters: JSON Schema describing parameters
+        handler: Function to execute when tool is called; it is invoked
+            with the call's arguments as a single dictionary
+        metadata: Additional metadata
+    """
+    name: str
+    description: str
+    parameters: dict[str, Any]
+    handler: Callable[[dict[str, Any]], Any]
+    metadata: dict[str, Any] = field(default_factory=dict)
+    def to_dict(self) -> dict[str, Any]:
+        """Convert tool definition to dictionary (without handler).
+        The ``parameters`` and ``metadata`` values are returned by
+        reference, not copied.
+        Returns:
+            Dictionary representation suitable for JSON serialization
+        """
+        return {
+            "name": self.name,
+            "description": self.description,
+            "parameters": self.parameters,
+            "metadata": self.metadata,
+        }
+    def validate_arguments(self, arguments: dict[str, Any]) -> list[str]:
+        """Validate arguments against parameter schema.
+        Only the ``required`` and ``properties`` keys of the schema are
+        consulted: every required name must be present and, when
+        ``properties`` is declared, no other names may appear. Value types
+        and ``enum`` constraints are not checked.
+        Args:
+            arguments: Arguments to validate
+        Returns:
+            List of validation error messages (empty if valid)
+        """
+        from collections.abc import Mapping
+        # A malformed call (for example ``arguments=None``) is reported as a
+        # validation error instead of raising TypeError from the ``in``
+        # checks below.
+        if not isinstance(arguments, Mapping):
+            return [
+                f"Arguments must be a mapping, got {type(arguments).__name__}"
+            ]
+        errors = []
+        # Simple validation - check required fields
+        if "required" in self.parameters:
+            errors.extend(
+                f"Missing required field: {name}"
+                for name in self.parameters["required"]
+                if name not in arguments
+            )
+        # Check for unknown fields
+        if "properties" in self.parameters:
+            known_fields = set(self.parameters["properties"])
+            errors.extend(
+                f"Unknown field: {name}"
+                for name in arguments
+                if name not in known_fields
+            )
+        return errors
+def _new_call_id() -> str:
+    """Return a fresh random UUID4 rendered as a string."""
+    return str(uuid.uuid4())
+@dataclass
+class ToolCall:
+    """A single request to run a named tool.
+    Attributes:
+        tool_name: Name of the tool that should be executed
+        arguments: Arguments handed to the tool
+        call_id: Identifier for this call; a random UUID4 string is
+            generated when none is supplied
+    """
+    tool_name: str
+    arguments: dict[str, Any]
+    call_id: str = field(default_factory=_new_call_id)
+    def to_dict(self) -> dict[str, Any]:
+        """Return the call as a plain dictionary.
+        The ``arguments`` value is the same object held by the call, not
+        a copy.
+        Returns:
+            Dictionary with ``tool_name``, ``arguments`` and ``call_id``
+        """
+        return dict(
+            tool_name=self.tool_name,
+            arguments=self.arguments,
+            call_id=self.call_id,
+        )
+@dataclass
+class ToolResult:
+    """Outcome of running one tool call.
+    Attributes:
+        call: The tool call that produced this result
+        result: Value returned by the tool (when it succeeded)
+        error: Error message (when it failed), otherwise None
+        execution_time_ms: Execution time in milliseconds
+        metadata: Additional metadata
+    """
+    call: ToolCall
+    result: Any | None
+    error: str | None
+    execution_time_ms: float
+    metadata: dict[str, Any] = field(default_factory=dict)
+    def is_success(self) -> bool:
+        """Report whether the call finished without an error.
+        Returns:
+            True when ``error`` is None
+        """
+        succeeded = self.error is None
+        return succeeded
+    def to_dict(self) -> dict[str, Any]:
+        """Return the result as a plain dictionary.
+        The originating call is serialized through its own ``to_dict``.
+        Returns:
+            Dictionary representation
+        """
+        payload: dict[str, Any] = {"call": self.call.to_dict()}
+        payload["result"] = self.result
+        payload["error"] = self.error
+        payload["execution_time_ms"] = self.execution_time_ms
+        payload["metadata"] = self.metadata
+        return payload
+class ToolRegistry:
+    """Name-keyed collection of tools that can also run them.
+    Tools are stored by their ``name``; the registry offers registration,
+    lookup, listing, and execution of tool calls.
+    Examples:
+        registry = ToolRegistry()
+        # Register tools
+        registry.register(calculator_tool)
+        registry.register(search_tool)
+        # Execute tool
+        call = ToolCall(tool_name="calculator", arguments={...})
+        result = registry.execute(call)
+    """
+    def __init__(self):
+        """Create a registry with no tools."""
+        self._tools: dict[str, ToolDefinition] = {}
+    def register(self, tool: ToolDefinition) -> None:
+        """Add a tool to the registry.
+        Args:
+            tool: Tool definition to add
+        Raises:
+            ValueError: If a tool with the same name is already present
+        """
+        name = tool.name
+        if name in self._tools:
+            raise ValueError(f"Tool '{name}' already registered")
+        self._tools[name] = tool
+    def unregister(self, name: str) -> None:
+        """Remove a tool by name; unknown names are ignored.
+        Args:
+            name: Name of the tool to remove
+        """
+        if name in self._tools:
+            del self._tools[name]
+    def get(self, name: str) -> ToolDefinition | None:
+        """Look up a tool by name.
+        Args:
+            name: Tool name
+        Returns:
+            The matching ToolDefinition, or None when there is none
+        """
+        try:
+            return self._tools[name]
+        except KeyError:
+            return None
+    def list_tools(self) -> list[ToolDefinition]:
+        """Return every registered tool.
+        Returns:
+            New list of tool definitions in registration order
+        """
+        return [*self._tools.values()]
+    def execute(self, call: ToolCall) -> ToolResult:
+        """Run a tool call and capture its outcome.
+        Unknown tools and invalid arguments produce an error result with a
+        zero execution time. Exceptions raised by the handler are caught
+        and reported as an error result rather than propagated.
+        Args:
+            call: Tool call to run
+        Returns:
+            ToolResult holding either the handler's value or an error
+        """
+        def failure(message: str, elapsed_ms: float = 0.0) -> ToolResult:
+            return ToolResult(
+                call=call,
+                result=None,
+                error=message,
+                execution_time_ms=elapsed_ms,
+            )
+        definition = self._tools.get(call.tool_name)
+        if definition is None:
+            return failure(f"Unknown tool: {call.tool_name}")
+        # Reject the call before running the handler if the arguments
+        # do not satisfy the tool's schema.
+        problems = definition.validate_arguments(call.arguments)
+        if problems:
+            return failure(f"Invalid arguments: {'; '.join(problems)}")
+        started = time.perf_counter()
+        try:
+            value = definition.handler(call.arguments)
+        except Exception as exc:
+            duration = (time.perf_counter() - started) * 1000
+            return failure(f"{exc.__class__.__name__}: {str(exc)}", duration)
+        duration = (time.perf_counter() - started) * 1000
+        return ToolResult(
+            call=call,
+            result=value,
+            error=None,
+            execution_time_ms=duration,
+        )
+    def to_dict_list(self) -> list[dict[str, Any]]:
+        """Serialize every tool for sending to a model.
+        Returns:
+            List with one ``to_dict()`` dictionary per registered tool
+        """
+        return [definition.to_dict() for definition in self._tools.values()]
+# Built-in tools for common use cases
+def create_calculator_tool() -> ToolDefinition:
+    """Build the built-in four-function calculator tool.
+    The handler converts ``a`` and ``b`` to float before dispatching on
+    ``operation``; it raises ValueError for an unrecognized operation or
+    for division by zero.
+    Returns:
+        ToolDefinition for calculator
+    """
+    def _divide(x: float, y: float) -> float:
+        if y == 0:
+            raise ValueError("Division by zero")
+        return x / y
+    # Ordered (name, implementation) pairs, matched with == in the handler.
+    operations = (
+        ("add", lambda x, y: x + y),
+        ("subtract", lambda x, y: x - y),
+        ("multiply", lambda x, y: x * y),
+        ("divide", _divide),
+    )
+    def handler(args: dict[str, Any]) -> float:
+        operation = args["operation"]
+        # Operands are converted before the operation name is checked.
+        a, b = float(args["a"]), float(args["b"])
+        for name, apply in operations:
+            if operation == name:
+                return apply(a, b)
+        raise ValueError(f"Unknown operation: {operation}")
+    schema = {
+        "type": "object",
+        "properties": {
+            "operation": {
+                "type": "string",
+                "enum": ["add", "subtract", "multiply", "divide"],
+                "description": "The arithmetic operation to perform",
+            },
+            "a": {"type": "number", "description": "First number"},
+            "b": {"type": "number", "description": "Second number"},
+        },
+        "required": ["operation", "a", "b"],
+    }
+    return ToolDefinition(
+        name="calculator",
+        description="Perform basic arithmetic operations (add, subtract, multiply, divide)",
+        parameters=schema,
+        handler=handler,
+    )
+def create_counter_tool() -> ToolDefinition:
+    """Build a counter tool that keeps state between calls (for testing).
+    Each call to this factory creates an independent counter starting at 0;
+    the handler returns the counter's value after applying the action.
+    Returns:
+        ToolDefinition for counter
+    """
+    current = 0
+    def handler(args: dict[str, Any]) -> int:
+        nonlocal current
+        action = args["action"]
+        if action == "increment":
+            current += 1
+        elif action == "decrement":
+            current -= 1
+        elif action == "reset":
+            current = 0
+        elif not (action == "get"):
+            # "get" changes nothing; anything else is rejected.
+            raise ValueError(f"Unknown action: {action}")
+        return current
+    schema = {
+        "type": "object",
+        "properties": {
+            "action": {
+                "type": "string",
+                "enum": ["increment", "decrement", "reset", "get"],
+                "description": "Action to perform on counter",
+            },
+        },
+        "required": ["action"],
+    }
+    return ToolDefinition(
+        name="counter",
+        description="Simple counter that can be incremented, decremented, or reset",
+        parameters=schema,
+        handler=handler,
+    )
+# Names exported by a star-import of this module: the four tool classes
+# and the two built-in tool factories.
+__all__ = [
+    "ToolDefinition",
+    "ToolCall",
+    "ToolResult",
+    "ToolRegistry",
+    "create_calculator_tool",
+    "create_counter_tool",
+]

themis/core/types.py ADDED Viewed

@@ -0,0 +1,141 @@
+"""Common type definitions and generic types for Themis.
+This module provides improved type safety through generic types and protocols.
+All types are designed to be backward compatible with existing code.
+"""
+from __future__ import annotations
+from typing import Any, Protocol, Sequence, TypeVar, runtime_checkable
+from themis.core import entities
+# Type variables for generic types
+# T parameterizes TypedMetric (type of the prediction and of each reference).
+T = TypeVar("T")  # Generic type for predictions/references
+# T_co parameterizes TypedExtractor; it is only used as a return type.
+T_co = TypeVar("T_co", covariant=True)  # Covariant type for outputs
+@runtime_checkable
+class TypedExtractor(Protocol[T_co]):
+    """Protocol for extractors with typed output.
+    This is a backward-compatible extension of the Extractor protocol that
+    provides type information about the extraction output.
+    Any object with a compatible ``extract`` method satisfies the protocol
+    structurally; inheriting from it is not required. Because the class is
+    ``runtime_checkable``, ``isinstance`` checks are allowed, but they only
+    test that an ``extract`` attribute exists, not its signature or return
+    type.
+    """
+    def extract(self, raw_output: str) -> T_co:
+        """Extract structured data from raw output.
+        Args:
+            raw_output: Raw text output from model
+        Returns:
+            Extracted value of type T_co
+        Raises:
+            FieldExtractionError: If extraction fails
+        """
+        # Protocol stub: implementations supply the body.
+        ...
+@runtime_checkable
+class TypedMetric(Protocol[T]):
+    """Protocol for metrics with typed predictions.
+    This is a backward-compatible extension of the Metric interface that
+    provides type information about expected prediction types.
+    Implementations must expose a ``name`` attribute and a ``compute``
+    method. Because the class is ``runtime_checkable``, ``isinstance``
+    checks are allowed, but they only test that these members exist, not
+    their types or signatures.
+    """
+    # Identifier of the metric.
+    name: str
+    def compute(
+        self,
+        *,
+        prediction: T,
+        references: Sequence[T],
+        metadata: dict[str, Any] | None = None,
+    ) -> entities.MetricScore:
+        """Compute metric score.
+        All parameters are keyword-only.
+        Args:
+            prediction: Model prediction of type T
+            references: Reference answers of type T
+            metadata: Optional metadata
+        Returns:
+            MetricScore with computed value
+        """
+        # Protocol stub: implementations supply the body.
+        ...
+# Descriptively named type variables for better readability. Note that these
+# are unconstrained TypeVars, not type aliases; they are not used elsewhere
+# in this module.
+PredictionType = TypeVar("PredictionType")
+ReferenceType = TypeVar("ReferenceType")
+ExtractionType = TypeVar("ExtractionType")
+class ValidationError(ValueError):
+    """Raised when runtime type validation fails."""
+def _describe_type(expected_type: Any) -> str:
+    """Return a readable name for anything ``isinstance`` accepts as a type.
+    Plain classes yield their ``__name__``; tuples of types are joined with
+    " | "; objects without a ``__name__`` fall back to ``repr``.
+    """
+    if isinstance(expected_type, tuple):
+        return " | ".join(_describe_type(member) for member in expected_type)
+    return getattr(expected_type, "__name__", None) or repr(expected_type)
+def validate_type(value: Any, expected_type: type[T], field_name: str = "value") -> T:
+    """Validate value against expected type at runtime.
+    The check is a plain ``isinstance`` test, so subclasses of the expected
+    type are accepted.
+    Args:
+        value: Value to validate
+        expected_type: Expected type (a class, or anything else accepted as
+            the second argument of ``isinstance``, such as a tuple of classes)
+        field_name: Name of field for error messages
+    Returns:
+        The same value, unchanged
+    Raises:
+        ValidationError: If type validation fails
+    """
+    if not isinstance(value, expected_type):
+        # _describe_type keeps the message building from raising
+        # AttributeError when expected_type has no __name__ (e.g. a tuple).
+        raise ValidationError(
+            f"{field_name} expected type {_describe_type(expected_type)}, "
+            f"got {type(value).__name__}"
+        )
+    return value
+def validate_sequence_type(
+    values: Sequence[Any], expected_type: type[T], field_name: str = "values"
+) -> Sequence[T]:
+    """Validate all values in sequence against expected type.
+    Validation stops at the first offending element, whose index is
+    included in the error message.
+    Args:
+        values: Sequence to validate
+        expected_type: Expected type for elements (a class, or anything else
+            accepted as the second argument of ``isinstance``)
+        field_name: Name of field for error messages
+    Returns:
+        The same sequence object, unchanged
+    Raises:
+        ValidationError: If any element fails validation
+    """
+    for i, value in enumerate(values):
+        if not isinstance(value, expected_type):
+            raise ValidationError(
+                f"{field_name}[{i}] expected type {_describe_type(expected_type)}, "
+                f"got {type(value).__name__}"
+            )
+    return values
+# Names exported by a star-import of this module: the type variables, the
+# two protocols, and the runtime validation helpers.
+__all__ = [
+    "T",
+    "T_co",
+    "TypedExtractor",
+    "TypedMetric",
+    "PredictionType",
+    "ReferenceType",
+    "ExtractionType",
+    "ValidationError",
+    "validate_type",
+    "validate_sequence_type",
+]

themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl