PyPI - themis-eval - Versions diffs - 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (158) hide show

themis/__init__.py +12 -1
themis/_version.py +2 -2
themis/api.py +343 -0
themis/backends/__init__.py +17 -0
themis/backends/execution.py +197 -0
themis/backends/storage.py +260 -0
themis/cli/__init__.py +5 -0
themis/cli/__main__.py +6 -0
themis/cli/commands/__init__.py +19 -0
themis/cli/commands/benchmarks.py +221 -0
themis/cli/commands/comparison.py +394 -0
themis/cli/commands/config_commands.py +244 -0
themis/cli/commands/cost.py +214 -0
themis/cli/commands/demo.py +68 -0
themis/cli/commands/info.py +90 -0
themis/cli/commands/leaderboard.py +362 -0
themis/cli/commands/math_benchmarks.py +318 -0
themis/cli/commands/mcq_benchmarks.py +207 -0
themis/cli/commands/results.py +252 -0
themis/cli/commands/sample_run.py +244 -0
themis/cli/commands/visualize.py +299 -0
themis/cli/main.py +463 -0
themis/cli/new_project.py +33 -0
themis/cli/utils.py +51 -0
themis/comparison/__init__.py +25 -0
themis/comparison/engine.py +348 -0
themis/comparison/reports.py +283 -0
themis/comparison/statistics.py +402 -0
themis/config/__init__.py +19 -0
themis/config/loader.py +27 -0
themis/config/registry.py +34 -0
themis/config/runtime.py +214 -0
themis/config/schema.py +112 -0
themis/core/__init__.py +5 -0
themis/core/conversation.py +354 -0
themis/core/entities.py +184 -0
themis/core/serialization.py +231 -0
themis/core/tools.py +393 -0
themis/core/types.py +141 -0
themis/datasets/__init__.py +273 -0
themis/datasets/base.py +264 -0
themis/datasets/commonsense_qa.py +174 -0
themis/datasets/competition_math.py +265 -0
themis/datasets/coqa.py +133 -0
themis/datasets/gpqa.py +190 -0
themis/datasets/gsm8k.py +123 -0
themis/datasets/gsm_symbolic.py +124 -0
themis/datasets/math500.py +122 -0
themis/datasets/med_qa.py +179 -0
themis/datasets/medmcqa.py +169 -0
themis/datasets/mmlu_pro.py +262 -0
themis/datasets/piqa.py +146 -0
themis/datasets/registry.py +201 -0
themis/datasets/schema.py +245 -0
themis/datasets/sciq.py +150 -0
themis/datasets/social_i_qa.py +151 -0
themis/datasets/super_gpqa.py +263 -0
themis/evaluation/__init__.py +1 -0
themis/evaluation/conditional.py +410 -0
themis/evaluation/extractors/__init__.py +19 -0
themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
themis/evaluation/extractors/exceptions.py +7 -0
themis/evaluation/extractors/identity_extractor.py +29 -0
themis/evaluation/extractors/json_field_extractor.py +45 -0
themis/evaluation/extractors/math_verify_extractor.py +37 -0
themis/evaluation/extractors/regex_extractor.py +43 -0
themis/evaluation/math_verify_utils.py +87 -0
themis/evaluation/metrics/__init__.py +21 -0
themis/evaluation/metrics/code/__init__.py +19 -0
themis/evaluation/metrics/code/codebleu.py +144 -0
themis/evaluation/metrics/code/execution.py +280 -0
themis/evaluation/metrics/code/pass_at_k.py +181 -0
themis/evaluation/metrics/composite_metric.py +47 -0
themis/evaluation/metrics/consistency_metric.py +80 -0
themis/evaluation/metrics/exact_match.py +51 -0
themis/evaluation/metrics/length_difference_tolerance.py +33 -0
themis/evaluation/metrics/math_verify_accuracy.py +40 -0
themis/evaluation/metrics/nlp/__init__.py +21 -0
themis/evaluation/metrics/nlp/bertscore.py +138 -0
themis/evaluation/metrics/nlp/bleu.py +129 -0
themis/evaluation/metrics/nlp/meteor.py +153 -0
themis/evaluation/metrics/nlp/rouge.py +136 -0
themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
themis/evaluation/metrics/response_length.py +33 -0
themis/evaluation/metrics/rubric_judge_metric.py +134 -0
themis/evaluation/pipeline.py +49 -0
themis/evaluation/pipelines/__init__.py +15 -0
themis/evaluation/pipelines/composable_pipeline.py +357 -0
themis/evaluation/pipelines/standard_pipeline.py +348 -0
themis/evaluation/reports.py +293 -0
themis/evaluation/statistics/__init__.py +53 -0
themis/evaluation/statistics/bootstrap.py +79 -0
themis/evaluation/statistics/confidence_intervals.py +121 -0
themis/evaluation/statistics/distributions.py +207 -0
themis/evaluation/statistics/effect_sizes.py +124 -0
themis/evaluation/statistics/hypothesis_tests.py +305 -0
themis/evaluation/statistics/types.py +139 -0
themis/evaluation/strategies/__init__.py +13 -0
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
themis/evaluation/strategies/evaluation_strategy.py +24 -0
themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
themis/experiment/__init__.py +5 -0
themis/experiment/builder.py +151 -0
themis/experiment/cache_manager.py +134 -0
themis/experiment/comparison.py +631 -0
themis/experiment/cost.py +310 -0
themis/experiment/definitions.py +62 -0
themis/experiment/export.py +798 -0
themis/experiment/export_csv.py +159 -0
themis/experiment/integration_manager.py +104 -0
themis/experiment/math.py +192 -0
themis/experiment/mcq.py +169 -0
themis/experiment/orchestrator.py +415 -0
themis/experiment/pricing.py +317 -0
themis/experiment/storage.py +1458 -0
themis/experiment/visualization.py +588 -0
themis/generation/__init__.py +1 -0
themis/generation/agentic_runner.py +420 -0
themis/generation/batching.py +254 -0
themis/generation/clients.py +143 -0
themis/generation/conversation_runner.py +236 -0
themis/generation/plan.py +456 -0
themis/generation/providers/litellm_provider.py +221 -0
themis/generation/providers/vllm_provider.py +135 -0
themis/generation/router.py +34 -0
themis/generation/runner.py +207 -0
themis/generation/strategies.py +98 -0
themis/generation/templates.py +71 -0
themis/generation/turn_strategies.py +393 -0
themis/generation/types.py +9 -0
themis/integrations/__init__.py +0 -0
themis/integrations/huggingface.py +72 -0
themis/integrations/wandb.py +77 -0
themis/interfaces/__init__.py +169 -0
themis/presets/__init__.py +10 -0
themis/presets/benchmarks.py +354 -0
themis/presets/models.py +190 -0
themis/project/__init__.py +20 -0
themis/project/definitions.py +98 -0
themis/project/patterns.py +230 -0
themis/providers/__init__.py +5 -0
themis/providers/registry.py +39 -0
themis/server/__init__.py +28 -0
themis/server/app.py +337 -0
themis/utils/api_generator.py +379 -0
themis/utils/cost_tracking.py +376 -0
themis/utils/dashboard.py +452 -0
themis/utils/logging_utils.py +41 -0
themis/utils/progress.py +58 -0
themis/utils/tracing.py +320 -0
themis_eval-0.2.0.dist-info/METADATA +596 -0
themis_eval-0.2.0.dist-info/RECORD +157 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
themis_eval-0.1.0.dist-info/METADATA +0 -758
themis_eval-0.1.0.dist-info/RECORD +0 -8
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0

themis/utils/cost_tracking.py ADDED Viewed

@@ -0,0 +1,376 @@
+"""Cost tracking utilities for monitoring LLM API usage and costs.
+This module provides tools to track token usage, API costs, and generate
+cost reports across experiments and providers.
+"""
+from __future__ import annotations
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List
+from themis.core import entities as core_entities
+# Provider pricing in USD per 1M tokens (as of 2024).
+# Format: {model_name: (input_cost_per_1m, output_cost_per_1m)}
+# CostTracker._compute_cost looks a model up by exact key first and then
+# treats these keys as prefixes of the requested model identifier, so a
+# dated variant of a listed model can still resolve to an entry here.
+DEFAULT_PRICING = {
+    # OpenAI GPT-4
+    "gpt-4": (30.0, 60.0),
+    "gpt-4-turbo": (10.0, 30.0),
+    "gpt-4o": (2.5, 10.0),
+    "gpt-4o-mini": (0.15, 0.60),
+    # OpenAI GPT-3.5
+    "gpt-3.5-turbo": (0.5, 1.5),
+    # Anthropic Claude
+    "claude-3-opus-20240229": (15.0, 75.0),
+    "claude-3-sonnet-20240229": (3.0, 15.0),
+    "claude-3-haiku-20240307": (0.25, 1.25),
+    "claude-3-5-sonnet-20241022": (3.0, 15.0),
+    # Google Gemini
+    "gemini-1.5-pro": (1.25, 5.0),
+    "gemini-1.5-flash": (0.075, 0.30),
+    # Meta Llama (via cloud providers - approximate)
+    "llama-3-70b": (0.9, 0.9),
+    "llama-3-8b": (0.2, 0.2),
+    # Fake/local models: zero cost for both input and output tokens
+    "fake": (0.0, 0.0),
+}
+@dataclass
+class TokenUsage:
+    """Token usage statistics for a single API call.
+    Attributes:
+        input_tokens: Number of input/prompt tokens
+        output_tokens: Number of output/completion tokens
+        total_tokens: Total tokens (input + output)
+    """
+    input_tokens: int
+    output_tokens: int
+    @property
+    def total_tokens(self) -> int:
+        return self.input_tokens + self.output_tokens
+@dataclass
+class CostRecord:
+    """Cost record for a single generation.
+    Instances are created by CostTracker.track_generation, which fills
+    total_cost with input_cost + output_cost; the dataclass itself does not
+    enforce that relationship.
+    Attributes:
+        model_identifier: Model name/identifier
+        provider: Provider name
+        usage: Token usage statistics
+        input_cost: Cost for input tokens (in USD)
+        output_cost: Cost for output tokens (in USD)
+        total_cost: Total cost (in USD)
+        metadata: Additional metadata; track_generation stores the task's
+            "sample_id" and "run_id" here
+    """
+    model_identifier: str
+    provider: str
+    usage: TokenUsage
+    input_cost: float
+    output_cost: float
+    total_cost: float
+    # default_factory gives every record its own dict rather than a shared one.
+    metadata: Dict[str, object] = field(default_factory=dict)
+@dataclass
+class CostSummary:
+    """Aggregated cost summary across multiple generations.
+    Built by CostTracker.get_summary from the tracker's recorded CostRecord
+    entries and rendered as text by format_cost_summary.
+    Attributes:
+        total_cost: Total cost in USD
+        total_tokens: Total number of tokens (input + output)
+        total_input_tokens: Total input tokens
+        total_output_tokens: Total output tokens
+        num_requests: Number of API requests (one per tracked record)
+        cost_by_model: Summed total cost keyed by model identifier
+        cost_by_provider: Summed total cost keyed by provider name
+    """
+    total_cost: float
+    total_tokens: int
+    total_input_tokens: int
+    total_output_tokens: int
+    num_requests: int
+    cost_by_model: Dict[str, float]
+    cost_by_provider: Dict[str, float]
+class CostTracker:
+    """Track and compute costs for LLM API usage.
+    This class maintains a record of all API calls and their costs,
+    with support for custom pricing models and cost aggregation.
+    """
+    def __init__(
+        self,
+        pricing: Dict[str, tuple[float, float]] | None = None,
+    ) -> None:
+        """Initialize cost tracker.
+        Args:
+            pricing: Custom pricing dictionary mapping model names to
+                (input_cost_per_1m, output_cost_per_1m) tuples.
+                Defaults to DEFAULT_PRICING if not provided.
+        """
+        self.pricing = pricing or DEFAULT_PRICING.copy()
+        self.records: List[CostRecord] = []
+    def add_pricing(
+        self,
+        model: str,
+        input_cost_per_1m: float,
+        output_cost_per_1m: float,
+    ) -> None:
+        """Add or update pricing for a model.
+        Args:
+            model: Model identifier
+            input_cost_per_1m: Cost per 1M input tokens in USD
+            output_cost_per_1m: Cost per 1M output tokens in USD
+        """
+        self.pricing[model] = (input_cost_per_1m, output_cost_per_1m)
+    def track_generation(
+        self,
+        record: core_entities.GenerationRecord,
+        input_tokens: int | None = None,
+        output_tokens: int | None = None,
+    ) -> CostRecord:
+        """Track cost for a generation record.
+        Args:
+            record: Generation record to track
+            input_tokens: Number of input tokens (if None, estimated from prompt)
+            output_tokens: Number of output tokens (if None, estimated from output)
+        Returns:
+            CostRecord with computed costs
+        """
+        model_id = record.task.model.identifier
+        provider = record.task.model.provider
+        # Extract or estimate token counts
+        if input_tokens is None:
+            input_tokens = self._estimate_tokens(record.task.prompt.text)
+        if output_tokens is None and record.output:
+            output_tokens = self._estimate_tokens(record.output.text)
+        elif output_tokens is None:
+            output_tokens = 0
+        usage = TokenUsage(input_tokens=input_tokens, output_tokens=output_tokens)
+        # Compute costs
+        input_cost, output_cost = self._compute_cost(model_id, usage)
+        total_cost = input_cost + output_cost
+        cost_record = CostRecord(
+            model_identifier=model_id,
+            provider=provider,
+            usage=usage,
+            input_cost=input_cost,
+            output_cost=output_cost,
+            total_cost=total_cost,
+            metadata={
+                "sample_id": record.task.metadata.get("sample_id"),
+                "run_id": record.task.metadata.get("run_id"),
+            },
+        )
+        self.records.append(cost_record)
+        return cost_record
+    def get_summary(self) -> CostSummary:
+        """Compute aggregated cost summary across all tracked records.
+        Returns:
+            CostSummary with aggregated statistics
+        """
+        if not self.records:
+            return CostSummary(
+                total_cost=0.0,
+                total_tokens=0,
+                total_input_tokens=0,
+                total_output_tokens=0,
+                num_requests=0,
+                cost_by_model={},
+                cost_by_provider={},
+            )
+        total_cost = sum(r.total_cost for r in self.records)
+        total_input_tokens = sum(r.usage.input_tokens for r in self.records)
+        total_output_tokens = sum(r.usage.output_tokens for r in self.records)
+        # Aggregate by model
+        cost_by_model: Dict[str, float] = {}
+        for record in self.records:
+            model = record.model_identifier
+            cost_by_model[model] = cost_by_model.get(model, 0.0) + record.total_cost
+        # Aggregate by provider
+        cost_by_provider: Dict[str, float] = {}
+        for record in self.records:
+            provider = record.provider
+            cost_by_provider[provider] = (
+                cost_by_provider.get(provider, 0.0) + record.total_cost
+            )
+        return CostSummary(
+            total_cost=total_cost,
+            total_tokens=total_input_tokens + total_output_tokens,
+            total_input_tokens=total_input_tokens,
+            total_output_tokens=total_output_tokens,
+            num_requests=len(self.records),
+            cost_by_model=cost_by_model,
+            cost_by_provider=cost_by_provider,
+        )
+    def export_records(self, path: str | Path) -> None:
+        """Export cost records to JSON file.
+        Args:
+            path: Output file path
+        """
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        data = {
+            "records": [
+                {
+                    "model": r.model_identifier,
+                    "provider": r.provider,
+                    "input_tokens": r.usage.input_tokens,
+                    "output_tokens": r.usage.output_tokens,
+                    "total_tokens": r.usage.total_tokens,
+                    "input_cost": r.input_cost,
+                    "output_cost": r.output_cost,
+                    "total_cost": r.total_cost,
+                    "metadata": r.metadata,
+                }
+                for r in self.records
+            ],
+            "summary": {
+                "total_cost": self.get_summary().total_cost,
+                "total_tokens": self.get_summary().total_tokens,
+                "num_requests": len(self.records),
+            },
+        }
+        with open(path, "w") as f:
+            json.dump(data, f, indent=2)
+    def _compute_cost(
+        self,
+        model: str,
+        usage: TokenUsage,
+    ) -> tuple[float, float]:
+        """Compute input and output costs for a model.
+        Args:
+            model: Model identifier
+            usage: Token usage statistics
+        Returns:
+            Tuple of (input_cost, output_cost) in USD
+        """
+        # Try exact match first
+        pricing = self.pricing.get(model)
+        # If no exact match, try prefix matching
+        if pricing is None:
+            for price_key in self.pricing:
+                if model.startswith(price_key):
+                    pricing = self.pricing[price_key]
+                    break
+        # Fall back to generic pricing if model not found
+        if pricing is None:
+            # Use a reasonable default ($1 per 1M tokens)
+            pricing = (1.0, 1.0)
+        input_cost_per_1m, output_cost_per_1m = pricing
+        input_cost = (usage.input_tokens / 1_000_000) * input_cost_per_1m
+        output_cost = (usage.output_tokens / 1_000_000) * output_cost_per_1m
+        return input_cost, output_cost
+    @staticmethod
+    def _estimate_tokens(text: str) -> int:
+        """Rough estimate of token count from text.
+        Uses a simple heuristic: ~4 characters per token on average.
+        For accurate counts, use provider-specific tokenizers.
+        Args:
+            text: Input text
+        Returns:
+            Estimated token count
+        """
+        return max(1, len(text) // 4)
+def format_cost_summary(summary: CostSummary) -> str:
+    """Format cost summary as human-readable string.
+    Args:
+        summary: Cost summary to format
+    Returns:
+        Formatted string representation
+    """
+    lines = [
+        "Cost Summary",
+        "=" * 50,
+        f"Total Cost:        ${summary.total_cost:.4f}",
+        f"Total Tokens:      {summary.total_tokens:,}",
+        f"  Input Tokens:    {summary.total_input_tokens:,}",
+        f"  Output Tokens:   {summary.total_output_tokens:,}",
+        f"API Requests:      {summary.num_requests:,}",
+        "",
+    ]
+    if summary.cost_by_model:
+        lines.append("Cost by Model:")
+        lines.append("-" * 50)
+        for model, cost in sorted(
+            summary.cost_by_model.items(),
+            key=lambda x: x[1],
+            reverse=True,
+        ):
+            pct = (cost / summary.total_cost * 100) if summary.total_cost > 0 else 0
+            lines.append(f"  {model:30s} ${cost:8.4f} ({pct:5.1f}%)")
+        lines.append("")
+    if summary.cost_by_provider:
+        lines.append("Cost by Provider:")
+        lines.append("-" * 50)
+        for provider, cost in sorted(
+            summary.cost_by_provider.items(),
+            key=lambda x: x[1],
+            reverse=True,
+        ):
+            pct = (cost / summary.total_cost * 100) if summary.total_cost > 0 else 0
+            lines.append(f"  {provider:30s} ${cost:8.4f} ({pct:5.1f}%)")
+    return "\n".join(lines)
+# Public names of this module; a star-import exports exactly these.
+__all__ = [
+    "TokenUsage",
+    "CostRecord",
+    "CostSummary",
+    "CostTracker",
+    "DEFAULT_PRICING",
+    "format_cost_summary",
+]

themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl