themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/experiment/orchestrator.py
@@ -0,0 +1,373 @@
+"""Experiment orchestrator primitives."""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Callable, Sequence
+
+from themis.config.schema import IntegrationsConfig
+from themis.core.entities import (
+    EvaluationRecord,
+    ExperimentFailure,
+    ExperimentReport,
+    GenerationRecord,
+    GenerationTask,
+    MetricScore,
+)
+from themis.evaluation import pipeline as evaluation_pipeline
+from themis.evaluation.reports import EvaluationFailure
+from themis.experiment import storage as experiment_storage
+from themis.experiment.cache_manager import CacheManager
+from themis.experiment.cost import CostTracker
+from themis.experiment.integration_manager import IntegrationManager
+from themis.experiment.pricing import calculate_cost, get_provider_pricing
+from themis.generation import plan as generation_plan
+from themis.generation import runner as generation_runner
+
+
+class ExperimentOrchestrator:
+    """Orchestrates experiment execution: generation → evaluation → reporting.
+
+    This class coordinates the experiment workflow using focused managers:
+    - CacheManager: Handles storage and resumability
+    - IntegrationManager: Handles WandB and HuggingFace Hub
+
+    Single Responsibility: Orchestration of experiment flow
+    """
+
+    def __init__(
+        self,
+        *,
+        generation_plan: generation_plan.GenerationPlan,
+        generation_runner: generation_runner.GenerationRunner,
+        evaluation_pipeline: evaluation_pipeline.EvaluationPipeline,
+        storage: experiment_storage.ExperimentStorage | None = None,
+        integrations_config: IntegrationsConfig | None = None,
+        cache_manager: CacheManager | None = None,
+        integration_manager: IntegrationManager | None = None,
+    ) -> None:
+        """Initialize experiment orchestrator.
+
+        Args:
+            generation_plan: Plan for expanding dataset into tasks
+            generation_runner: Runner for executing generation tasks
+            evaluation_pipeline: Pipeline for evaluating outputs
+            storage: Optional storage backend (deprecated, use cache_manager)
+            integrations_config: Integration config (deprecated, use integration_manager)
+            cache_manager: Manager for caching and resumability
+            integration_manager: Manager for external integrations
+        """
+        self._plan = generation_plan
+        self._runner = generation_runner
+        self._evaluation = evaluation_pipeline
+
+        # Support both new managers and legacy direct parameters for backward compatibility
+        self._cache = cache_manager or CacheManager(
+            storage=storage,
+            enable_resume=True,
+            enable_cache=True,
+        )
+        self._integrations = integration_manager or IntegrationManager(
+            config=integrations_config or IntegrationsConfig()
+        )
+
+        # Initialize cost tracker
+        self._cost_tracker = CostTracker()
+
+        # Keep legacy references for backward compatibility
+        self._storage = storage
+
+    def run(
+        self,
+        dataset: Sequence[dict[str, object]] | None = None,
+        *,
+        dataset_loader: Callable[[], Sequence[dict[str, object]]] | None = None,
+        max_samples: int | None = None,
+        run_id: str | None = None,
+        resume: bool = True,
+        cache_results: bool = True,
+        on_result: Callable[[GenerationRecord], None] | None = None,
+    ) -> ExperimentReport:
+        """Run experiment: generate responses, evaluate, and report results.
+
+        Args:
+            dataset: Optional dataset samples to use
+            dataset_loader: Optional callable to load dataset
+            max_samples: Optional limit on number of samples
+            run_id: Optional run identifier for caching
+            resume: Whether to resume from cached results
+            cache_results: Whether to cache new results
+            on_result: Optional callback for each generation result
+
+        Returns:
+            ExperimentReport with generation results, evaluation, and metadata
+        """
+        # Initialize integrations
+        self._integrations.initialize_run(
+            {
+                "max_samples": max_samples,
+                "run_id": run_id,
+                "resume": resume,
+            }
+        )
+
+        # Prepare dataset
+        dataset_list = self._resolve_dataset(
+            dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+        )
+        selected_dataset = (
+            dataset_list[:max_samples] if max_samples is not None else dataset_list
+        )
+        run_identifier = run_id or self._default_run_id()
+
+        # Cache dataset for resumability
+        if dataset_list:
+            self._cache.cache_dataset(run_identifier, dataset_list)
+
+        # Expand dataset into generation tasks
+        tasks = list(self._plan.expand(selected_dataset))
+
+        # Load cached results if resuming
+        cached_records = (
+            self._cache.load_cached_records(run_identifier) if resume else {}
+        )
+        cached_evaluations = (
+            self._cache.load_cached_evaluations(run_identifier) if resume else {}
+        )
+
+        # Process tasks: use cached or run new generations
+        generation_results: list[GenerationRecord] = []
+        failures: list[ExperimentFailure] = []
+        pending_tasks: list[GenerationTask] = []
+        pending_records: list[GenerationRecord] = []
+        pending_keys: list[str] = []
+        cached_eval_records: list[EvaluationRecord] = []
+
+        for task in tasks:
+            cache_key = experiment_storage.task_cache_key(task)
+            cached = cached_records.get(cache_key)
+            if cached is not None:
+                generation_results.append(cached)
+                if cached.error:
+                    failures.append(
+                        ExperimentFailure(
+                            sample_id=cached.task.metadata.get("dataset_id"),
+                            message=cached.error.message,
+                        )
+                    )
+                evaluation = cached_evaluations.get(cache_key)
+                if evaluation is not None:
+                    cached_eval_records.append(evaluation)
+                else:
+                    pending_records.append(cached)
+                    pending_keys.append(cache_key)
+                if on_result:
+                    on_result(cached)
+            else:
+                pending_tasks.append(task)
+
+        # Run pending generation tasks
+        if pending_tasks:
+            for record in self._runner.run(pending_tasks):
+                generation_results.append(record)
+
+                # Track cost for successful generations
+                if record.output and record.output.usage:
+                    usage = record.output.usage
+                    prompt_tokens = usage.get("prompt_tokens", 0)
+                    completion_tokens = usage.get("completion_tokens", 0)
+                    model = record.task.model.identifier
+
+                    # Calculate cost using pricing database
+                    cost = calculate_cost(model, prompt_tokens, completion_tokens)
+                    self._cost_tracker.record_generation(
+                        model=model,
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        cost=cost,
+                    )
+
+                if record.error:
+                    failures.append(
+                        ExperimentFailure(
+                            sample_id=record.task.metadata.get("dataset_id"),
+                            message=record.error.message,
+                        )
+                    )
+                cache_key = experiment_storage.task_cache_key(record.task)
+                if cache_results:
+                    self._cache.save_generation_record(
+                        run_identifier, record, cache_key
+                    )
+                pending_records.append(record)
+                pending_keys.append(cache_key)
+                if on_result:
+                    on_result(record)
+
+        # Evaluate pending records
+        if pending_records:
+            new_evaluation_report = self._evaluation.evaluate(pending_records)
+        else:
+            new_evaluation_report = evaluation_pipeline.EvaluationReport(
+                metrics={}, failures=[], records=[]
+            )
+
+        # Cache evaluation results
+        for record, evaluation in zip(pending_records, new_evaluation_report.records):
+            self._cache.save_evaluation_record(run_identifier, record, evaluation)
+
+        # Combine cached and new evaluations
+        evaluation_report = self._combine_evaluations(
+            cached_eval_records, new_evaluation_report
+        )
+
+        # Get cost breakdown
+        cost_breakdown = self._cost_tracker.get_breakdown()
+
+        # Build metadata
+        metadata = {
+            "total_samples": len(selected_dataset),
+            "successful_generations": sum(
+                1 for result in generation_results if not result.error
+            ),
+            "failed_generations": sum(
+                1 for result in generation_results if result.error
+            ),
+            "run_id": run_identifier,
+            "evaluation_failures": sum(
+                1 for record in evaluation_report.records if record.failures
+            )
+            + len(evaluation_report.failures),
+            # Cost tracking
+            "cost": {
+                "total_cost": cost_breakdown.total_cost,
+                "generation_cost": cost_breakdown.generation_cost,
+                "evaluation_cost": cost_breakdown.evaluation_cost,
+                "currency": cost_breakdown.currency,
+                "token_counts": cost_breakdown.token_counts,
+                "api_calls": cost_breakdown.api_calls,
+                "per_model_costs": cost_breakdown.per_model_costs,
+            },
+        }
+
+        # Create final report
+        report = ExperimentReport(
+            generation_results=generation_results,
+            evaluation_report=evaluation_report,
+            failures=failures,
+            metadata=metadata,
+        )
+
+        # Log to integrations
+        self._integrations.log_results(report)
+
+        # Upload to HuggingFace Hub if enabled
+        run_path = self._cache.get_run_path(run_identifier)
+        self._integrations.upload_results(report, run_path)
+
+        # Save report.json for multi-experiment comparison
+        if cache_results:
+            self._save_report_json(report, run_identifier)
+
+        return report
+
+    def _default_run_id(self) -> str:
+        return datetime.now(timezone.utc).strftime("run-%Y%m%d-%H%M%S")
+
+    def _resolve_dataset(
+        self,
+        *,
+        dataset: Sequence[dict[str, object]] | None,
+        dataset_loader: Callable[[], Sequence[dict[str, object]]] | None,
+        run_id: str | None,
+    ) -> list[dict[str, object]]:
+        """Resolve dataset from various sources.
+
+        Args:
+            dataset: Direct dataset samples
+            dataset_loader: Callable to load dataset
+            run_id: Run ID to load cached dataset
+
+        Returns:
+            List of dataset samples
+
+        Raises:
+            ValueError: If no dataset source is available
+        """
+        if dataset is not None:
+            return list(dataset)
+        if dataset_loader is not None:
+            return list(dataset_loader())
+        # Try to load from cache (for backward compatibility, still use _storage directly)
+        if self._storage is not None and run_id is not None:
+            return self._storage.load_dataset(run_id)
+        raise ValueError(
+            "No dataset provided. Supply `dataset=` rows, a `dataset_loader`, "
+            "or set `run_id` with storage configured so cached data can be reloaded."
+        )
+
+    def _combine_evaluations(
+        self,
+        cached_records: list[EvaluationRecord],
+        new_report: evaluation_pipeline.EvaluationReport,
+    ) -> evaluation_pipeline.EvaluationReport:
+        all_records = list(cached_records) + list(new_report.records)
+        per_metric: dict[str, list[MetricScore]] = {}
+        for record in all_records:
+            for score in record.scores:
+                per_metric.setdefault(score.metric_name, []).append(score)
+
+        aggregates: dict[str, evaluation_pipeline.MetricAggregate] = {}
+        metric_names = set(per_metric.keys()) | set(new_report.metrics.keys())
+        for name in metric_names:
+            scores = per_metric.get(name, [])
+            mean = sum(score.value for score in scores) / len(scores) if scores else 0.0
+            aggregates[name] = evaluation_pipeline.MetricAggregate(
+                name=name,
+                count=len(scores),
+                mean=mean,
+                per_sample=scores,
+            )
+
+        failures = list(new_report.failures)
+        for record in cached_records:
+            for message in record.failures:
+                failures.append(
+                    EvaluationFailure(sample_id=record.sample_id, message=message)
+                )
+
+        return evaluation_pipeline.EvaluationReport(
+            metrics=aggregates,
+            failures=failures,
+            records=all_records,
+        )
+
+    def _save_report_json(self, report: ExperimentReport, run_id: str) -> None:
+        """Save experiment report as JSON for multi-experiment comparison.
+
+        Args:
+            report: Experiment report to save
+            run_id: Run identifier
+        """
+        from pathlib import Path
+
+        from themis.experiment.export import build_json_report
+
+        # Get run path from cache manager
+        run_path_str = self._cache.get_run_path(run_id)
+        if run_path_str is None:
+            # No storage configured, skip saving report.json
+            return
+
+        run_path = Path(run_path_str)
+        report_path = run_path / "report.json"
+
+        # Build JSON report
+        json_data = build_json_report(report, title=f"Experiment {run_id}")
+
+        # Save to file
+        import json
+
+        report_path.parent.mkdir(parents=True, exist_ok=True)
+        with report_path.open("w", encoding="utf-8") as f:
+            json.dump(json_data, f, indent=2)
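
For orientation, here is a minimal usage sketch for the orchestrator added above. It assumes `plan`, `runner`, and `pipeline` objects have already been built from the generation and evaluation modules (their construction is not part of this file), and that `ExperimentReport` exposes the `metadata` mapping as an attribute; treat the names below as illustrative, not as a verbatim recipe.

    from themis.experiment.orchestrator import ExperimentOrchestrator

    # plan, runner, and pipeline are assumed to come from themis.generation.plan,
    # themis.generation.runner, and themis.evaluation.pipeline respectively;
    # how they are configured is outside this diff.
    orchestrator = ExperimentOrchestrator(
        generation_plan=plan,
        generation_runner=runner,
        evaluation_pipeline=pipeline,
    )

    # run() expands the dataset into tasks, reuses cached generations when
    # resume=True, evaluates any pending records, and returns an ExperimentReport.
    report = orchestrator.run(
        dataset=[{"question": "2 + 2 = ?", "answer": "4"}],
        run_id="demo-run",
        resume=True,
        cache_results=True,
    )
    print(report.metadata["run_id"], report.metadata["cost"]["total_cost"])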
themis/experiment/pricing.py
@@ -0,0 +1,317 @@
+"""Provider pricing database and cost calculation utilities."""
+
+from __future__ import annotations
+
+from typing import Any
+
+# Pricing table for common LLM providers (prices per token in USD)
+# Updated as of November 2024
+PRICING_TABLE: dict[str, dict[str, float]] = {
+    # OpenAI models
+    "gpt-4": {
+        "prompt_tokens": 0.00003,  # $30 per 1M tokens
+        "completion_tokens": 0.00006,  # $60 per 1M tokens
+    },
+    "gpt-4-32k": {
+        "prompt_tokens": 0.00006,
+        "completion_tokens": 0.00012,
+    },
+    "gpt-4-turbo": {
+        "prompt_tokens": 0.00001,  # $10 per 1M tokens
+        "completion_tokens": 0.00003,  # $30 per 1M tokens
+    },
+    "gpt-4-turbo-preview": {
+        "prompt_tokens": 0.00001,
+        "completion_tokens": 0.00003,
+    },
+    "gpt-3.5-turbo": {
+        "prompt_tokens": 0.0000005,  # $0.50 per 1M tokens
+        "completion_tokens": 0.0000015,  # $1.50 per 1M tokens
+    },
+    "gpt-3.5-turbo-16k": {
+        "prompt_tokens": 0.000003,
+        "completion_tokens": 0.000004,
+    },
+    # Anthropic Claude models
+    "claude-3-5-sonnet-20241022": {
+        "prompt_tokens": 0.000003,  # $3 per 1M tokens
+        "completion_tokens": 0.000015,  # $15 per 1M tokens
+    },
+    "claude-3-opus-20240229": {
+        "prompt_tokens": 0.000015,  # $15 per 1M tokens
+        "completion_tokens": 0.000075,  # $75 per 1M tokens
+    },
+    "claude-3-sonnet-20240229": {
+        "prompt_tokens": 0.000003,
+        "completion_tokens": 0.000015,
+    },
+    "claude-3-haiku-20240307": {
+        "prompt_tokens": 0.00000025,  # $0.25 per 1M tokens
+        "completion_tokens": 0.00000125,  # $1.25 per 1M tokens
+    },
+    # Google models
+    "gemini-pro": {
+        "prompt_tokens": 0.00000025,
+        "completion_tokens": 0.0000005,
+    },
+    "gemini-1.5-pro": {
+        "prompt_tokens": 0.00000125,  # $1.25 per 1M tokens
+        "completion_tokens": 0.000005,  # $5 per 1M tokens
+    },
+    "gemini-1.5-flash": {
+        "prompt_tokens": 0.000000075,  # $0.075 per 1M tokens
+        "completion_tokens": 0.0000003,  # $0.30 per 1M tokens
+    },
+    # Mistral models
+    "mistral-large-latest": {
+        "prompt_tokens": 0.000002,  # $2 per 1M tokens
+        "completion_tokens": 0.000006,  # $6 per 1M tokens
+    },
+    "mistral-medium-latest": {
+        "prompt_tokens": 0.0000027,
+        "completion_tokens": 0.0000081,
+    },
+    "mistral-small-latest": {
+        "prompt_tokens": 0.000001,
+        "completion_tokens": 0.000003,
+    },
+    # Cohere models
+    "command-r-plus": {
+        "prompt_tokens": 0.000003,
+        "completion_tokens": 0.000015,
+    },
+    "command-r": {
+        "prompt_tokens": 0.0000005,
+        "completion_tokens": 0.0000015,
+    },
+    # Meta Llama (via various providers - using typical cloud pricing)
+    "llama-3.1-70b": {
+        "prompt_tokens": 0.00000088,
+        "completion_tokens": 0.00000088,
+    },
+    "llama-3.1-8b": {
+        "prompt_tokens": 0.0000002,
+        "completion_tokens": 0.0000002,
+    },
+    # Default fallback for unknown models
+    "default": {
+        "prompt_tokens": 0.000001,
+        "completion_tokens": 0.000002,
+    },
+}
+
+# Model aliases and variations
+MODEL_ALIASES: dict[str, str] = {
+    # OpenAI aliases
+    "gpt-4-0613": "gpt-4",
+    "gpt-4-0314": "gpt-4",
+    "gpt-4-1106-preview": "gpt-4-turbo-preview",
+    "gpt-4-0125-preview": "gpt-4-turbo-preview",
+    "gpt-3.5-turbo-0613": "gpt-3.5-turbo",
+    "gpt-3.5-turbo-0301": "gpt-3.5-turbo",
+    "gpt-3.5-turbo-1106": "gpt-3.5-turbo",
+    # Anthropic aliases
+    "claude-3-opus": "claude-3-opus-20240229",
+    "claude-3-sonnet": "claude-3-sonnet-20240229",
+    "claude-3-haiku": "claude-3-haiku-20240307",
+    "claude-3.5-sonnet": "claude-3-5-sonnet-20241022",
+    # Google aliases
+    "gemini-pro-1.0": "gemini-pro",
+    "gemini-1.5-pro-latest": "gemini-1.5-pro",
+    "gemini-1.5-flash-latest": "gemini-1.5-flash",
+}
+
+
+def normalize_model_name(model: str) -> str:
+    """Normalize model name to canonical form.
+
+    Args:
+        model: Model identifier (may include provider prefix)
+
+    Returns:
+        Normalized model name
+
+    Example:
+        >>> normalize_model_name("openai/gpt-4-0613")
+        'gpt-4'
+        >>> normalize_model_name("claude-3-opus")
+        'claude-3-opus-20240229'
+    """
+    # Remove provider prefix if present (e.g., "openai/gpt-4" -> "gpt-4")
+    if "/" in model:
+        model = model.split("/", 1)[1]
+
+    # Look up alias
+    model = MODEL_ALIASES.get(model, model)
+
+    return model
+
+
+def get_provider_pricing(model: str) -> dict[str, float]:
+    """Get pricing for a model.
+
+    Args:
+        model: Model identifier
+
+    Returns:
+        Dict with 'prompt_tokens' and 'completion_tokens' prices per token
+
+    Example:
+        >>> pricing = get_provider_pricing("gpt-4")
+        >>> print(f"Prompt: ${pricing['prompt_tokens'] * 1_000_000:.2f}/1M tokens")
+        Prompt: $30.00/1M tokens
+    """
+    normalized = normalize_model_name(model)
+
+    # Check if we have pricing for this model
+    if normalized in PRICING_TABLE:
+        return PRICING_TABLE[normalized].copy()
+
+    # Try to find a partial match (e.g., "gpt-4-turbo-2024-04-09" matches "gpt-4-turbo")
+    for known_model in PRICING_TABLE:
+        if known_model in normalized or normalized.startswith(known_model):
+            return PRICING_TABLE[known_model].copy()
+
+    # Fallback to default pricing
+    return PRICING_TABLE["default"].copy()
+
+
+def calculate_cost(
+    model: str,
+    prompt_tokens: int,
+    completion_tokens: int,
+    pricing: dict[str, float] | None = None,
+) -> float:
+    """Calculate cost for a model completion.
+
+    Args:
+        model: Model identifier
+        prompt_tokens: Number of prompt tokens
+        completion_tokens: Number of completion tokens
+        pricing: Optional custom pricing (if None, uses default pricing table)
+
+    Returns:
+        Total cost in USD
+
+    Example:
+        >>> cost = calculate_cost("gpt-4", 1000, 500)
+        >>> print(f"Cost: ${cost:.4f}")
+        Cost: $0.0600
+    """
+    if pricing is None:
+        pricing = get_provider_pricing(model)
+
+    prompt_cost = prompt_tokens * pricing["prompt_tokens"]
+    completion_cost = completion_tokens * pricing["completion_tokens"]
+
+    return prompt_cost + completion_cost
+
+
+def compare_provider_costs(
+    prompt_tokens: int,
+    completion_tokens: int,
+    models: list[str],
+) -> dict[str, float]:
+    """Compare costs across multiple providers for same workload.
+
+    Args:
+        prompt_tokens: Number of prompt tokens
+        completion_tokens: Number of completion tokens
+        models: List of model identifiers to compare
+
+    Returns:
+        Dict mapping model names to costs
+
+    Example:
+        >>> costs = compare_provider_costs(
+        ...     1000, 500, ["gpt-4", "gpt-3.5-turbo", "claude-3-haiku"]
+        ... )
+        >>> for model, cost in sorted(costs.items(), key=lambda x: x[1]):
+        ...     print(f"{model}: ${cost:.4f}")
+        claude-3-haiku: $0.0009
+        gpt-3.5-turbo: $0.0013
+        gpt-4: $0.0600
+    """
+    costs = {}
+    for model in models:
+        costs[model] = calculate_cost(model, prompt_tokens, completion_tokens)
+    return costs
+
+
+def estimate_tokens(text: str, chars_per_token: float = 4.0) -> int:
+    """Estimate number of tokens from text.
+
+    This is a rough approximation. For accurate token counts,
+    use the model's tokenizer.
+
+    Args:
+        text: Input text
+        chars_per_token: Average characters per token (default: 4.0)
+
+    Returns:
+        Estimated token count
+
+    Example:
+        >>> text = "This is a sample text for token estimation."
+        >>> tokens = estimate_tokens(text)
+        >>> print(f"Estimated tokens: {tokens}")
+        Estimated tokens: 11
+    """
+    if not text:
+        return 0
+    return max(1, int(len(text) / chars_per_token))
+
+
+def get_all_models() -> list[str]:
+    """Get list of all models with known pricing.
+
+    Returns:
+        List of model identifiers
+    """
+    return [k for k in PRICING_TABLE.keys() if k != "default"]
+
+
+def get_pricing_summary() -> dict[str, Any]:
+    """Get summary of pricing for all models.
+
+    Returns:
+        Dict with model pricing information
+
+    Example:
+        >>> summary = get_pricing_summary()
+        >>> print(f"Total models: {summary['total_models']}")
+        >>> print(f"Cheapest: {summary['cheapest_model']}")
+    """
+    models = get_all_models()
+
+    # Find cheapest and most expensive (based on prompt + completion average)
+    model_avg_costs = {}
+    for model in models:
+        pricing = PRICING_TABLE[model]
+        avg_cost = (pricing["prompt_tokens"] + pricing["completion_tokens"]) / 2
+        model_avg_costs[model] = avg_cost
+
+    cheapest = min(model_avg_costs.items(), key=lambda x: x[1])
+    most_expensive = max(model_avg_costs.items(), key=lambda x: x[1])
+
+    return {
+        "total_models": len(models),
+        "cheapest_model": cheapest[0],
+        "cheapest_avg_cost_per_token": cheapest[1],
+        "most_expensive_model": most_expensive[0],
+        "most_expensive_avg_cost_per_token": most_expensive[1],
+        "models": models,
+    }
+
+
+__all__ = [
+    "PRICING_TABLE",
+    "MODEL_ALIASES",
+    "normalize_model_name",
+    "get_provider_pricing",
+    "calculate_cost",
+    "compare_provider_costs",
+    "estimate_tokens",
+    "get_all_models",
+    "get_pricing_summary",
+]
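
As a quick illustration of how these helpers combine for pre-run budgeting, the sketch below uses only functions defined in this file; the prompt text and completion budget are made-up values, and the printed costs follow directly from PRICING_TABLE above.

    from themis.experiment.pricing import (
        calculate_cost,
        compare_provider_costs,
        estimate_tokens,
    )

    # Rough token estimate for a prompt (4 characters per token heuristic);
    # a real tokenizer will give different counts.
    prompt = "Summarize the following contract in three bullet points."
    prompt_tokens = estimate_tokens(prompt)
    completion_budget = 300  # assumed completion length, not measured

    cost = calculate_cost("gpt-4", prompt_tokens, completion_budget)
    print(f"Single gpt-4 call: ~${cost:.4f}")

    # Same workload priced across several models; aliases such as "claude-3-haiku"
    # resolve through MODEL_ALIASES, and unknown names fall back to "default".
    costs = compare_provider_costs(1000, 500, ["gpt-4", "gpt-3.5-turbo", "claude-3-haiku"])
    for model, model_cost in sorted(costs.items(), key=lambda item: item[1]):
        print(f"{model}: ${model_cost:.4f}")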