themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,317 @@
1
+ """Provider pricing database and cost calculation utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
# Pricing table for common LLM providers (prices per token in USD).
# Updated as of November 2024.
# Keys are canonical model names; values map token kind
# ("prompt_tokens" / "completion_tokens") to the USD price per single token.
# The "default" entry is the fallback used for unrecognised models.
PRICING_TABLE: dict[str, dict[str, float]] = {
    # OpenAI models
    "gpt-4": {
        "prompt_tokens": 0.00003,  # $30 per 1M tokens
        "completion_tokens": 0.00006,  # $60 per 1M tokens
    },
    "gpt-4-32k": {
        "prompt_tokens": 0.00006,
        "completion_tokens": 0.00012,
    },
    "gpt-4-turbo": {
        "prompt_tokens": 0.00001,  # $10 per 1M tokens
        "completion_tokens": 0.00003,  # $30 per 1M tokens
    },
    "gpt-4-turbo-preview": {
        "prompt_tokens": 0.00001,
        "completion_tokens": 0.00003,
    },
    "gpt-3.5-turbo": {
        "prompt_tokens": 0.0000005,  # $0.50 per 1M tokens
        "completion_tokens": 0.0000015,  # $1.50 per 1M tokens
    },
    "gpt-3.5-turbo-16k": {
        "prompt_tokens": 0.000003,
        "completion_tokens": 0.000004,
    },
    # Anthropic Claude models
    "claude-3-5-sonnet-20241022": {
        "prompt_tokens": 0.000003,  # $3 per 1M tokens
        "completion_tokens": 0.000015,  # $15 per 1M tokens
    },
    "claude-3-opus-20240229": {
        "prompt_tokens": 0.000015,  # $15 per 1M tokens
        "completion_tokens": 0.000075,  # $75 per 1M tokens
    },
    "claude-3-sonnet-20240229": {
        "prompt_tokens": 0.000003,
        "completion_tokens": 0.000015,
    },
    "claude-3-haiku-20240307": {
        "prompt_tokens": 0.00000025,  # $0.25 per 1M tokens
        "completion_tokens": 0.00000125,  # $1.25 per 1M tokens
    },
    # Google models
    "gemini-pro": {
        "prompt_tokens": 0.00000025,
        "completion_tokens": 0.0000005,
    },
    "gemini-1.5-pro": {
        "prompt_tokens": 0.00000125,  # $1.25 per 1M tokens
        "completion_tokens": 0.000005,  # $5 per 1M tokens
    },
    "gemini-1.5-flash": {
        "prompt_tokens": 0.000000075,  # $0.075 per 1M tokens
        "completion_tokens": 0.0000003,  # $0.30 per 1M tokens
    },
    # Mistral models
    "mistral-large-latest": {
        "prompt_tokens": 0.000002,  # $2 per 1M tokens
        "completion_tokens": 0.000006,  # $6 per 1M tokens
    },
    "mistral-medium-latest": {
        "prompt_tokens": 0.0000027,
        "completion_tokens": 0.0000081,
    },
    "mistral-small-latest": {
        "prompt_tokens": 0.000001,
        "completion_tokens": 0.000003,
    },
    # Cohere models
    "command-r-plus": {
        "prompt_tokens": 0.000003,
        "completion_tokens": 0.000015,
    },
    "command-r": {
        "prompt_tokens": 0.0000005,
        "completion_tokens": 0.0000015,
    },
    # Meta Llama (via various providers - using typical cloud pricing)
    "llama-3.1-70b": {
        "prompt_tokens": 0.00000088,
        "completion_tokens": 0.00000088,
    },
    "llama-3.1-8b": {
        "prompt_tokens": 0.0000002,
        "completion_tokens": 0.0000002,
    },
    # Default fallback for unknown models
    "default": {
        "prompt_tokens": 0.000001,
        "completion_tokens": 0.000002,
    },
}
102
+
103
# Model aliases and variations.
# Maps dated snapshots and shorthand identifiers to the canonical names used
# as keys in PRICING_TABLE, so every variant resolves to one pricing entry.
MODEL_ALIASES: dict[str, str] = {
    # OpenAI aliases
    "gpt-4-0613": "gpt-4",
    "gpt-4-0314": "gpt-4",
    "gpt-4-1106-preview": "gpt-4-turbo-preview",
    "gpt-4-0125-preview": "gpt-4-turbo-preview",
    "gpt-3.5-turbo-0613": "gpt-3.5-turbo",
    "gpt-3.5-turbo-0301": "gpt-3.5-turbo",
    "gpt-3.5-turbo-1106": "gpt-3.5-turbo",
    # Anthropic aliases
    "claude-3-opus": "claude-3-opus-20240229",
    "claude-3-sonnet": "claude-3-sonnet-20240229",
    "claude-3-haiku": "claude-3-haiku-20240307",
    "claude-3.5-sonnet": "claude-3-5-sonnet-20241022",
    # Google aliases
    "gemini-pro-1.0": "gemini-pro",
    "gemini-1.5-pro-latest": "gemini-1.5-pro",
    "gemini-1.5-flash-latest": "gemini-1.5-flash",
}
123
+
124
+
125
def normalize_model_name(model: str) -> str:
    """Normalize model name to canonical form.

    Args:
        model: Model identifier (may include provider prefix)

    Returns:
        Normalized model name

    Example:
        >>> normalize_model_name("openai/gpt-4-0613")
        'gpt-4'
        >>> normalize_model_name("claude-3-opus")
        'claude-3-opus-20240229'
    """
    # Drop a provider prefix if present (e.g. "openai/gpt-4" -> "gpt-4").
    # partition splits on the first "/", keeping everything after it.
    _, slash, remainder = model.partition("/")
    candidate = remainder if slash else model

    # Resolve known aliases to their canonical name; unknown names pass through.
    return MODEL_ALIASES.get(candidate, candidate)
148
+
149
+
150
def get_provider_pricing(model: str) -> dict[str, float]:
    """Get pricing for a model.

    Args:
        model: Model identifier

    Returns:
        Dict with 'prompt_tokens' and 'completion_tokens' prices per token
        (a copy, safe for the caller to mutate)

    Example:
        >>> pricing = get_provider_pricing("gpt-4")
        >>> print(f"Prompt: ${pricing['prompt_tokens'] * 1_000_000:.2f}/1M tokens")
        Prompt: $30.00/1M tokens
    """
    normalized = normalize_model_name(model)

    # Exact match against the canonical pricing table.
    if normalized in PRICING_TABLE:
        return PRICING_TABLE[normalized].copy()

    # Partial match (e.g. "gpt-4-turbo-2024-04-09" should resolve to
    # "gpt-4-turbo").  Try longer, more specific names first: plain
    # insertion-order iteration would let "gpt-4" capture every
    # "gpt-4-turbo-*" variant and return the wrong (3x higher) price.
    for known_model in sorted(PRICING_TABLE, key=len, reverse=True):
        if known_model == "default":
            continue  # the fallback sentinel must never partial-match
        if known_model in normalized or normalized.startswith(known_model):
            return PRICING_TABLE[known_model].copy()

    # Fallback to default pricing for unknown models.
    return PRICING_TABLE["default"].copy()
177
+
178
+
179
def calculate_cost(
    model: str,
    prompt_tokens: int,
    completion_tokens: int,
    pricing: dict[str, float] | None = None,
) -> float:
    """Calculate cost for a model completion.

    Args:
        model: Model identifier
        prompt_tokens: Number of prompt tokens
        completion_tokens: Number of completion tokens
        pricing: Optional custom pricing (if None, uses default pricing table)

    Returns:
        Total cost in USD

    Example:
        >>> cost = calculate_cost("gpt-4", 1000, 500)
        >>> print(f"Cost: ${cost:.4f}")
        Cost: $0.0600
    """
    # Resolve rates from the pricing table only when no override is supplied.
    rates = get_provider_pricing(model) if pricing is None else pricing

    return (
        prompt_tokens * rates["prompt_tokens"]
        + completion_tokens * rates["completion_tokens"]
    )
208
+
209
+
210
def compare_provider_costs(
    prompt_tokens: int,
    completion_tokens: int,
    models: list[str],
) -> dict[str, float]:
    """Compare costs across multiple providers for same workload.

    Args:
        prompt_tokens: Number of prompt tokens
        completion_tokens: Number of completion tokens
        models: List of model identifiers to compare

    Returns:
        Dict mapping model names to costs

    Example:
        >>> costs = compare_provider_costs(
        ...     1000, 500, ["gpt-4", "gpt-3.5-turbo", "claude-3-haiku"]
        ... )
        >>> for model, cost in sorted(costs.items(), key=lambda x: x[1]):
        ...     print(f"{model}: ${cost:.4f}")
        claude-3-haiku: $0.0009
        gpt-3.5-turbo: $0.0013
        gpt-4: $0.0600
    """
    # One entry per requested model; duplicates collapse to a single key.
    return {
        name: calculate_cost(name, prompt_tokens, completion_tokens)
        for name in models
    }
239
+
240
+
241
def estimate_tokens(text: str, chars_per_token: float = 4.0) -> int:
    """Estimate number of tokens from text.

    This is a rough approximation. For accurate token counts,
    use the model's tokenizer.

    Args:
        text: Input text
        chars_per_token: Average characters per token (default: 4.0)

    Returns:
        Estimated token count (0 for empty text, otherwise at least 1)

    Example:
        >>> text = "This is a sample text for token estimation."
        >>> tokens = estimate_tokens(text)
        >>> print(f"Estimated tokens: {tokens}")
        Estimated tokens: 10
    """
    # Empty input estimates to zero tokens, bypassing the minimum-of-1 floor.
    if not text:
        return 0
    # Truncating division; clamp to 1 so any non-empty text counts as a token.
    # NOTE: the previous docstring example claimed 11 for the 43-char sample,
    # but int(43 / 4.0) == 10 — the example output was wrong, not the code.
    return max(1, int(len(text) / chars_per_token))
263
+
264
+
265
def get_all_models() -> list[str]:
    """Get list of all models with known pricing.

    Returns:
        List of model identifiers (the "default" fallback entry is excluded)
    """
    # Preserve the pricing table's insertion order; skip the sentinel key.
    return [name for name in PRICING_TABLE if name != "default"]
272
+
273
+
274
def get_pricing_summary() -> dict[str, Any]:
    """Get summary of pricing for all models.

    Returns:
        Dict with model pricing information

    Example:
        >>> summary = get_pricing_summary()
        >>> print(f"Total models: {summary['total_models']}")
        >>> print(f"Cheapest: {summary['cheapest_model']}")
    """
    models = get_all_models()

    def _avg_cost(name: str) -> float:
        # Mean of prompt and completion per-token prices for one model.
        rates = PRICING_TABLE[name]
        return (rates["prompt_tokens"] + rates["completion_tokens"]) / 2

    # Ties resolve to the model listed first, matching table order.
    cheapest_name = min(models, key=_avg_cost)
    priciest_name = max(models, key=_avg_cost)

    return {
        "total_models": len(models),
        "cheapest_model": cheapest_name,
        "cheapest_avg_cost_per_token": _avg_cost(cheapest_name),
        "most_expensive_model": priciest_name,
        "most_expensive_avg_cost_per_token": _avg_cost(priciest_name),
        "models": models,
    }
305
+
306
+
307
# Explicit public API: the names exported by `from ... import *` and the
# contract other themis modules should rely on.
__all__ = [
    "PRICING_TABLE",
    "MODEL_ALIASES",
    "normalize_model_name",
    "get_provider_pricing",
    "calculate_cost",
    "compare_provider_costs",
    "estimate_tokens",
    "get_all_models",
    "get_pricing_summary",
]