PyPI - themis-eval - Versions diffs - 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (158) hide show

themis/__init__.py +12 -1
themis/_version.py +2 -2
themis/api.py +343 -0
themis/backends/__init__.py +17 -0
themis/backends/execution.py +197 -0
themis/backends/storage.py +260 -0
themis/cli/__init__.py +5 -0
themis/cli/__main__.py +6 -0
themis/cli/commands/__init__.py +19 -0
themis/cli/commands/benchmarks.py +221 -0
themis/cli/commands/comparison.py +394 -0
themis/cli/commands/config_commands.py +244 -0
themis/cli/commands/cost.py +214 -0
themis/cli/commands/demo.py +68 -0
themis/cli/commands/info.py +90 -0
themis/cli/commands/leaderboard.py +362 -0
themis/cli/commands/math_benchmarks.py +318 -0
themis/cli/commands/mcq_benchmarks.py +207 -0
themis/cli/commands/results.py +252 -0
themis/cli/commands/sample_run.py +244 -0
themis/cli/commands/visualize.py +299 -0
themis/cli/main.py +463 -0
themis/cli/new_project.py +33 -0
themis/cli/utils.py +51 -0
themis/comparison/__init__.py +25 -0
themis/comparison/engine.py +348 -0
themis/comparison/reports.py +283 -0
themis/comparison/statistics.py +402 -0
themis/config/__init__.py +19 -0
themis/config/loader.py +27 -0
themis/config/registry.py +34 -0
themis/config/runtime.py +214 -0
themis/config/schema.py +112 -0
themis/core/__init__.py +5 -0
themis/core/conversation.py +354 -0
themis/core/entities.py +184 -0
themis/core/serialization.py +231 -0
themis/core/tools.py +393 -0
themis/core/types.py +141 -0
themis/datasets/__init__.py +273 -0
themis/datasets/base.py +264 -0
themis/datasets/commonsense_qa.py +174 -0
themis/datasets/competition_math.py +265 -0
themis/datasets/coqa.py +133 -0
themis/datasets/gpqa.py +190 -0
themis/datasets/gsm8k.py +123 -0
themis/datasets/gsm_symbolic.py +124 -0
themis/datasets/math500.py +122 -0
themis/datasets/med_qa.py +179 -0
themis/datasets/medmcqa.py +169 -0
themis/datasets/mmlu_pro.py +262 -0
themis/datasets/piqa.py +146 -0
themis/datasets/registry.py +201 -0
themis/datasets/schema.py +245 -0
themis/datasets/sciq.py +150 -0
themis/datasets/social_i_qa.py +151 -0
themis/datasets/super_gpqa.py +263 -0
themis/evaluation/__init__.py +1 -0
themis/evaluation/conditional.py +410 -0
themis/evaluation/extractors/__init__.py +19 -0
themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
themis/evaluation/extractors/exceptions.py +7 -0
themis/evaluation/extractors/identity_extractor.py +29 -0
themis/evaluation/extractors/json_field_extractor.py +45 -0
themis/evaluation/extractors/math_verify_extractor.py +37 -0
themis/evaluation/extractors/regex_extractor.py +43 -0
themis/evaluation/math_verify_utils.py +87 -0
themis/evaluation/metrics/__init__.py +21 -0
themis/evaluation/metrics/code/__init__.py +19 -0
themis/evaluation/metrics/code/codebleu.py +144 -0
themis/evaluation/metrics/code/execution.py +280 -0
themis/evaluation/metrics/code/pass_at_k.py +181 -0
themis/evaluation/metrics/composite_metric.py +47 -0
themis/evaluation/metrics/consistency_metric.py +80 -0
themis/evaluation/metrics/exact_match.py +51 -0
themis/evaluation/metrics/length_difference_tolerance.py +33 -0
themis/evaluation/metrics/math_verify_accuracy.py +40 -0
themis/evaluation/metrics/nlp/__init__.py +21 -0
themis/evaluation/metrics/nlp/bertscore.py +138 -0
themis/evaluation/metrics/nlp/bleu.py +129 -0
themis/evaluation/metrics/nlp/meteor.py +153 -0
themis/evaluation/metrics/nlp/rouge.py +136 -0
themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
themis/evaluation/metrics/response_length.py +33 -0
themis/evaluation/metrics/rubric_judge_metric.py +134 -0
themis/evaluation/pipeline.py +49 -0
themis/evaluation/pipelines/__init__.py +15 -0
themis/evaluation/pipelines/composable_pipeline.py +357 -0
themis/evaluation/pipelines/standard_pipeline.py +348 -0
themis/evaluation/reports.py +293 -0
themis/evaluation/statistics/__init__.py +53 -0
themis/evaluation/statistics/bootstrap.py +79 -0
themis/evaluation/statistics/confidence_intervals.py +121 -0
themis/evaluation/statistics/distributions.py +207 -0
themis/evaluation/statistics/effect_sizes.py +124 -0
themis/evaluation/statistics/hypothesis_tests.py +305 -0
themis/evaluation/statistics/types.py +139 -0
themis/evaluation/strategies/__init__.py +13 -0
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
themis/evaluation/strategies/evaluation_strategy.py +24 -0
themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
themis/experiment/__init__.py +5 -0
themis/experiment/builder.py +151 -0
themis/experiment/cache_manager.py +134 -0
themis/experiment/comparison.py +631 -0
themis/experiment/cost.py +310 -0
themis/experiment/definitions.py +62 -0
themis/experiment/export.py +798 -0
themis/experiment/export_csv.py +159 -0
themis/experiment/integration_manager.py +104 -0
themis/experiment/math.py +192 -0
themis/experiment/mcq.py +169 -0
themis/experiment/orchestrator.py +415 -0
themis/experiment/pricing.py +317 -0
themis/experiment/storage.py +1458 -0
themis/experiment/visualization.py +588 -0
themis/generation/__init__.py +1 -0
themis/generation/agentic_runner.py +420 -0
themis/generation/batching.py +254 -0
themis/generation/clients.py +143 -0
themis/generation/conversation_runner.py +236 -0
themis/generation/plan.py +456 -0
themis/generation/providers/litellm_provider.py +221 -0
themis/generation/providers/vllm_provider.py +135 -0
themis/generation/router.py +34 -0
themis/generation/runner.py +207 -0
themis/generation/strategies.py +98 -0
themis/generation/templates.py +71 -0
themis/generation/turn_strategies.py +393 -0
themis/generation/types.py +9 -0
themis/integrations/__init__.py +0 -0
themis/integrations/huggingface.py +72 -0
themis/integrations/wandb.py +77 -0
themis/interfaces/__init__.py +169 -0
themis/presets/__init__.py +10 -0
themis/presets/benchmarks.py +354 -0
themis/presets/models.py +190 -0
themis/project/__init__.py +20 -0
themis/project/definitions.py +98 -0
themis/project/patterns.py +230 -0
themis/providers/__init__.py +5 -0
themis/providers/registry.py +39 -0
themis/server/__init__.py +28 -0
themis/server/app.py +337 -0
themis/utils/api_generator.py +379 -0
themis/utils/cost_tracking.py +376 -0
themis/utils/dashboard.py +452 -0
themis/utils/logging_utils.py +41 -0
themis/utils/progress.py +58 -0
themis/utils/tracing.py +320 -0
themis_eval-0.2.0.dist-info/METADATA +596 -0
themis_eval-0.2.0.dist-info/RECORD +157 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
themis_eval-0.1.0.dist-info/METADATA +0 -758
themis_eval-0.1.0.dist-info/RECORD +0 -8
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0

themis/cli/commands/cost.py ADDED Viewed

@@ -0,0 +1,214 @@
+"""Cost estimation and tracking commands."""
+from __future__ import annotations
+from typing import Annotated
+from cyclopts import Parameter
+from themis.experiment.cost import estimate_experiment_cost
+from themis.experiment.pricing import (
+    compare_provider_costs,
+    get_all_models,
+    get_provider_pricing,
+)
+def estimate_cost_command(
+    *,
+    model: Annotated[
+        str, Parameter(help="Model identifier (e.g., gpt-4, claude-3-5-sonnet)")
+    ],
+    dataset_size: Annotated[int, Parameter(help="Number of samples in dataset")],
+    avg_prompt_tokens: Annotated[
+        int, Parameter(help="Average prompt tokens per sample")
+    ] = 500,
+    avg_completion_tokens: Annotated[
+        int, Parameter(help="Average completion tokens per sample")
+    ] = 300,
+) -> int:
+    """Estimate cost for an experiment before running.
+    Examples:
+        # Estimate cost for 100 samples with GPT-4
+        uv run python -m themis.cli estimate-cost \\
+          --model gpt-4 \\
+          --dataset-size 100
+        # Custom token estimates
+        uv run python -m themis.cli estimate-cost \\
+          --model claude-3-5-sonnet-20241022 \\
+          --dataset-size 1000 \\
+          --avg-prompt-tokens 800 \\
+          --avg-completion-tokens 400
+    """
+    try:
+        estimate = estimate_experiment_cost(
+            model=model,
+            dataset_size=dataset_size,
+            avg_prompt_tokens=avg_prompt_tokens,
+            avg_completion_tokens=avg_completion_tokens,
+        )
+        print("=" * 80)
+        print("Cost Estimate")
+        print("=" * 80)
+        print(f"\nModel: {model}")
+        print(f"Dataset size: {dataset_size} samples")
+        print(
+            f"Avg tokens per sample: {avg_prompt_tokens} prompt + {avg_completion_tokens} completion"
+        )
+        print("\n💰 Estimated Cost")
+        print(f"  Total: ${estimate.estimated_cost:.4f}")
+        print(f"  Per sample: ${estimate.assumptions['cost_per_sample']:.6f}")
+        print(f"  95% CI: ${estimate.lower_bound:.4f} - ${estimate.upper_bound:.4f}")
+        print("\n📊 Breakdown")
+        for phase, cost in estimate.breakdown_by_phase.items():
+            print(f"  {phase.capitalize()}: ${cost:.4f}")
+        print("\n" + "=" * 80)
+        # Warning if cost is high
+        if estimate.estimated_cost > 10.0:
+            print(
+                f"\n⚠️  Warning: Estimated cost is ${estimate.estimated_cost:.2f}. "
+                "Consider using --limit for initial testing."
+            )
+        return 0
+    except Exception as e:
+        print(f"Error estimating cost: {e}")
+        return 1
+def show_pricing_command(
+    *,
+    model: Annotated[
+        str | None, Parameter(help="Show pricing for specific model")
+    ] = None,
+    list_all: Annotated[bool, Parameter(help="List all available models")] = False,
+    compare_models: Annotated[
+        list[str] | None, Parameter(help="Compare costs for multiple models")
+    ] = None,
+) -> int:
+    """Show pricing information for LLM models.
+    Examples:
+        # Show pricing for a specific model
+        uv run python -m themis.cli show-pricing --model gpt-4
+        # List all models with pricing
+        uv run python -m themis.cli show-pricing --list-all
+        # Compare pricing across models (use repeated --compare-models flags)
+        uv run python -m themis.cli show-pricing \\
+          --compare-models gpt-4 \\
+          --compare-models gpt-3.5-turbo \\
+          --compare-models claude-3-haiku-20240307
+    """
+    try:
+        if list_all:
+            models = get_all_models()
+            print("=" * 80)
+            print(f"Available Models ({len(models)} total)")
+            print("=" * 80)
+            print("\nModel pricing (per 1M tokens):\n")
+            for model_name in sorted(models):
+                pricing = get_provider_pricing(model_name)
+                prompt_price = pricing["prompt_tokens"] * 1_000_000
+                completion_price = pricing["completion_tokens"] * 1_000_000
+                print(
+                    f"  {model_name:40s} | "
+                    f"Prompt: ${prompt_price:6.2f} | "
+                    f"Completion: ${completion_price:6.2f}"
+                )
+            print("\n" + "=" * 80)
+            return 0
+        if compare_models:
+            # Compare costs for standard workload
+            prompt_tokens = 1000
+            completion_tokens = 500
+            costs = compare_provider_costs(
+                prompt_tokens, completion_tokens, compare_models
+            )
+            print("=" * 80)
+            print(
+                f"Cost Comparison ({prompt_tokens} prompt + {completion_tokens} completion tokens)"
+            )
+            print("=" * 80)
+            print()
+            # Sort by cost
+            sorted_costs = sorted(costs.items(), key=lambda x: x[1])
+            for model_name, cost in sorted_costs:
+                # Calculate cost per 1M tokens for comparison
+                pricing = get_provider_pricing(model_name)
+                prompt_price = pricing["prompt_tokens"] * 1_000_000
+                completion_price = pricing["completion_tokens"] * 1_000_000
+                print(f"  {model_name:40s} | ${cost:.6f}")
+                print(
+                    f"    {'':40s} | (${prompt_price:.2f} / ${completion_price:.2f} per 1M)"
+                )
+            # Show relative costs
+            if sorted_costs:
+                cheapest_cost = sorted_costs[0][1]
+                print(f"\nRelative costs (vs {sorted_costs[0][0]}):")
+                for model_name, cost in sorted_costs[1:]:
+                    multiplier = cost / cheapest_cost if cheapest_cost > 0 else 0
+                    print(f"  {model_name:40s} | {multiplier:.1f}x more expensive")
+            print("\n" + "=" * 80)
+            return 0
+        if model:
+            pricing = get_provider_pricing(model)
+            prompt_price = pricing["prompt_tokens"] * 1_000_000
+            completion_price = pricing["completion_tokens"] * 1_000_000
+            print("=" * 80)
+            print(f"Pricing for {model}")
+            print("=" * 80)
+            print(f"\nPrompt tokens: ${prompt_price:.2f} per 1M tokens")
+            print(f"Completion tokens: ${completion_price:.2f} per 1M tokens")
+            # Show example costs
+            print("\nExample costs:")
+            examples = [
+                (100, 50, "Short query"),
+                (500, 300, "Medium query"),
+                (1000, 500, "Long query"),
+            ]
+            for prompt_tok, completion_tok, label in examples:
+                from themis.experiment.pricing import calculate_cost
+                cost = calculate_cost(model, prompt_tok, completion_tok)
+                print(
+                    f"  {label:15s} ({prompt_tok:4d} + {completion_tok:4d} tokens): ${cost:.6f}"
+                )
+            print("\n" + "=" * 80)
+            return 0
+        # No options provided
+        print("Error: Must specify --model, --list-all, or --compare-models")
+        print("Use --help for usage information")
+        return 1
+    except Exception as e:
+        print(f"Error: {e}")
+        return 1
+# Names exported by ``from themis.cli.commands.cost import *``.
+__all__ = ["estimate_cost_command", "show_pricing_command"]

themis/cli/commands/demo.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""Demo command implementation."""
+from __future__ import annotations
+from pathlib import Path
+from typing import Annotated
+from cyclopts import Parameter
+from themis.cli.utils import effective_total, export_outputs
+from themis.experiment import math as math_experiment
+from themis.utils.logging_utils import configure_logging
+from themis.utils.progress import ProgressReporter
+def demo_command(
+    *,
+    max_samples: Annotated[
+        int | None, Parameter(help="Limit number of demo samples")
+    ] = None,
+    log_level: Annotated[
+        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
+    ] = "info",
+    csv_output: Annotated[
+        Path | None, Parameter(help="Write CSV export to this path")
+    ] = None,
+    html_output: Annotated[
+        Path | None, Parameter(help="Write HTML summary to this path")
+    ] = None,
+    json_output: Annotated[
+        Path | None, Parameter(help="Write JSON export to this path")
+    ] = None,
+) -> int:
+    """Run the built-in demo dataset."""
+    configure_logging(log_level)
+    dataset = [
+        {
+            "unique_id": "demo-1",
+            "problem": "Convert the point (0,3) in rectangular coordinates to polar coordinates.",
+            "answer": "\\left( 3, \\frac{\\pi}{2} \\right)",
+            "subject": "precalculus",
+            "level": 2,
+        },
+        {
+            "unique_id": "demo-2",
+            "problem": "What is 7 + 5?",
+            "answer": "12",
+            "subject": "arithmetic",
+            "level": 1,
+        },
+    ]
+    experiment = math_experiment.build_math500_zero_shot_experiment()
+    total = effective_total(len(dataset), max_samples)
+    with ProgressReporter(total=total, description="Generating") as progress:
+        report = experiment.run(
+            dataset,
+            max_samples=max_samples,
+            on_result=progress.on_result,
+        )
+    print(math_experiment.summarize_report(report))
+    export_outputs(
+        report,
+        csv_output=csv_output,
+        html_output=html_output,
+        json_output=json_output,
+        title="Demo experiment",
+    )
+    return 0

themis/cli/commands/info.py ADDED Viewed

@@ -0,0 +1,90 @@
+"""System information and listing commands."""
+from __future__ import annotations
+import sys
+from pathlib import Path
+from typing import Annotated
+from cyclopts import Parameter
+from themis.providers.registry import _REGISTRY
+def show_info() -> int:
+    """Show system information and installed components."""
+    import themis
+    from themis import _version
+    print("Themis Information")
+    print("=" * 60)
+    print(f"Version: {getattr(_version, '__version__', 'unknown')}")
+    print(f"Python: {sys.version.split()[0]}")
+    print(f"Platform: {sys.platform}")
+    print("\n📦 Installed Providers:")
+    providers = sorted(_REGISTRY._factories.keys())
+    for provider in providers:
+        print(f"  ✓ {provider}")
+    print("\n📊 Available Benchmarks:")
+    benchmarks = [
+        "demo",
+        "math500",
+        "aime24",
+        "aime25",
+        "amc23",
+        "olympiadbench",
+        "beyondaime",
+        "supergpqa",
+        "mmlu-pro",
+        "inline (via config)",
+    ]
+    for bench in benchmarks:
+        print(f"  ✓ {bench}")
+    print("\n📁 Example Locations:")
+    examples_dir = Path(themis.__file__).parent.parent / "examples"
+    if examples_dir.exists():
+        print(f"  {examples_dir}")
+        example_dirs = sorted(
+            [
+                d.name
+                for d in examples_dir.iterdir()
+                if d.is_dir() and not d.name.startswith("_")
+            ]
+        )
+        for ex in example_dirs:
+            print(f"    • {ex}/")
+    print("\n📚 Documentation:")
+    print("  examples/README.md - Comprehensive tutorial cookbook")
+    print("  COOKBOOK.md - Quick reference guide")
+    print("  docs/ - Detailed documentation")
+    print("\n🚀 Quick Start:")
+    print("  uv run python -m themis.cli demo")
+    print("  uv run python -m themis.cli list-providers")
+    print("  uv run python -m themis.cli list-benchmarks")
+    return 0
+def new_project(
+    *,
+    project_name: Annotated[str, Parameter(help="The name of the new project")],
+    project_path: Annotated[
+        Path,
+        Parameter(help="The path where the new project will be created"),
+    ] = Path("."),
+) -> int:
+    """Create a new Themis project."""
+    from themis.cli.new_project import create_project
+    try:
+        create_project(project_name, project_path)
+        print(f"Successfully created new project '{project_name}' in {project_path}")
+        return 0
+    except FileExistsError as e:
+        print(f"Error: {e}")
+        return 1

themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl