themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +93 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +164 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +288 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +129 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +690 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +373 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +255 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +61 -0
- themis/integrations/wandb.py +65 -0
- themis/interfaces/__init__.py +83 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
- themis_eval-0.1.1.dist-info/RECORD +134 -0
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/cli/commands/mcq_benchmarks.py
@@ -0,0 +1,207 @@
+"""Multiple-choice question benchmark commands."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Annotated, Callable, Literal, Sequence
+
+from cyclopts import Parameter
+
+from themis.cli.utils import effective_total, export_outputs
+from themis.datasets import (
+    mmlu_pro as mmlu_pro_dataset,
+)
+from themis.datasets import (
+    super_gpqa as super_gpqa_dataset,
+)
+from themis.experiment import mcq as mcq_experiment
+from themis.experiment import storage as experiment_storage
+from themis.utils.logging_utils import configure_logging
+from themis.utils.progress import ProgressReporter
+
+
+def load_multiple_choice_dataset(
+    *,
+    loader: Callable[..., Sequence],
+    source: Literal["huggingface", "local"],
+    data_dir: Path | None,
+    split: str,
+    limit: int | None,
+    subjects: Sequence[str] | None,
+):
+    """Load multiple choice dataset.
+
+    Args:
+        loader: Dataset loader function
+        source: Dataset source
+        data_dir: Directory containing local dataset
+        split: Dataset split
+        limit: Max rows to load
+        subjects: Subjects to filter
+
+    Returns:
+        List of generation examples
+    """
+    if source == "local" and data_dir is None:
+        raise ValueError(
+            "The --data-dir option is required when --source=local so Themis "
+            "knows where to read the dataset."
+        )
+    samples = loader(
+        source=source,
+        data_dir=data_dir,
+        split=split,
+        limit=limit,
+        subjects=subjects,
+    )
+    return [sample.to_generation_example() for sample in samples]
+
+
+def supergpqa_command(
+    *,
+    source: Annotated[
+        Literal["huggingface", "local"], Parameter(help="Dataset source")
+    ] = "huggingface",
+    split: Annotated[str, Parameter(help="Dataset split to load")] = "test",
+    data_dir: Annotated[
+        Path | None, Parameter(help="Directory containing local dataset")
+    ] = None,
+    limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
+    subjects: Annotated[
+        tuple[str, ...], Parameter(help="Subjects or categories to filter")
+    ] = (),
+    max_samples: Annotated[int | None, Parameter(help="Maximum samples to run")] = None,
+    storage: Annotated[
+        Path | None, Parameter(help="Cache directory for datasets/results")
+    ] = None,
+    run_id: Annotated[str | None, Parameter(help="Identifier for cached run")] = None,
+    resume: Annotated[
+        bool, Parameter(help="Reuse cached generations when storage is set")
+    ] = True,
+    temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
+    log_level: Annotated[
+        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
+    ] = "info",
+    csv_output: Annotated[
+        Path | None, Parameter(help="Write CSV export to this path")
+    ] = None,
+    html_output: Annotated[
+        Path | None, Parameter(help="Write HTML summary to this path")
+    ] = None,
+    json_output: Annotated[
+        Path | None, Parameter(help="Write JSON export to this path")
+    ] = None,
+) -> int:
+    """Run the SuperGPQA multiple-choice evaluation."""
+    configure_logging(log_level)
+    subject_filter = list(subjects) if subjects else None
+    rows = load_multiple_choice_dataset(
+        loader=super_gpqa_dataset.load_super_gpqa,
+        source=source,
+        data_dir=data_dir,
+        split=split,
+        limit=limit,
+        subjects=subject_filter,
+    )
+
+    storage_impl = experiment_storage.ExperimentStorage(storage) if storage else None
+    experiment = mcq_experiment.build_multiple_choice_json_experiment(
+        dataset_name="supergpqa",
+        task_id="supergpqa",
+        temperature=temperature,
+        storage=storage_impl,
+    )
+
+    total = effective_total(len(rows), max_samples)
+    with ProgressReporter(total=total, description="Generating") as progress:
+        report = experiment.run(
+            rows,
+            max_samples=max_samples,
+            run_id=run_id,
+            resume=resume,
+            on_result=progress.on_result,
+        )
+    print(mcq_experiment.summarize_report(report))
+    export_outputs(
+        report,
+        csv_output=csv_output,
+        html_output=html_output,
+        json_output=json_output,
+        title="supergpqa experiment",
+    )
+    return 0
+
+
+def mmlu_pro_command(
+    *,
+    source: Annotated[
+        Literal["huggingface", "local"], Parameter(help="Dataset source")
+    ] = "huggingface",
+    split: Annotated[str, Parameter(help="Dataset split to load")] = "test",
+    data_dir: Annotated[
+        Path | None, Parameter(help="Directory containing local dataset")
+    ] = None,
+    limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
+    subjects: Annotated[
+        tuple[str, ...], Parameter(help="Subjects or categories to filter")
+    ] = (),
+    max_samples: Annotated[int | None, Parameter(help="Maximum samples to run")] = None,
+    storage: Annotated[
+        Path | None, Parameter(help="Cache directory for datasets/results")
+    ] = None,
+    run_id: Annotated[str | None, Parameter(help="Identifier for cached run")] = None,
+    resume: Annotated[
+        bool, Parameter(help="Reuse cached generations when storage is set")
+    ] = True,
+    temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
+    log_level: Annotated[
+        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
+    ] = "info",
+    csv_output: Annotated[
+        Path | None, Parameter(help="Write CSV export to this path")
+    ] = None,
+    html_output: Annotated[
+        Path | None, Parameter(help="Write HTML summary to this path")
+    ] = None,
+    json_output: Annotated[
+        Path | None, Parameter(help="Write JSON export to this path")
+    ] = None,
+) -> int:
+    """Run the MMLU-Pro multiple-choice evaluation."""
+    configure_logging(log_level)
+    subject_filter = list(subjects) if subjects else None
+    rows = load_multiple_choice_dataset(
+        loader=mmlu_pro_dataset.load_mmlu_pro,
+        source=source,
+        data_dir=data_dir,
+        split=split,
+        limit=limit,
+        subjects=subject_filter,
+    )
+
+    storage_impl = experiment_storage.ExperimentStorage(storage) if storage else None
+    experiment = mcq_experiment.build_multiple_choice_json_experiment(
+        dataset_name="mmlu-pro",
+        task_id="mmlu_pro",
+        temperature=temperature,
+        storage=storage_impl,
+    )
+
+    total = effective_total(len(rows), max_samples)
+    with ProgressReporter(total=total, description="Generating") as progress:
+        report = experiment.run(
+            rows,
+            max_samples=max_samples,
+            run_id=run_id,
+            resume=resume,
+            on_result=progress.on_result,
+        )
+    print(mcq_experiment.summarize_report(report))
+    export_outputs(
+        report,
+        csv_output=csv_output,
+        html_output=html_output,
+        json_output=json_output,
+        title="mmlu_pro experiment",
+    )
+    return 0
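For orientation, a minimal invocation sketch for the two commands above. The subcommand names (supergpqa, mmlu-pro) are assumptions based on the function names; the actual registration lives in themis/cli/main.py, which is not shown in this hunk. By default cyclopts exposes the keyword-only parameters as kebab-case flags, so a run might look like:

    uv run python -m themis.cli supergpqa --split test --limit 100 --storage .cache/themis --csv-output supergpqa.csv
    uv run python -m themis.cli mmlu-pro --source local --data-dir data/mmlu_pro --max-samples 50 --json-output mmlu_pro.json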
themis/cli/commands/sample_run.py
@@ -0,0 +1,244 @@
+"""Sample run command for quick testing before full experiments."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Annotated
+
+from cyclopts import Parameter
+
+from themis.cli.commands.config_commands import run_configured_experiment
+
+
+def sample_run_command(
+    *,
+    config: Annotated[Path, Parameter(help="Path to experiment configuration file")],
+    n: Annotated[int, Parameter(help="Number of samples to test")] = 5,
+    verbose: Annotated[bool, Parameter(help="Show detailed output")] = False,
+    show_outputs: Annotated[
+        bool, Parameter(help="Display sample outputs and predictions")
+    ] = False,
+    estimate_cost: Annotated[
+        bool, Parameter(help="Estimate full run cost based on sample")
+    ] = True,
+) -> int:
+    """Quick test run on N samples before running full experiment.
+
+    This command helps you:
+    - Test your configuration works correctly
+    - Preview sample outputs before full run
+    - Estimate total cost based on actual token usage
+    - Catch configuration errors early
+    - Iterate on prompts quickly
+
+    Examples:
+        # Basic quick test
+        uv run python -m themis.cli sample-run \\
+            --config my_config.yaml \\
+            --n 5
+
+        # Test with verbose output
+        uv run python -m themis.cli sample-run \\
+            --config my_config.yaml \\
+            --n 3 \\
+            --verbose \\
+            --show-outputs
+
+        # Test and estimate full run cost
+        uv run python -m themis.cli sample-run \\
+            --config my_config.yaml \\
+            --n 10 \\
+            --estimate-cost
+    """
+    try:
+        import json
+        import tempfile
+
+        from hydra import compose, initialize_config_dir
+
+        # Load config
+        config_path = Path(config).resolve()
+        if not config_path.exists():
+            print(f"Error: Config file not found: {config_path}")
+            return 1
+
+        config_dir = str(config_path.parent)
+        config_name = config_path.stem
+
+        print("=" * 80)
+        print(f"🧪 Sample Run: Testing {n} samples")
+        print("=" * 80)
+        print(f"Config: {config_path}")
+        print(f"Samples: {n}")
+        print()
+
+        # Initialize Hydra
+        with initialize_config_dir(config_dir=config_dir, version_base=None):
+            cfg = compose(config_name=config_name)
+
+            # Override dataset limit
+            original_limit = cfg.dataset.get("limit")
+            cfg.dataset.limit = n
+
+            # Use temporary storage
+            with tempfile.TemporaryDirectory() as temp_dir:
+                cfg.storage.path = temp_dir
+
+                # Generate temporary run_id
+                cfg.run_id = "sample-run-temp"
+                cfg.resume = False
+
+                print("📋 Configuration:")
+                print(f"  Model: {cfg.generation.model_identifier}")
+                print(f"  Provider: {cfg.generation.provider.name}")
+                print(f"  Temperature: {cfg.generation.sampling.temperature}")
+                print(f"  Max tokens: {cfg.generation.sampling.max_tokens}")
+                if hasattr(cfg.dataset, "source"):
+                    print(f"  Dataset: {cfg.dataset.source}")
+                print()
+
+                # Run experiment on sample
+                print("🚀 Running sample experiment...")
+                print()
+
+                # Redirect to capture run
+                result = run_configured_experiment(
+                    config_path=config_path,
+                    overrides=[
+                        f"dataset.limit={n}",
+                        f"storage.path={temp_dir}",
+                        "run_id=sample-run-temp",
+                        "resume=false",
+                    ],
+                )
+
+                if result != 0:
+                    print("\n❌ Sample run failed")
+                    return result
+
+                # Load results
+                report_path = Path(temp_dir) / "sample-run-temp" / "report.json"
+                if not report_path.exists():
+                    print("\n⚠️ No report generated")
+                    return 1
+
+                with report_path.open("r") as f:
+                    report_data = json.load(f)
+
+                # Display results
+                print("\n" + "=" * 80)
+                print("✅ Sample Run Complete")
+                print("=" * 80)
+
+                # Metrics
+                metrics = report_data.get("metrics", [])
+                if metrics:
+                    print("\n📊 Metrics:")
+                    for metric in metrics:
+                        name = metric["name"]
+                        mean = metric["mean"]
+                        count = metric["count"]
+                        print(f"  {name}: {mean:.4f} (n={count})")
+
+                # Cost analysis
+                cost_data = report_data.get("summary", {}).get("cost")
+                if cost_data:
+                    total_cost = cost_data.get("total_cost", 0)
+                    token_counts = cost_data.get("token_counts", {})
+                    prompt_tokens = token_counts.get("prompt_tokens", 0)
+                    completion_tokens = token_counts.get("completion_tokens", 0)
+
+                    print("\n💰 Cost (sample run):")
+                    print(f"  Total: ${total_cost:.4f}")
+                    print(f"  Per sample: ${total_cost / n:.6f}")
+                    print(
+                        f"  Prompt tokens: {prompt_tokens} ({prompt_tokens / n:.0f} avg)"
+                    )
+                    print(
+                        f"  Completion tokens: {completion_tokens} ({completion_tokens / n:.0f} avg)"
+                    )
+
+                    # Estimate full run cost
+                    if estimate_cost and original_limit:
+                        full_cost = (total_cost / n) * original_limit
+                        print("\n📈 Estimated full run cost:")
+                        print(f"  Dataset size: {original_limit} samples")
+                        print(f"  Estimated cost: ${full_cost:.2f}")
+                        print(
+                            f"  95% CI: ${full_cost * 0.8:.2f} - ${full_cost * 1.2:.2f}"
+                        )
+
+                        if full_cost > 10.0:
+                            print(f"\n⚠️ Warning: Estimated cost is ${full_cost:.2f}")
+                            print("  Consider using --limit for initial testing")
+
+                # Failures
+                failures = report_data.get("run_failures", [])
+                eval_failures = report_data.get("evaluation_failures", [])
+                total_failures = len(failures) + len(eval_failures)
+
+                if total_failures > 0:
+                    print(f"\n⚠️ Failures: {total_failures}")
+                    if failures:
+                        print(f"  Generation failures: {len(failures)}")
+                        if verbose:
+                            for failure in failures[:3]:
+                                print(
+                                    f"    - {failure.get('sample_id')}: {failure.get('message')}"
+                                )
+                    if eval_failures:
+                        print(f"  Evaluation failures: {len(eval_failures)}")
+
+                # Show sample outputs
+                if show_outputs:
+                    samples = report_data.get("samples", [])
+                    print("\n📝 Sample Outputs (showing up to 3):")
+                    for i, sample in enumerate(samples[:3], 1):
+                        sample_id = sample.get("sample_id", f"sample-{i}")
+                        scores = sample.get("scores", [])
+
+                        print(f"\n  Sample {i}: {sample_id}")
+                        if scores:
+                            for score in scores:
+                                metric_name = score.get("metric")
+                                value = score.get("value")
+                                print(f"    {metric_name}: {value:.4f}")
+
+                # Summary
+                print("\n" + "=" * 80)
+                print("✨ Next Steps:")
+                print("=" * 80)
+
+                if total_failures == 0 and metrics:
+                    avg_metric = metrics[0]["mean"]
+                    if avg_metric > 0.1:  # Reasonable performance
+                        print("  ✅ Configuration looks good!")
+                        print("  Run full experiment with:")
+                        print(
+                            f"    uv run python -m themis.cli run-config --config {config_path}"
+                        )
+                    else:
+                        print("  ⚠️ Low performance on sample - consider:")
+                        print("    - Adjusting prompt template")
+                        print("    - Tuning temperature/max_tokens")
+                        print("    - Testing different model")
+                else:
+                    print("  ⚠️ Issues detected:")
+                    if total_failures > 0:
+                        print("    - Fix failures before full run")
+                    if not metrics:
+                        print("    - Check evaluation metrics")
+                    print("    - Review configuration")
+
+                return 0
+
+    except Exception as e:
+        print(f"\n❌ Error: {e}")
+        import traceback
+
+        if verbose:
+            traceback.print_exc()
+        return 1
+
+
+__all__ = ["sample_run_command"]
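As a worked example of the cost extrapolation above (hypothetical numbers): if the n=5 sample run reports a total_cost of $0.010 and the config's original dataset.limit is 1,000, the estimated full-run cost is (0.010 / 5) × 1000 = $2.00, and the range printed as "95% CI" is the fixed ±20% band $1.60 – $2.40. Note that the estimate is only produced when the config sets an explicit dataset.limit, since original_limit is read from the config before it is overridden to n.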