themis-eval 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +463 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +184 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +348 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +134 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +798 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +415 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +1458 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +72 -0
- themis/integrations/wandb.py +77 -0
- themis/interfaces/__init__.py +169 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- themis_eval-0.2.0.dist-info/RECORD +157 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.0.dist-info/METADATA +0 -758
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/experiment/orchestrator.py
@@ -0,0 +1,415 @@
+"""Experiment orchestrator primitives."""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Callable, Sequence
+
+from themis.config.schema import IntegrationsConfig
+from themis.core.entities import (
+    EvaluationRecord,
+    ExperimentFailure,
+    ExperimentReport,
+    GenerationRecord,
+    GenerationTask,
+    MetricScore,
+)
+from themis.evaluation import pipeline as evaluation_pipeline
+from themis.evaluation.reports import EvaluationFailure
+from themis.experiment import storage as experiment_storage
+from themis.experiment.cache_manager import CacheManager
+from themis.experiment.cost import CostTracker
+from themis.experiment.integration_manager import IntegrationManager
+from themis.experiment.pricing import calculate_cost, get_provider_pricing
+from themis.generation import plan as generation_plan
+from themis.generation import runner as generation_runner
+
+
+class ExperimentOrchestrator:
+    """Orchestrates experiment execution: generation → evaluation → reporting.
+
+    This class coordinates the experiment workflow using focused managers:
+    - CacheManager: Handles storage and resumability
+    - IntegrationManager: Handles WandB and HuggingFace Hub
+
+    Single Responsibility: Orchestration of experiment flow
+    """
+
+    def __init__(
+        self,
+        *,
+        generation_plan: generation_plan.GenerationPlan,
+        generation_runner: generation_runner.GenerationRunner,
+        evaluation_pipeline: evaluation_pipeline.EvaluationPipeline,
+        storage: experiment_storage.ExperimentStorage | None = None,
+        integrations_config: IntegrationsConfig | None = None,
+        cache_manager: CacheManager | None = None,
+        integration_manager: IntegrationManager | None = None,
+    ) -> None:
+        """Initialize experiment orchestrator.
+
+        Args:
+            generation_plan: Plan for expanding dataset into tasks
+            generation_runner: Runner for executing generation tasks
+            evaluation_pipeline: Pipeline for evaluating outputs
+            storage: Optional storage backend (deprecated, use cache_manager)
+            integrations_config: Integration config (deprecated, use integration_manager)
+            cache_manager: Manager for caching and resumability
+            integration_manager: Manager for external integrations
+        """
+        self._plan = generation_plan
+        self._runner = generation_runner
+        self._evaluation = evaluation_pipeline
+
+        # Support both new managers and legacy direct parameters for backward compatibility
+        self._cache = cache_manager or CacheManager(
+            storage=storage,
+            enable_resume=True,
+            enable_cache=True,
+        )
+        self._integrations = integration_manager or IntegrationManager(
+            config=integrations_config or IntegrationsConfig()
+        )
+
+        # Initialize cost tracker
+        self._cost_tracker = CostTracker()
+
+        # Keep legacy references for backward compatibility
+        self._storage = storage
+
+    def run(
+        self,
+        dataset: Sequence[dict[str, object]] | None = None,
+        *,
+        dataset_loader: Callable[[], Sequence[dict[str, object]]] | None = None,
+        max_samples: int | None = None,
+        run_id: str | None = None,
+        resume: bool = True,
+        cache_results: bool = True,
+        on_result: Callable[[GenerationRecord], None] | None = None,
+    ) -> ExperimentReport:
+        """Run experiment: generate responses, evaluate, and report results.
+
+        Args:
+            dataset: Optional dataset samples to use
+            dataset_loader: Optional callable to load dataset
+            max_samples: Optional limit on number of samples
+            run_id: Optional run identifier for caching
+            resume: Whether to resume from cached results
+            cache_results: Whether to cache new results
+            on_result: Optional callback for each generation result
+
+        Returns:
+            ExperimentReport with generation results, evaluation, and metadata
+        """
+        # Initialize integrations
+        self._integrations.initialize_run(
+            {
+                "max_samples": max_samples,
+                "run_id": run_id,
+                "resume": resume,
+            }
+        )
+
+        # Prepare dataset
+        dataset_list = self._resolve_dataset(
+            dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+        )
+        selected_dataset = (
+            dataset_list[:max_samples] if max_samples is not None else dataset_list
+        )
+        run_identifier = run_id or self._default_run_id()
+
+        # Initialize run in storage (if storage exists and run doesn't exist)
+        if self._cache.has_storage:
+            if not resume or not self._cache._storage._run_metadata_exists(run_identifier):
+                self._cache._storage.start_run(run_identifier, experiment_id="default")
+
+        # Cache dataset for resumability
+        if dataset_list:
+            self._cache.cache_dataset(run_identifier, dataset_list)
+
+        # Expand dataset into generation tasks
+        tasks = list(self._plan.expand(selected_dataset))
+
+        # Build evaluation configuration for cache invalidation
+        evaluation_config = self._build_evaluation_config()
+
+        # Load cached results if resuming
+        cached_records = (
+            self._cache.load_cached_records(run_identifier) if resume else {}
+        )
+        cached_evaluations = (
+            self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
+        )
+
+        # Process tasks: use cached or run new generations
+        generation_results: list[GenerationRecord] = []
+        failures: list[ExperimentFailure] = []
+        pending_tasks: list[GenerationTask] = []
+        pending_records: list[GenerationRecord] = []
+        pending_keys: list[str] = []
+        cached_eval_records: list[EvaluationRecord] = []
+
+        for task in tasks:
+            task_cache_key = experiment_storage.task_cache_key(task)
+            cached = cached_records.get(task_cache_key)
+            if cached is not None:
+                generation_results.append(cached)
+                if cached.error:
+                    failures.append(
+                        ExperimentFailure(
+                            sample_id=cached.task.metadata.get("dataset_id"),
+                            message=cached.error.message,
+                        )
+                    )
+                # Use evaluation_cache_key that includes evaluation config
+                eval_cache_key = experiment_storage.evaluation_cache_key(task, evaluation_config)
+                evaluation = cached_evaluations.get(eval_cache_key)
+                if evaluation is not None:
+                    cached_eval_records.append(evaluation)
+                else:
+                    pending_records.append(cached)
+                    pending_keys.append(eval_cache_key)
+                if on_result:
+                    on_result(cached)
+            else:
+                pending_tasks.append(task)
+
+        # Run pending generation tasks
+        if pending_tasks:
+            for record in self._runner.run(pending_tasks):
+                generation_results.append(record)
+
+                # Track cost for successful generations
+                if record.output and record.output.usage:
+                    usage = record.output.usage
+                    prompt_tokens = usage.get("prompt_tokens", 0)
+                    completion_tokens = usage.get("completion_tokens", 0)
+                    model = record.task.model.identifier
+
+                    # Calculate cost using pricing database
+                    cost = calculate_cost(model, prompt_tokens, completion_tokens)
+                    self._cost_tracker.record_generation(
+                        model=model,
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        cost=cost,
+                    )
+
+                if record.error:
+                    failures.append(
+                        ExperimentFailure(
+                            sample_id=record.task.metadata.get("dataset_id"),
+                            message=record.error.message,
+                        )
+                    )
+                cache_key = experiment_storage.task_cache_key(record.task)
+                if cache_results:
+                    self._cache.save_generation_record(
+                        run_identifier, record, cache_key
+                    )
+                pending_records.append(record)
+                pending_keys.append(cache_key)
+                if on_result:
+                    on_result(record)
+
+        # Evaluate pending records
+        if pending_records:
+            new_evaluation_report = self._evaluation.evaluate(pending_records)
+        else:
+            new_evaluation_report = evaluation_pipeline.EvaluationReport(
+                metrics={}, failures=[], records=[]
+            )
+
+        # Cache evaluation results
+        for record, evaluation in zip(pending_records, new_evaluation_report.records):
+            self._cache.save_evaluation_record(
+                run_identifier, record, evaluation, evaluation_config
+            )
+
+        # Combine cached and new evaluations
+        evaluation_report = self._combine_evaluations(
+            cached_eval_records, new_evaluation_report
+        )
+
+        # Get cost breakdown
+        cost_breakdown = self._cost_tracker.get_breakdown()
+
+        # Build metadata
+        metadata = {
+            "total_samples": len(selected_dataset),
+            "successful_generations": sum(
+                1 for result in generation_results if not result.error
+            ),
+            "failed_generations": sum(
+                1 for result in generation_results if result.error
+            ),
+            "run_id": run_identifier,
+            "evaluation_failures": sum(
+                1 for record in evaluation_report.records if record.failures
+            )
+            + len(evaluation_report.failures),
+            # Cost tracking
+            "cost": {
+                "total_cost": cost_breakdown.total_cost,
+                "generation_cost": cost_breakdown.generation_cost,
+                "evaluation_cost": cost_breakdown.evaluation_cost,
+                "currency": cost_breakdown.currency,
+                "token_counts": cost_breakdown.token_counts,
+                "api_calls": cost_breakdown.api_calls,
+                "per_model_costs": cost_breakdown.per_model_costs,
+            },
+        }
+
+        # Create final report
+        report = ExperimentReport(
+            generation_results=generation_results,
+            evaluation_report=evaluation_report,
+            failures=failures,
+            metadata=metadata,
+        )
+
+        # Log to integrations
+        self._integrations.log_results(report)
+
+        # Upload to HuggingFace Hub if enabled
+        run_path = self._cache.get_run_path(run_identifier)
+        self._integrations.upload_results(report, run_path)
+
+        # Save report.json for multi-experiment comparison
+        if cache_results:
+            self._save_report_json(report, run_identifier)
+
+        return report
+
+    def _default_run_id(self) -> str:
+        return datetime.now(timezone.utc).strftime("run-%Y%m%d-%H%M%S")
+
+    def _build_evaluation_config(self) -> dict:
+        """Build evaluation configuration for cache key generation.
+
+        This configuration includes all evaluation settings that affect results,
+        so changing metrics or extractors will invalidate the cache.
+
+        Returns:
+            Dictionary with evaluation configuration
+        """
+        config = {}
+
+        # Add metric names/types
+        if hasattr(self._evaluation, "_metrics"):
+            config["metrics"] = sorted([
+                f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+                for metric in self._evaluation._metrics
+            ])
+
+        # Add extractor type
+        if hasattr(self._evaluation, "_extractor"):
+            extractor = self._evaluation._extractor
+            extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+            config["extractor"] = extractor_type
+
+            # Include extractor-specific configuration if available
+            if hasattr(extractor, "field_name"):
+                config["extractor_field"] = extractor.field_name
+
+        return config
+
+    def _resolve_dataset(
+        self,
+        *,
+        dataset: Sequence[dict[str, object]] | None,
+        dataset_loader: Callable[[], Sequence[dict[str, object]]] | None,
+        run_id: str | None,
+    ) -> list[dict[str, object]]:
+        """Resolve dataset from various sources.
+
+        Args:
+            dataset: Direct dataset samples
+            dataset_loader: Callable to load dataset
+            run_id: Run ID to load cached dataset
+
+        Returns:
+            List of dataset samples
+
+        Raises:
+            ValueError: If no dataset source is available
+        """
+        if dataset is not None:
+            return list(dataset)
+        if dataset_loader is not None:
+            return list(dataset_loader())
+        # Try to load from cache (for backward compatibility, still use _storage directly)
+        if self._storage is not None and run_id is not None:
+            return self._storage.load_dataset(run_id)
+        raise ValueError(
+            "No dataset provided. Supply `dataset=` rows, a `dataset_loader`, "
+            "or set `run_id` with storage configured so cached data can be reloaded."
+        )
+
+    def _combine_evaluations(
+        self,
+        cached_records: list[EvaluationRecord],
+        new_report: evaluation_pipeline.EvaluationReport,
+    ) -> evaluation_pipeline.EvaluationReport:
+        all_records = list(cached_records) + list(new_report.records)
+        per_metric: dict[str, list[MetricScore]] = {}
+        for record in all_records:
+            for score in record.scores:
+                per_metric.setdefault(score.metric_name, []).append(score)
+
+        aggregates: dict[str, evaluation_pipeline.MetricAggregate] = {}
+        metric_names = set(per_metric.keys()) | set(new_report.metrics.keys())
+        for name in metric_names:
+            scores = per_metric.get(name, [])
+            mean = sum(score.value for score in scores) / len(scores) if scores else 0.0
+            aggregates[name] = evaluation_pipeline.MetricAggregate(
+                name=name,
+                count=len(scores),
+                mean=mean,
+                per_sample=scores,
+            )
+
+        failures = list(new_report.failures)
+        for record in cached_records:
+            for message in record.failures:
+                failures.append(
+                    EvaluationFailure(sample_id=record.sample_id, message=message)
+                )
+
+        return evaluation_pipeline.EvaluationReport(
+            metrics=aggregates,
+            failures=failures,
+            records=all_records,
+        )
+
+    def _save_report_json(self, report: ExperimentReport, run_id: str) -> None:
+        """Save experiment report as JSON for multi-experiment comparison.
+
+        Args:
+            report: Experiment report to save
+            run_id: Run identifier
+        """
+        from pathlib import Path
+
+        from themis.experiment.export import build_json_report
+
+        # Get run path from cache manager
+        run_path_str = self._cache.get_run_path(run_id)
+        if run_path_str is None:
+            # No storage configured, skip saving report.json
+            return
+
+        run_path = Path(run_path_str)
+        report_path = run_path / "report.json"
+
+        # Build JSON report
+        json_data = build_json_report(report, title=f"Experiment {run_id}")
+
+        # Save to file
+        import json
+
+        report_path.parent.mkdir(parents=True, exist_ok=True)
+        with report_path.open("w", encoding="utf-8") as f:
+            json.dump(json_data, f, indent=2)