themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +429 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/main.py +427 -57
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/core/entities.py +23 -3
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/pipelines/standard_pipeline.py +68 -8
- themis/experiment/cache_manager.py +8 -3
- themis/experiment/export.py +110 -2
- themis/experiment/orchestrator.py +109 -11
- themis/experiment/storage.py +1457 -110
- themis/generation/providers/litellm_provider.py +46 -0
- themis/generation/runner.py +22 -6
- themis/integrations/huggingface.py +12 -1
- themis/integrations/wandb.py +13 -1
- themis/interfaces/__init__.py +86 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis_eval-0.2.1.dist-info/METADATA +596 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
- themis_eval-0.1.1.dist-info/METADATA +0 -758
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
themis/experiment/export.py
CHANGED
@@ -146,19 +146,122 @@ def export_report_json(
     return path
 
 
+def export_summary_json(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    run_id: str | None = None,
+    indent: int = 2,
+) -> Path:
+    """Export a lightweight summary JSON file for quick results viewing.
+
+    This creates a small summary file (~1KB) containing only the essential
+    metrics and metadata, without the full sample-level details. This is
+    ideal for quickly comparing multiple runs without parsing large report files.
+
+    Args:
+        report: Experiment report to summarize
+        path: Output path for summary.json
+        run_id: Optional run identifier to include in summary
+        indent: JSON indentation level
+
+    Returns:
+        Path to the created summary file
+
+    Example:
+        >>> export_summary_json(report, "outputs/run-123/summary.json", run_id="run-123")
+        >>> # Quick comparison: cat outputs/*/summary.json | jq '.accuracy'
+
+    Note:
+        The summary file is typically ~1KB compared to ~1.6MB for the full report.
+        This makes it 1000x faster to view and compare results across runs.
+    """
+    # Extract key metrics
+    metrics_summary = {}
+    for name, aggregate in report.evaluation_report.metrics.items():
+        metrics_summary[name] = {
+            "mean": aggregate.mean,
+            "count": aggregate.count,
+        }
+
+    # Extract metadata from first generation record
+    metadata = {}
+    if report.generation_results:
+        first_record = report.generation_results[0]
+        metadata = {
+            "model": first_record.task.model.identifier,
+            "prompt_template": first_record.task.prompt.spec.name,
+            "sampling": {
+                "temperature": first_record.task.sampling.temperature,
+                "top_p": first_record.task.sampling.top_p,
+                "max_tokens": first_record.task.sampling.max_tokens,
+            },
+        }
+
+    # Calculate total cost if available
+    total_cost = 0.0
+    for record in report.generation_results:
+        if "cost_usd" in record.metrics:
+            total_cost += record.metrics["cost_usd"]
+
+    # Count failures
+    failure_count = len(report.evaluation_report.failures)
+
+    # Build summary
+    summary = {
+        "run_id": run_id,
+        "total_samples": len(report.generation_results),
+        "metrics": metrics_summary,
+        "metadata": metadata,
+        "cost_usd": round(total_cost, 4) if total_cost > 0 else None,
+        "failures": failure_count,
+        "failure_rate": (
+            round(failure_count / len(report.generation_results), 4)
+            if report.generation_results
+            else 0.0
+        ),
+    }
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(summary, indent=indent), encoding="utf-8")
+    return path
+
+
 def export_report_bundle(
     report: orchestrator.ExperimentReport,
     *,
     csv_path: str | Path | None = None,
     html_path: str | Path | None = None,
     json_path: str | Path | None = None,
+    summary_path: str | Path | None = None,
+    run_id: str | None = None,
     charts: Sequence[ChartLike] | None = None,
     title: str = "Experiment report",
     sample_limit: int = 100,
     indent: int = 2,
 ) -> OrderedDict[str, Path]:
-    """Convenience helper that writes multiple export formats at once.
-
+    """Convenience helper that writes multiple export formats at once.
+
+    Args:
+        report: Experiment report to export
+        csv_path: Optional path for CSV export
+        html_path: Optional path for HTML export
+        json_path: Optional path for full JSON export
+        summary_path: Optional path for lightweight summary JSON export
+        run_id: Optional run identifier for summary
+        charts: Optional charts to include in visualizations
+        title: Report title
+        sample_limit: Maximum samples to include in detailed exports
+        indent: JSON indentation level
+
+    Returns:
+        Ordered dict of format -> path for created files
+
+    Note:
+        The summary export is highly recommended as it provides quick access
+        to key metrics without parsing large report files.
+    """
     outputs: OrderedDict[str, Path] = OrderedDict()
     if csv_path is not None:
         outputs["csv"] = export_report_csv(report, csv_path)
@@ -179,6 +282,10 @@ def export_report_bundle(
             sample_limit=sample_limit,
             indent=indent,
         )
+    if summary_path is not None:
+        outputs["summary"] = export_summary_json(
+            report, summary_path, run_id=run_id, indent=indent
+        )
     return outputs
 
 
@@ -684,6 +791,7 @@ __all__ = [
     "export_report_csv",
     "export_html_report",
     "export_report_json",
+    "export_summary_json",
     "export_report_bundle",
     "render_html_report",
     "build_json_report",
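For orientation, here is a minimal usage sketch of the summary export added above. The `report` variable and the output paths are illustrative placeholders (a real report comes from an orchestrator run); only export_report_bundle, its summary_path/run_id parameters, and export_summary_json are taken from the diff itself.

from themis.experiment.export import export_report_bundle

# Hypothetical example: `report` is an ExperimentReport returned by an orchestrator run.
outputs = export_report_bundle(
    report,
    json_path="outputs/run-123/report.json",      # full report, can be large
    summary_path="outputs/run-123/summary.json",  # lightweight ~1KB summary
    run_id="run-123",
)
print(outputs["summary"])  # path to summary.json, handy for jq-style comparisons across runs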
themis/experiment/orchestrator.py
CHANGED
@@ -2,10 +2,13 @@
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime, timezone
 from typing import Callable, Sequence
 
 from themis.config.schema import IntegrationsConfig
+
+logger = logging.getLogger(__name__)
 from themis.core.entities import (
     EvaluationRecord,
     ExperimentFailure,
@@ -102,6 +105,8 @@ class ExperimentOrchestrator:
         Returns:
             ExperimentReport with generation results, evaluation, and metadata
         """
+        logger.info("Orchestrator: Initializing experiment run")
+
         # Initialize integrations
         self._integrations.initialize_run(
             {
@@ -112,28 +117,58 @@ class ExperimentOrchestrator:
         )
 
         # Prepare dataset
-        dataset_list = self._resolve_dataset(
-            dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
-        )
+        logger.info("Orchestrator: Loading dataset...")
+        try:
+            dataset_list = self._resolve_dataset(
+                dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+            )
+            logger.info(f"Orchestrator: Dataset loaded ({len(dataset_list)} total samples)")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to load dataset: {e}")
+            raise
+
         selected_dataset = (
             dataset_list[:max_samples] if max_samples is not None else dataset_list
         )
         run_identifier = run_id or self._default_run_id()
+
+        logger.info(f"Orchestrator: Processing {len(selected_dataset)} samples")
+        logger.info(f"Orchestrator: Run ID = {run_identifier}")
+
+        # Initialize run in storage (if storage exists and run doesn't exist)
+        if self._cache.has_storage:
+            if not resume or not self._cache._storage._run_metadata_exists(run_identifier):
+                self._cache._storage.start_run(run_identifier, experiment_id="default")
 
         # Cache dataset for resumability
         if dataset_list:
             self._cache.cache_dataset(run_identifier, dataset_list)
 
         # Expand dataset into generation tasks
-        tasks = list(self._plan.expand(selected_dataset))
+        logger.info("Orchestrator: Expanding dataset into generation tasks...")
+        try:
+            tasks = list(self._plan.expand(selected_dataset))
+            logger.info(f"Orchestrator: Created {len(tasks)} generation tasks")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to expand dataset: {e}")
+            raise
+
+        # Build evaluation configuration for cache invalidation
+        evaluation_config = self._build_evaluation_config()
 
         # Load cached results if resuming
+        if resume:
+            logger.info("Orchestrator: Loading cached results...")
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
-            self._cache.load_cached_evaluations(run_identifier) if resume else {}
+            self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
+        if resume and cached_records:
+            logger.info(f"Orchestrator: Found {len(cached_records)} cached generation records")
+        if resume and cached_evaluations:
+            logger.info(f"Orchestrator: Found {len(cached_evaluations)} cached evaluation records")
 
         # Process tasks: use cached or run new generations
         generation_results: list[GenerationRecord] = []
@@ -144,8 +179,8 @@ class ExperimentOrchestrator:
         cached_eval_records: list[EvaluationRecord] = []
 
         for task in tasks:
-
-            cached = cached_records.get(
+            task_cache_key = experiment_storage.task_cache_key(task)
+            cached = cached_records.get(task_cache_key)
             if cached is not None:
                 generation_results.append(cached)
                 if cached.error:
@@ -155,12 +190,14 @@ class ExperimentOrchestrator:
                             message=cached.error.message,
                         )
                     )
-                evaluation
+                # Use evaluation_cache_key that includes evaluation config
+                eval_cache_key = experiment_storage.evaluation_cache_key(task, evaluation_config)
+                evaluation = cached_evaluations.get(eval_cache_key)
                 if evaluation is not None:
                     cached_eval_records.append(evaluation)
                 else:
                     pending_records.append(cached)
-                    pending_keys.append(
+                    pending_keys.append(eval_cache_key)
                 if on_result:
                     on_result(cached)
             else:
@@ -168,9 +205,18 @@ class ExperimentOrchestrator:
 
         # Run pending generation tasks
         if pending_tasks:
+            logger.info(f"Orchestrator: Running {len(pending_tasks)} generation tasks...")
+            completed = 0
             for record in self._runner.run(pending_tasks):
+                logger.debug(f"Orchestrator: Received generation record")
                 generation_results.append(record)
+                completed += 1
+
+                # Log progress every 10 samples or at key milestones
+                if completed % 10 == 0 or completed == len(pending_tasks):
+                    logger.info(f"Orchestrator: Generation progress: {completed}/{len(pending_tasks)} ({100*completed//len(pending_tasks)}%)")
 
+                logger.debug(f"Orchestrator: Processing record (cost tracking...)")
                 # Track cost for successful generations
                 if record.output and record.output.usage:
                     usage = record.output.usage
@@ -187,6 +233,7 @@ class ExperimentOrchestrator:
                         cost=cost,
                     )
 
+                logger.debug(f"Orchestrator: Processing record (error handling...)")
                 if record.error:
                     failures.append(
                         ExperimentFailure(
@@ -194,35 +241,56 @@ class ExperimentOrchestrator:
                             message=record.error.message,
                         )
                     )
+
+                logger.debug(f"Orchestrator: Processing record (caching...)")
                 cache_key = experiment_storage.task_cache_key(record.task)
                 if cache_results:
                     self._cache.save_generation_record(
                         run_identifier, record, cache_key
                     )
+
+                logger.debug(f"Orchestrator: Processing record (adding to pending...)")
                 pending_records.append(record)
                 pending_keys.append(cache_key)
+
+                logger.debug(f"Orchestrator: Processing record (callback...)")
                 if on_result:
                     on_result(record)
+                logger.debug(f"Orchestrator: Record processing complete")
 
         # Evaluate pending records
+        logger.info(f"Orchestrator: Preparing to evaluate {len(pending_records)} pending records...")
        if pending_records:
-            new_evaluation_report = self._evaluation.evaluate(pending_records)
+            logger.info(f"Orchestrator: Starting evaluation of {len(pending_records)} records...")
+            try:
+                new_evaluation_report = self._evaluation.evaluate(pending_records)
+                logger.info(f"Orchestrator: ✅ Evaluation complete - got {len(new_evaluation_report.records)} results")
+            except Exception as e:
+                logger.error(f"Orchestrator: ❌ Evaluation failed: {e}")
+                raise
         else:
+            logger.info("Orchestrator: No new records to evaluate (all cached)")
             new_evaluation_report = evaluation_pipeline.EvaluationReport(
                 metrics={}, failures=[], records=[]
             )
 
         # Cache evaluation results
         for record, evaluation in zip(pending_records, new_evaluation_report.records):
-            self._cache.save_evaluation_record(
+            self._cache.save_evaluation_record(
+                run_identifier, record, evaluation, evaluation_config
+            )
 
         # Combine cached and new evaluations
+        logger.info("Orchestrator: Combining cached and new evaluations...")
         evaluation_report = self._combine_evaluations(
             cached_eval_records, new_evaluation_report
         )
+        logger.info(f"Orchestrator: Total evaluation records: {len(evaluation_report.records)}")
 
         # Get cost breakdown
         cost_breakdown = self._cost_tracker.get_breakdown()
+        if cost_breakdown.total_cost > 0:
+            logger.info(f"Orchestrator: Total cost: ${cost_breakdown.total_cost:.4f}")
 
         # Build metadata
         metadata = {
@@ -274,6 +342,36 @@ class ExperimentOrchestrator:
     def _default_run_id(self) -> str:
         return datetime.now(timezone.utc).strftime("run-%Y%m%d-%H%M%S")
 
+    def _build_evaluation_config(self) -> dict:
+        """Build evaluation configuration for cache key generation.
+
+        This configuration includes all evaluation settings that affect results,
+        so changing metrics or extractors will invalidate the cache.
+
+        Returns:
+            Dictionary with evaluation configuration
+        """
+        config = {}
+
+        # Add metric names/types
+        if hasattr(self._evaluation, "_metrics"):
+            config["metrics"] = sorted([
+                f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+                for metric in self._evaluation._metrics
+            ])
+
+        # Add extractor type
+        if hasattr(self._evaluation, "_extractor"):
+            extractor = self._evaluation._extractor
+            extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+            config["extractor"] = extractor_type
+
+            # Include extractor-specific configuration if available
+            if hasattr(extractor, "field_name"):
+                config["extractor_field"] = extractor.field_name
+
+        return config
+
     def _resolve_dataset(
         self,
         *,