themis-eval 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +48 -6
  27. themis/experiment/storage.py +1313 -110
  28. themis/integrations/huggingface.py +12 -1
  29. themis/integrations/wandb.py +13 -1
  30. themis/interfaces/__init__.py +86 -0
  31. themis/presets/__init__.py +10 -0
  32. themis/presets/benchmarks.py +354 -0
  33. themis/presets/models.py +190 -0
  34. themis/server/__init__.py +28 -0
  35. themis/server/app.py +337 -0
  36. themis_eval-0.2.0.dist-info/METADATA +596 -0
  37. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/RECORD +40 -17
  38. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  39. themis_eval-0.1.1.dist-info/METADATA +0 -758
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  41. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -146,19 +146,122 @@ def export_report_json(
     return path
 
 
+def export_summary_json(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    run_id: str | None = None,
+    indent: int = 2,
+) -> Path:
+    """Export a lightweight summary JSON file for quick results viewing.
+
+    This creates a small summary file (~1KB) containing only the essential
+    metrics and metadata, without the full sample-level details. This is
+    ideal for quickly comparing multiple runs without parsing large report files.
+
+    Args:
+        report: Experiment report to summarize
+        path: Output path for summary.json
+        run_id: Optional run identifier to include in summary
+        indent: JSON indentation level
+
+    Returns:
+        Path to the created summary file
+
+    Example:
+        >>> export_summary_json(report, "outputs/run-123/summary.json", run_id="run-123")
+        >>> # Quick comparison: cat outputs/*/summary.json | jq '.accuracy'
+
+    Note:
+        The summary file is typically ~1KB compared to ~1.6MB for the full report.
+        This makes it 1000x faster to view and compare results across runs.
+    """
+    # Extract key metrics
+    metrics_summary = {}
+    for name, aggregate in report.evaluation_report.metrics.items():
+        metrics_summary[name] = {
+            "mean": aggregate.mean,
+            "count": aggregate.count,
+        }
+
+    # Extract metadata from first generation record
+    metadata = {}
+    if report.generation_results:
+        first_record = report.generation_results[0]
+        metadata = {
+            "model": first_record.task.model.identifier,
+            "prompt_template": first_record.task.prompt.spec.name,
+            "sampling": {
+                "temperature": first_record.task.sampling.temperature,
+                "top_p": first_record.task.sampling.top_p,
+                "max_tokens": first_record.task.sampling.max_tokens,
+            },
+        }
+
+    # Calculate total cost if available
+    total_cost = 0.0
+    for record in report.generation_results:
+        if "cost_usd" in record.metrics:
+            total_cost += record.metrics["cost_usd"]
+
+    # Count failures
+    failure_count = len(report.evaluation_report.failures)
+
+    # Build summary
+    summary = {
+        "run_id": run_id,
+        "total_samples": len(report.generation_results),
+        "metrics": metrics_summary,
+        "metadata": metadata,
+        "cost_usd": round(total_cost, 4) if total_cost > 0 else None,
+        "failures": failure_count,
+        "failure_rate": (
+            round(failure_count / len(report.generation_results), 4)
+            if report.generation_results
+            else 0.0
+        ),
+    }
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(summary, indent=indent), encoding="utf-8")
+    return path
+
+
 def export_report_bundle(
     report: orchestrator.ExperimentReport,
     *,
     csv_path: str | Path | None = None,
     html_path: str | Path | None = None,
     json_path: str | Path | None = None,
+    summary_path: str | Path | None = None,
+    run_id: str | None = None,
     charts: Sequence[ChartLike] | None = None,
     title: str = "Experiment report",
     sample_limit: int = 100,
     indent: int = 2,
 ) -> OrderedDict[str, Path]:
-    """Convenience helper that writes multiple export formats at once."""
-
+    """Convenience helper that writes multiple export formats at once.
+
+    Args:
+        report: Experiment report to export
+        csv_path: Optional path for CSV export
+        html_path: Optional path for HTML export
+        json_path: Optional path for full JSON export
+        summary_path: Optional path for lightweight summary JSON export
+        run_id: Optional run identifier for summary
+        charts: Optional charts to include in visualizations
+        title: Report title
+        sample_limit: Maximum samples to include in detailed exports
+        indent: JSON indentation level
+
+    Returns:
+        Ordered dict of format -> path for created files
+
+    Note:
+        The summary export is highly recommended as it provides quick access
+        to key metrics without parsing large report files.
+    """
     outputs: OrderedDict[str, Path] = OrderedDict()
     if csv_path is not None:
         outputs["csv"] = export_report_csv(report, csv_path)
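For orientation, export_summary_json above serializes a flat dictionary. An illustrative summary.json, shown as the Python dict it is built from (the keys mirror the `summary` dict in the hunk above; the model name, metric name, and numbers are invented for the example):

    # Illustrative only: values are invented, keys follow the code above.
    {
        "run_id": "run-123",
        "total_samples": 500,
        "metrics": {"exact_match": {"mean": 0.81, "count": 500}},
        "metadata": {
            "model": "gpt-4o-mini",
            "prompt_template": "default",
            "sampling": {"temperature": 0.0, "top_p": 1.0, "max_tokens": 512},
        },
        "cost_usd": 1.2345,
        "failures": 7,
        "failure_rate": 0.014,
    }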
@@ -179,6 +282,10 @@ def export_report_bundle(
             sample_limit=sample_limit,
             indent=indent,
         )
+    if summary_path is not None:
+        outputs["summary"] = export_summary_json(
+            report, summary_path, run_id=run_id, indent=indent
+        )
     return outputs
 
 
@@ -684,6 +791,7 @@ __all__ = [
     "export_report_csv",
     "export_html_report",
     "export_report_json",
+    "export_summary_json",
     "export_report_bundle",
     "render_html_report",
     "build_json_report",
@@ -120,6 +120,11 @@ class ExperimentOrchestrator:
         )
         run_identifier = run_id or self._default_run_id()
 
+        # Initialize run in storage (if storage exists and run doesn't exist)
+        if self._cache.has_storage:
+            if not resume or not self._cache._storage._run_metadata_exists(run_identifier):
+                self._cache._storage.start_run(run_identifier, experiment_id="default")
+
         # Cache dataset for resumability
         if dataset_list:
             self._cache.cache_dataset(run_identifier, dataset_list)
@@ -127,12 +132,15 @@ class ExperimentOrchestrator:
         # Expand dataset into generation tasks
         tasks = list(self._plan.expand(selected_dataset))
 
+        # Build evaluation configuration for cache invalidation
+        evaluation_config = self._build_evaluation_config()
+
         # Load cached results if resuming
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
-            self._cache.load_cached_evaluations(run_identifier) if resume else {}
+            self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
 
         # Process tasks: use cached or run new generations
@@ -144,8 +152,8 @@ class ExperimentOrchestrator:
         cached_eval_records: list[EvaluationRecord] = []
 
         for task in tasks:
-            cache_key = experiment_storage.task_cache_key(task)
-            cached = cached_records.get(cache_key)
+            task_cache_key = experiment_storage.task_cache_key(task)
+            cached = cached_records.get(task_cache_key)
             if cached is not None:
                 generation_results.append(cached)
                 if cached.error:
@@ -155,12 +163,14 @@ class ExperimentOrchestrator:
                             message=cached.error.message,
                         )
                     )
-                evaluation = cached_evaluations.get(cache_key)
+                # Use evaluation_cache_key that includes evaluation config
+                eval_cache_key = experiment_storage.evaluation_cache_key(task, evaluation_config)
+                evaluation = cached_evaluations.get(eval_cache_key)
                 if evaluation is not None:
                     cached_eval_records.append(evaluation)
                 else:
                     pending_records.append(cached)
-                    pending_keys.append(cache_key)
+                    pending_keys.append(eval_cache_key)
                 if on_result:
                     on_result(cached)
             else:
@@ -214,7 +224,9 @@ class ExperimentOrchestrator:
 
         # Cache evaluation results
         for record, evaluation in zip(pending_records, new_evaluation_report.records):
-            self._cache.save_evaluation_record(run_identifier, record, evaluation)
+            self._cache.save_evaluation_record(
+                run_identifier, record, evaluation, evaluation_config
+            )
 
         # Combine cached and new evaluations
         evaluation_report = self._combine_evaluations(
@@ -274,6 +286,36 @@ class ExperimentOrchestrator:
     def _default_run_id(self) -> str:
         return datetime.now(timezone.utc).strftime("run-%Y%m%d-%H%M%S")
 
+    def _build_evaluation_config(self) -> dict:
+        """Build evaluation configuration for cache key generation.
+
+        This configuration includes all evaluation settings that affect results,
+        so changing metrics or extractors will invalidate the cache.
+
+        Returns:
+            Dictionary with evaluation configuration
+        """
+        config = {}
+
+        # Add metric names/types
+        if hasattr(self._evaluation, "_metrics"):
+            config["metrics"] = sorted([
+                f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+                for metric in self._evaluation._metrics
+            ])
+
+        # Add extractor type
+        if hasattr(self._evaluation, "_extractor"):
+            extractor = self._evaluation._extractor
+            extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+            config["extractor"] = extractor_type
+
+            # Include extractor-specific configuration if available
+            if hasattr(extractor, "field_name"):
+                config["extractor_field"] = extractor.field_name
+
+        return config
+
     def _resolve_dataset(
         self,
         *,
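The orchestrator now keys cached evaluations on both the task and the evaluation configuration via experiment_storage.evaluation_cache_key(task, evaluation_config). Its implementation lives in themis/experiment/storage.py and is not shown in this excerpt; a rough, purely illustrative sketch of one way such a key could be derived (the function name and hashing scheme below are assumptions, not the package's code):

    import hashlib
    import json


    def evaluation_cache_key_sketch(task_key: str, evaluation_config: dict) -> str:
        # Hash the config so that changing metrics or extractors produces a new key,
        # which invalidates previously cached evaluation records.
        config_blob = json.dumps(evaluation_config, sort_keys=True)
        config_hash = hashlib.sha256(config_blob.encode("utf-8")).hexdigest()[:12]
        return f"{task_key}:{config_hash}"

Whatever the real scheme is, the orchestrator-side contract is visible in the diff: generation records stay keyed by task_cache_key alone, while evaluation records are looked up and saved with the config-aware key, so editing metrics or the extractor forces re-evaluation without discarding cached generations.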