openms-insight 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -8,6 +8,7 @@ from ..core.base import BaseComponent
  from ..core.registry import register_component
  from ..preprocessing.compression import (
  compute_compression_levels,
+ compute_optimal_bins,
  downsample_2d,
  downsample_2d_simple,
  downsample_2d_streaming,
@@ -22,10 +23,10 @@ def _make_zoom_cache_key(zoom: Optional[Dict[str, Any]]) -> tuple:
  if zoom is None:
  return (None,)
  return (
- ('x0', zoom.get('xRange', [-1, -1])[0]),
- ('x1', zoom.get('xRange', [-1, -1])[1]),
- ('y0', zoom.get('yRange', [-1, -1])[0]),
- ('y1', zoom.get('yRange', [-1, -1])[1]),
+ ("x0", zoom.get("xRange", [-1, -1])[0]),
+ ("x1", zoom.get("xRange", [-1, -1])[1]),
+ ("y0", zoom.get("yRange", [-1, -1])[0]),
+ ("y1", zoom.get("yRange", [-1, -1])[1]),
  )


@@ -66,28 +67,29 @@ class Heatmap(BaseComponent):
  def __init__(
  self,
  cache_id: str,
- x_column: str,
- y_column: str,
+ x_column: Optional[str] = None,
+ y_column: Optional[str] = None,
  data: Optional[pl.LazyFrame] = None,
  data_path: Optional[str] = None,
- intensity_column: str = 'intensity',
+ intensity_column: Optional[str] = None,
  filters: Optional[Dict[str, str]] = None,
  filter_defaults: Optional[Dict[str, Any]] = None,
  interactivity: Optional[Dict[str, str]] = None,
  cache_path: str = ".",
  regenerate_cache: bool = False,
- min_points: int = 20000,
- x_bins: int = 400,
- y_bins: int = 50,
- zoom_identifier: str = 'heatmap_zoom',
+ min_points: int = 10000,
+ display_aspect_ratio: float = 16 / 9,
+ x_bins: Optional[int] = None,
+ y_bins: Optional[int] = None,
+ zoom_identifier: str = "heatmap_zoom",
  title: Optional[str] = None,
  x_label: Optional[str] = None,
  y_label: Optional[str] = None,
- colorscale: str = 'Portland',
+ colorscale: str = "Portland",
  use_simple_downsample: bool = False,
  use_streaming: bool = True,
  categorical_filters: Optional[List[str]] = None,
- **kwargs
+ **kwargs,
  ):
  """
  Initialize the Heatmap component.
@@ -106,10 +108,17 @@ class Heatmap(BaseComponent):
  point's value in the corresponding column.
  cache_path: Base path for cache storage. Default "." (current dir).
  regenerate_cache: If True, regenerate cache even if valid cache exists.
- min_points: Target size for smallest compression level and
- threshold for level selection (default: 20000)
- x_bins: Number of bins along x-axis for downsampling (default: 400)
- y_bins: Number of bins along y-axis for downsampling (default: 50)
+ min_points: Target number of points to display (default: 10000).
+ Cache levels are built at 2× this value; final downsample
+ at render time reduces to exactly min_points.
+ display_aspect_ratio: Expected display width/height ratio for
+ optimal bin computation during caching (default: 16/9).
+ At render time, the actual zoom region's aspect ratio is used.
+ x_bins: Number of bins along x-axis for downsampling. If None
+ (default), auto-computed from display_aspect_ratio such that
+ x_bins × y_bins ≈ 2×min_points with even spatial distribution.
+ y_bins: Number of bins along y-axis for downsampling. If None
+ (default), auto-computed from display_aspect_ratio.
  zoom_identifier: State key for storing zoom range (default: 'heatmap_zoom')
  title: Heatmap title displayed above the plot
  x_label: X-axis label (defaults to x_column)
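The auto-binning rule in this docstring can be checked with a small standalone sketch. This is a hypothetical reimplementation (the package's actual `compute_optimal_bins` may round differently): solving `x_bins × y_bins ≈ target` with `x_bins / y_bins ≈ aspect` gives `y_bins = √(target / aspect)` and `x_bins = aspect × y_bins`.

```python
import math

def optimal_bins_sketch(target_points: int, aspect: float) -> tuple:
    """Hypothetical sketch: spread ~target_points grid cells so that
    x_bins / y_bins matches the expected display aspect ratio."""
    y_bins = max(1, round(math.sqrt(target_points / aspect)))
    x_bins = max(1, round(aspect * y_bins))
    return x_bins, y_bins

# New defaults: min_points=10000 → cache target 20000, aspect 16/9.
# y_bins ≈ √(20000 / (16/9)) ≈ 106, x_bins ≈ 188; 188 × 106 = 19,928 ≈ 2×min_points.
print(optimal_bins_sketch(2 * 10000, 16 / 9))  # (188, 106)
```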
@@ -130,6 +139,7 @@ class Heatmap(BaseComponent):
  self._y_column = y_column
  self._intensity_column = intensity_column
  self._min_points = min_points
+ self._display_aspect_ratio = display_aspect_ratio
  self._x_bins = x_bins
  self._y_bins = y_bins
  self._zoom_identifier = zoom_identifier
@@ -155,6 +165,7 @@ class Heatmap(BaseComponent):
  y_column=y_column,
  intensity_column=intensity_column,
  min_points=min_points,
+ display_aspect_ratio=display_aspect_ratio,
  x_bins=x_bins,
  y_bins=y_bins,
  zoom_identifier=zoom_identifier,
@@ -165,7 +176,7 @@ class Heatmap(BaseComponent):
  use_simple_downsample=use_simple_downsample,
  use_streaming=use_streaming,
  categorical_filters=categorical_filters,
- **kwargs
+ **kwargs,
  )

  def _get_cache_config(self) -> Dict[str, Any]:
@@ -176,17 +187,43 @@ class Heatmap(BaseComponent):
  Dict of config values that affect preprocessing
  """
  return {
- 'x_column': self._x_column,
- 'y_column': self._y_column,
- 'intensity_column': self._intensity_column,
- 'min_points': self._min_points,
- 'x_bins': self._x_bins,
- 'y_bins': self._y_bins,
- 'use_simple_downsample': self._use_simple_downsample,
- 'use_streaming': self._use_streaming,
- 'categorical_filters': sorted(self._categorical_filters),
+ "x_column": self._x_column,
+ "y_column": self._y_column,
+ "intensity_column": self._intensity_column,
+ "min_points": self._min_points,
+ "display_aspect_ratio": self._display_aspect_ratio,
+ "x_bins": self._x_bins,
+ "y_bins": self._y_bins,
+ "use_simple_downsample": self._use_simple_downsample,
+ "use_streaming": self._use_streaming,
+ "categorical_filters": sorted(self._categorical_filters),
+ "zoom_identifier": self._zoom_identifier,
+ "title": self._title,
+ "x_label": self._x_label,
+ "y_label": self._y_label,
+ "colorscale": self._colorscale,
  }

+ def _restore_cache_config(self, config: Dict[str, Any]) -> None:
+ """Restore component-specific configuration from cached config."""
+ self._x_column = config.get("x_column")
+ self._y_column = config.get("y_column")
+ self._intensity_column = config.get("intensity_column", "intensity")
+ self._min_points = config.get("min_points", 10000)
+ self._display_aspect_ratio = config.get("display_aspect_ratio", 16 / 9)
+ # x_bins/y_bins are computed during preprocessing and stored in cache
+ # Fallback to old defaults for backward compatibility with old caches
+ self._x_bins = config.get("x_bins", 400)
+ self._y_bins = config.get("y_bins", 50)
+ self._use_simple_downsample = config.get("use_simple_downsample", False)
+ self._use_streaming = config.get("use_streaming", True)
+ self._categorical_filters = config.get("categorical_filters", [])
+ self._zoom_identifier = config.get("zoom_identifier", "heatmap_zoom")
+ self._title = config.get("title")
+ self._x_label = config.get("x_label", self._x_column)
+ self._y_label = config.get("y_label", self._y_column)
+ self._colorscale = config.get("colorscale", "Portland")
+
  def get_state_dependencies(self) -> list:
  """
  Return list of state keys that affect this component's data.
@@ -220,14 +257,116 @@ class Heatmap(BaseComponent):
  else:
  self._preprocess_eager()

+ def _build_cascading_levels(
+ self,
+ source_data: pl.LazyFrame,
+ level_sizes: list,
+ x_range: tuple,
+ y_range: tuple,
+ cache_dir,
+ prefix: str = "level",
+ ) -> dict:
+ """
+ Build cascading compression levels from source data.
+
+ Each level is built from the previous larger level rather than from
+ raw data. This is efficient (raw data read once) and produces identical
+ results because the downsampling keeps top N highest-intensity points
+ per bin - points surviving at larger levels will also be selected at
+ smaller levels.
+
+ Args:
+ source_data: LazyFrame with raw/filtered data
+ level_sizes: List of target sizes for compressed levels (smallest first)
+ x_range: (x_min, x_max) for consistent bin boundaries
+ y_range: (y_min, y_max) for consistent bin boundaries
+ cache_dir: Path to save parquet files
+ prefix: Filename prefix (e.g., "level" or "cat_level_im_0")
+
+ Returns:
+ Dict with level LazyFrames keyed by "{prefix}_{idx}" and "num_levels"
+ """
+ import sys
+
+ result = {}
+ num_compressed = len(level_sizes)
+
+ # Get total count
+ total = source_data.select(pl.len()).collect().item()
+
+ # First: save full resolution as the largest level
+ full_res_path = cache_dir / f"{prefix}_{num_compressed}.parquet"
+ full_res = source_data.sort([self._x_column, self._y_column])
+ full_res.sink_parquet(full_res_path, compression="zstd")
+ print(
+ f"[HEATMAP] Saved {prefix}_{num_compressed} ({total:,} pts)",
+ file=sys.stderr,
+ )
+
+ # Start cascading from full resolution
+ current_source = pl.scan_parquet(full_res_path)
+ current_size = total
+
+ # Build compressed levels from largest to smallest
+ for i, target_size in enumerate(reversed(level_sizes)):
+ level_idx = num_compressed - 1 - i
+ level_path = cache_dir / f"{prefix}_{level_idx}.parquet"
+
+ # If target size equals or exceeds current, just copy reference
+ if target_size >= current_size:
+ level = current_source
+ elif self._use_simple_downsample:
+ level = downsample_2d_simple(
+ current_source,
+ max_points=target_size,
+ intensity_column=self._intensity_column,
+ )
+ else:
+ level = downsample_2d_streaming(
+ current_source,
+ max_points=target_size,
+ x_column=self._x_column,
+ y_column=self._y_column,
+ intensity_column=self._intensity_column,
+ x_bins=self._x_bins,
+ y_bins=self._y_bins,
+ x_range=x_range,
+ y_range=y_range,
+ )
+
+ # Sort and save immediately
+ level = level.sort([self._x_column, self._y_column])
+ level.sink_parquet(level_path, compression="zstd")
+
+ print(
+ f"[HEATMAP] Saved {prefix}_{level_idx} (target {target_size:,} pts)",
+ file=sys.stderr,
+ )
+
+ # Next iteration uses this level as source (cascading)
+ current_source = pl.scan_parquet(level_path)
+ current_size = target_size
+
+ # Load all levels back as LazyFrames
+ for i in range(num_compressed + 1):
+ level_path = cache_dir / f"{prefix}_{i}.parquet"
+ result[f"{prefix}_{i}"] = pl.scan_parquet(level_path)
+
+ result["num_levels"] = num_compressed + 1
+
+ return result
+
  def _preprocess_with_categorical_filters(self) -> None:
  """
- Preprocess with per-filter-value compression levels.
+ Preprocess with per-filter-value compression levels using cascading.

  For each unique value of each categorical filter, creates separate
- compression levels. This ensures that when a filter is applied at
- render time, the resulting data has ~min_points regardless of the
- filter value selected.
+ compression levels using cascading (building smaller levels from larger).
+ This ensures that when a filter is applied at render time, the resulting
+ data has ~min_points regardless of the filter value selected.
+
+ Uses cascading downsampling for efficiency - each level is built from
+ the previous larger level rather than from raw data.

  Data is sorted by x, y columns for efficient range query predicate pushdown.
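The cascading argument in this docstring rests on top-N-per-bin selection being nested: with fixed bin boundaries, a bin's top N points by intensity are a subset of its top M points for any M ≥ N, so downsampling the larger level again yields the same result as downsampling the raw data. A minimal sketch with polars (1-D binning and assumed column names for brevity; not the package's downsample_2d_streaming):

```python
import polars as pl

def top_n_per_bin(df: pl.DataFrame, n: int, x_bins: int = 4) -> pl.DataFrame:
    """Keep the n highest-intensity points per x-bin (fixed bin boundaries)."""
    return (
        df.with_columns((pl.col("x") * x_bins).floor().alias("bin"))
        .sort("intensity", descending=True)
        .group_by("bin", maintain_order=True)
        .head(n)
        .drop("bin")
    )

df = pl.DataFrame({
    "x": [i / 100 for i in range(100)],
    "intensity": [float((i * 37) % 100) for i in range(100)],
})

direct = top_n_per_bin(df, 2)                      # raw → top-2 per bin
cascaded = top_n_per_bin(top_n_per_bin(df, 5), 2)  # raw → top-5 → top-2
assert set(map(tuple, direct.rows())) == set(map(tuple, cascaded.rows()))
```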
@@ -239,202 +378,221 @@ class Heatmap(BaseComponent):
  import sys

  # Get data ranges (for the full dataset)
+ # These ranges are used for ALL levels to ensure consistent binning
  x_range, y_range = get_data_range(
  self._raw_data,
  self._x_column,
  self._y_column,
  )
- self._preprocessed_data['x_range'] = x_range
- self._preprocessed_data['y_range'] = y_range
+ self._preprocessed_data["x_range"] = x_range
+ self._preprocessed_data["y_range"] = y_range
+
+ # Compute optimal bins if not provided
+ # Cache at 2×min_points, use display_aspect_ratio for bin computation
+ cache_target = 2 * self._min_points
+ if self._x_bins is None or self._y_bins is None:
+ # Use display aspect ratio (not data aspect ratio) for optimal bins
+ self._x_bins, self._y_bins = compute_optimal_bins(
+ cache_target,
+ (0, self._display_aspect_ratio), # Fake x_range matching aspect
+ (0, 1.0), # Fake y_range
+ )
+ print(
+ f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+ f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+ f"display aspect: {self._display_aspect_ratio:.2f})",
+ file=sys.stderr,
+ )

  # Get total count
  total = self._raw_data.select(pl.len()).collect().item()
- self._preprocessed_data['total'] = total
+ self._preprocessed_data["total"] = total
+
+ # Create cache directory for immediate level saving
+ cache_dir = self._cache_dir / "preprocessed"
+ cache_dir.mkdir(parents=True, exist_ok=True)

  # Store metadata about categorical filters
- self._preprocessed_data['has_categorical_filters'] = True
- self._preprocessed_data['categorical_filter_values'] = {}
+ self._preprocessed_data["has_categorical_filters"] = True
+ self._preprocessed_data["categorical_filter_values"] = {}

  # Process each categorical filter
  for filter_id in self._categorical_filters:
  if filter_id not in self._filters:
- print(f"[HEATMAP] Warning: categorical_filter '{filter_id}' not in filters, skipping", file=sys.stderr)
+ print(
+ f"[HEATMAP] Warning: categorical_filter '{filter_id}' not in filters, skipping",
+ file=sys.stderr,
+ )
  continue

  column_name = self._filters[filter_id]

  # Get unique values for this filter
  unique_values = (
- self._raw_data
- .select(pl.col(column_name))
+ self._raw_data.select(pl.col(column_name))
  .unique()
  .collect()
  .to_series()
  .to_list()
  )
- unique_values = sorted([v for v in unique_values if v is not None and v >= 0])
+ unique_values = sorted(
+ [v for v in unique_values if v is not None and v >= 0]
+ )

- print(f"[HEATMAP] Categorical filter '{filter_id}' ({column_name}): {len(unique_values)} unique values", file=sys.stderr)
+ print(
+ f"[HEATMAP] Categorical filter '{filter_id}' ({column_name}): {len(unique_values)} unique values",
+ file=sys.stderr,
+ )

- self._preprocessed_data['categorical_filter_values'][filter_id] = unique_values
+ self._preprocessed_data["categorical_filter_values"][filter_id] = (
+ unique_values
+ )

- # Create compression levels for each filter value
+ # Create compression levels for each filter value using cascading
  for filter_value in unique_values:
  # Filter data to this value
- filtered_data = self._raw_data.filter(pl.col(column_name) == filter_value)
+ filtered_data = self._raw_data.filter(
+ pl.col(column_name) == filter_value
+ )
  filtered_total = filtered_data.select(pl.len()).collect().item()

- # Compute level sizes for this filtered subset
- level_sizes = compute_compression_levels(self._min_points, filtered_total)
-
- print(f"[HEATMAP] Value {filter_value}: {filtered_total:,} pts → levels {level_sizes}", file=sys.stderr)
-
- # Store level sizes for this filter value
- self._preprocessed_data[f'cat_level_sizes_{filter_id}_{filter_value}'] = level_sizes
-
- # Build each compressed level
- for level_idx, target_size in enumerate(level_sizes):
- # If target size equals total, skip downsampling - use all data
- if target_size >= filtered_total:
- level = filtered_data
- elif self._use_simple_downsample:
- level = downsample_2d_simple(
- filtered_data,
- max_points=target_size,
- intensity_column=self._intensity_column,
- )
- else:
- level = downsample_2d_streaming(
- filtered_data,
- max_points=target_size,
- x_column=self._x_column,
- y_column=self._y_column,
- intensity_column=self._intensity_column,
- x_bins=self._x_bins,
- y_bins=self._y_bins,
- x_range=x_range,
- y_range=y_range,
- )
-
- # Sort by x, y for efficient range query predicate pushdown
- level = level.sort([self._x_column, self._y_column])
- # Store LazyFrame for streaming to disk
- level_key = f'cat_level_{filter_id}_{filter_value}_{level_idx}'
- self._preprocessed_data[level_key] = level # Keep lazy
-
- # Add full resolution as final level (for zoom fallback)
- # Also sorted for consistent predicate pushdown behavior
- num_compressed = len(level_sizes)
- full_res_key = f'cat_level_{filter_id}_{filter_value}_{num_compressed}'
- self._preprocessed_data[full_res_key] = filtered_data.sort(
- [self._x_column, self._y_column]
+ # Compute level sizes for this filtered subset (2× for cache buffer)
+ level_sizes = compute_compression_levels(
+ cache_target, filtered_total
  )
- self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = num_compressed + 1

- # Also create global levels for when no categorical filter is selected
- # (fallback to standard behavior)
- level_sizes = compute_compression_levels(self._min_points, total)
- self._preprocessed_data['level_sizes'] = level_sizes
-
- for i, size in enumerate(level_sizes):
- # If target size equals total, skip downsampling - use all data
- if size >= total:
- level = self._raw_data
- elif self._use_simple_downsample:
- level = downsample_2d_simple(
- self._raw_data,
- max_points=size,
- intensity_column=self._intensity_column,
+ print(
+ f"[HEATMAP] Value {filter_value}: {filtered_total:,} pts → levels {level_sizes}",
+ file=sys.stderr,
  )
- else:
- level = downsample_2d_streaming(
- self._raw_data,
- max_points=size,
- x_column=self._x_column,
- y_column=self._y_column,
- intensity_column=self._intensity_column,
- x_bins=self._x_bins,
- y_bins=self._y_bins,
+
+ # Store level sizes for this filter value
+ self._preprocessed_data[
+ f"cat_level_sizes_{filter_id}_{filter_value}"
+ ] = level_sizes
+
+ # Build cascading levels using helper
+ prefix = f"cat_level_{filter_id}_{filter_value}"
+ levels_result = self._build_cascading_levels(
+ source_data=filtered_data,
+ level_sizes=level_sizes,
  x_range=x_range,
  y_range=y_range,
+ cache_dir=cache_dir,
+ prefix=prefix,
  )
- # Sort by x, y for efficient range query predicate pushdown
- level = level.sort([self._x_column, self._y_column])
- self._preprocessed_data[f'level_{i}'] = level # Keep lazy

- # Add full resolution as final level (for zoom fallback)
- # Also sorted for consistent predicate pushdown behavior
- num_compressed = len(level_sizes)
- self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
- [self._x_column, self._y_column]
+ # Copy results to preprocessed_data
+ for key, value in levels_result.items():
+ if key == "num_levels":
+ self._preprocessed_data[
+ f"cat_num_levels_{filter_id}_{filter_value}"
+ ] = value
+ else:
+ self._preprocessed_data[key] = value
+
+ # Also create global levels for when no categorical filter is selected
+ # (fallback to standard behavior) - using cascading with 2× cache buffer
+ level_sizes = compute_compression_levels(cache_target, total)
+ self._preprocessed_data["level_sizes"] = level_sizes
+
+ # Build global cascading levels using helper
+ levels_result = self._build_cascading_levels(
+ source_data=self._raw_data,
+ level_sizes=level_sizes,
+ x_range=x_range,
+ y_range=y_range,
+ cache_dir=cache_dir,
+ prefix="level",
  )
- self._preprocessed_data['num_levels'] = num_compressed + 1
+
+ # Copy results to preprocessed_data
+ for key, value in levels_result.items():
+ if key == "num_levels":
+ self._preprocessed_data["num_levels"] = value
+ else:
+ self._preprocessed_data[key] = value
+
+ # Mark that files are already saved
+ self._preprocessed_data["_files_already_saved"] = True

  def _preprocess_streaming(self) -> None:
  """
- Streaming preprocessing - levels stay lazy through caching.
+ Streaming preprocessing with cascading - builds smaller levels from larger.
+
+ Uses cascading downsampling: each level is built from the previous larger
+ level rather than from raw data. This is more efficient (raw data read once)
+ and produces identical results because the downsampling algorithm keeps
+ the TOP N highest-intensity points per bin - points that survive at a larger
+ level will also be selected at smaller levels.
+
+ Levels are saved to disk immediately after creation, then read back as the
+ source for the next smaller level. This keeps memory low while enabling
+ cascading.

- Builds lazy query plans that are streamed to disk via sink_parquet().
  Data is sorted by x, y columns for efficient range query predicate pushdown.
  """
+ import sys
+
  # Get data ranges (minimal collect - just 4 values)
+ # These ranges are used for ALL levels to ensure consistent binning
  x_range, y_range = get_data_range(
  self._raw_data,
  self._x_column,
  self._y_column,
  )
- self._preprocessed_data['x_range'] = x_range
- self._preprocessed_data['y_range'] = y_range
+ self._preprocessed_data["x_range"] = x_range
+ self._preprocessed_data["y_range"] = y_range
+
+ # Compute optimal bins if not provided
+ # Cache at 2×min_points, use display_aspect_ratio for bin computation
+ cache_target = 2 * self._min_points
+ if self._x_bins is None or self._y_bins is None:
+ # Use display aspect ratio (not data aspect ratio) for optimal bins
+ # This ensures even distribution in the expected display dimensions
+ self._x_bins, self._y_bins = compute_optimal_bins(
+ cache_target,
+ (0, self._display_aspect_ratio), # Fake x_range matching aspect
+ (0, 1.0), # Fake y_range
+ )
+ print(
+ f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+ f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+ f"display aspect: {self._display_aspect_ratio:.2f})",
+ file=sys.stderr,
+ )

  # Get total count
  total = self._raw_data.select(pl.len()).collect().item()
- self._preprocessed_data['total'] = total
-
- # Compute target sizes for levels
- level_sizes = compute_compression_levels(self._min_points, total)
- self._preprocessed_data['level_sizes'] = level_sizes
-
- # Build and collect each level
- self._preprocessed_data['levels'] = []
+ self._preprocessed_data["total"] = total
+
+ # Compute target sizes for levels (use 2×min_points for smallest cache level)
+ level_sizes = compute_compression_levels(cache_target, total)
+ self._preprocessed_data["level_sizes"] = level_sizes
+
+ # Create cache directory for immediate level saving
+ cache_dir = self._cache_dir / "preprocessed"
+ cache_dir.mkdir(parents=True, exist_ok=True)
+
+ # Build cascading levels using helper
+ levels_result = self._build_cascading_levels(
+ source_data=self._raw_data,
+ level_sizes=level_sizes,
+ x_range=x_range,
+ y_range=y_range,
+ cache_dir=cache_dir,
+ prefix="level",
+ )

- for i, size in enumerate(level_sizes):
- # If target size equals total, skip downsampling - use all data
- if size >= total:
- level = self._raw_data
- elif self._use_simple_downsample:
- level = downsample_2d_simple(
- self._raw_data,
- max_points=size,
- intensity_column=self._intensity_column,
- )
+ # Copy results to preprocessed_data
+ for key, value in levels_result.items():
+ if key == "num_levels":
+ self._preprocessed_data["num_levels"] = value
  else:
- level = downsample_2d_streaming(
- self._raw_data,
- max_points=size,
- x_column=self._x_column,
- y_column=self._y_column,
- intensity_column=self._intensity_column,
- x_bins=self._x_bins,
- y_bins=self._y_bins,
- x_range=x_range,
- y_range=y_range,
- )
- # Sort by x, y for efficient range query predicate pushdown
- # This clusters spatially close points together in row groups
- level = level.sort([self._x_column, self._y_column])
- # Store LazyFrame for streaming to disk
- # Base class will use sink_parquet() to stream without full materialization
- self._preprocessed_data[f'level_{i}'] = level # Keep lazy
-
- # Add full resolution as final level (for zoom fallback)
- # Also sorted for consistent predicate pushdown behavior
- num_compressed = len(level_sizes)
- self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
- [self._x_column, self._y_column]
- )
+ self._preprocessed_data[key] = value

- # Store number of levels for reconstruction (includes full resolution)
- self._preprocessed_data['num_levels'] = num_compressed + 1
+ # Mark that files are already saved (base class should skip saving)
+ self._preprocessed_data["_files_already_saved"] = True

  def _preprocess_eager(self) -> None:
  """
@@ -444,22 +602,41 @@ class Heatmap(BaseComponent):
  downsampling for better spatial distribution.
  Data is sorted by x, y columns for efficient range query predicate pushdown.
  """
+ import sys
+
  # Get data ranges
  x_range, y_range = get_data_range(
  self._raw_data,
  self._x_column,
  self._y_column,
  )
- self._preprocessed_data['x_range'] = x_range
- self._preprocessed_data['y_range'] = y_range
+ self._preprocessed_data["x_range"] = x_range
+ self._preprocessed_data["y_range"] = y_range
+
+ # Compute optimal bins if not provided
+ # Cache at 2×min_points, use display_aspect_ratio for bin computation
+ cache_target = 2 * self._min_points
+ if self._x_bins is None or self._y_bins is None:
+ # Use display aspect ratio (not data aspect ratio) for optimal bins
+ self._x_bins, self._y_bins = compute_optimal_bins(
+ cache_target,
+ (0, self._display_aspect_ratio), # Fake x_range matching aspect
+ (0, 1.0), # Fake y_range
+ )
+ print(
+ f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+ f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+ f"display aspect: {self._display_aspect_ratio:.2f})",
+ file=sys.stderr,
+ )

  # Get total count
  total = self._raw_data.select(pl.len()).collect().item()
- self._preprocessed_data['total'] = total
+ self._preprocessed_data["total"] = total

- # Compute compression level target sizes
- level_sizes = compute_compression_levels(self._min_points, total)
- self._preprocessed_data['level_sizes'] = level_sizes
+ # Compute compression level target sizes (2× for cache buffer)
+ level_sizes = compute_compression_levels(cache_target, total)
+ self._preprocessed_data["level_sizes"] = level_sizes

  # Build levels from largest to smallest
  if level_sizes:
@@ -493,21 +670,23 @@ class Heatmap(BaseComponent):
  # Store LazyFrame for streaming to disk
  level_idx = len(level_sizes) - 1 - i
  if isinstance(downsampled, pl.LazyFrame):
- self._preprocessed_data[f'level_{level_idx}'] = downsampled # Keep lazy
+ self._preprocessed_data[f"level_{level_idx}"] = (
+ downsampled # Keep lazy
+ )
  else:
  # DataFrame from downsample_2d - convert back to lazy
- self._preprocessed_data[f'level_{level_idx}'] = downsampled.lazy()
+ self._preprocessed_data[f"level_{level_idx}"] = downsampled.lazy()
  current = downsampled

  # Add full resolution as final level (for zoom fallback)
  # Also sorted for consistent predicate pushdown behavior
  num_compressed = len(level_sizes)
- self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+ self._preprocessed_data[f"level_{num_compressed}"] = self._raw_data.sort(
  [self._x_column, self._y_column]
  )

  # Store number of levels for reconstruction (includes full resolution)
- self._preprocessed_data['num_levels'] = num_compressed + 1
+ self._preprocessed_data["num_levels"] = num_compressed + 1

  def _get_levels(self) -> list:
  """
@@ -516,11 +695,11 @@ class Heatmap(BaseComponent):
  Reconstructs the levels list from preprocessed data,
  adding full resolution at the end.
  """
- num_levels = self._preprocessed_data.get('num_levels', 0)
+ num_levels = self._preprocessed_data.get("num_levels", 0)
  levels = []

  for i in range(num_levels):
- level_data = self._preprocessed_data.get(f'level_{i}')
+ level_data = self._preprocessed_data.get(f"level_{i}")
  if level_data is not None:
  levels.append(level_data)

@@ -543,7 +722,7 @@ class Heatmap(BaseComponent):
  Returns ([], None) if no categorical levels exist for this filter
  """
  # Check if we have categorical levels for this filter/value
- num_levels_key = f'cat_num_levels_{filter_id}_{filter_value}'
+ num_levels_key = f"cat_num_levels_{filter_id}_{filter_value}"
  num_levels = self._preprocessed_data.get(num_levels_key, 0)

  if num_levels == 0:
@@ -551,14 +730,16 @@ class Heatmap(BaseComponent):

  levels = []
  for i in range(num_levels):
- level_key = f'cat_level_{filter_id}_{filter_value}_{i}'
+ level_key = f"cat_level_{filter_id}_{filter_value}_{i}"
  level_data = self._preprocessed_data.get(level_key)
  if level_data is not None:
  levels.append(level_data)

  return levels, None # Full resolution included in cached levels

- def _get_levels_for_state(self, state: Dict[str, Any]) -> Tuple[list, Optional[pl.LazyFrame]]:
+ def _get_levels_for_state(
+ self, state: Dict[str, Any]
+ ) -> Tuple[list, Optional[pl.LazyFrame]]:
  """
  Get appropriate compression levels based on current filter state.

@@ -573,8 +754,10 @@ class Heatmap(BaseComponent):
  Tuple of (levels list, raw data for full resolution)
  """
  # Check if we have categorical filters and a selected value
- if self._preprocessed_data.get('has_categorical_filters'):
- cat_filter_values = self._preprocessed_data.get('categorical_filter_values', {})
+ if self._preprocessed_data.get("has_categorical_filters"):
+ cat_filter_values = self._preprocessed_data.get(
+ "categorical_filter_values", {}
+ )

  for filter_id in self._categorical_filters:
  if filter_id not in cat_filter_values:
@@ -590,7 +773,9 @@ class Heatmap(BaseComponent):

  # Check if this value has per-filter levels
  if selected_value in cat_filter_values[filter_id]:
- levels, filtered_raw = self._get_categorical_levels(filter_id, selected_value)
+ levels, filtered_raw = self._get_categorical_levels(
+ filter_id, selected_value
+ )
  if levels:
  return levels, filtered_raw

@@ -599,22 +784,19 @@ class Heatmap(BaseComponent):

  def _get_vue_component_name(self) -> str:
  """Return the Vue component name."""
- return 'PlotlyHeatmap'
+ return "PlotlyHeatmap"

  def _get_data_key(self) -> str:
  """Return the key used to send primary data to Vue."""
- return 'heatmapData'
+ return "heatmapData"

  def _is_no_zoom(self, zoom: Optional[Dict[str, Any]]) -> bool:
  """Check if zoom state represents no zoom (full view)."""
  if zoom is None:
  return True
- x_range = zoom.get('xRange', [-1, -1])
- y_range = zoom.get('yRange', [-1, -1])
- return (
- x_range[0] < 0 and x_range[1] < 0 and
- y_range[0] < 0 and y_range[1] < 0
- )
+ x_range = zoom.get("xRange", [-1, -1])
+ y_range = zoom.get("yRange", [-1, -1])
+ return x_range[0] < 0 and x_range[1] < 0 and y_range[0] < 0 and y_range[1] < 0

  def _select_level_for_zoom(
  self,
@@ -641,8 +823,9 @@ class Heatmap(BaseComponent):
  Filtered Polars DataFrame at appropriate resolution
  """
  import sys
- x0, x1 = zoom['xRange']
- y0, y1 = zoom['yRange']
+
+ x0, x1 = zoom["xRange"]
+ y0, y1 = zoom["yRange"]

  # Add raw data as final level if available
  all_levels = list(levels)
@@ -658,10 +841,10 @@ class Heatmap(BaseComponent):

  # Filter to zoom range
  filtered_lazy = level_data.filter(
- (pl.col(self._x_column) >= x0) &
- (pl.col(self._x_column) <= x1) &
- (pl.col(self._y_column) >= y0) &
- (pl.col(self._y_column) <= y1)
+ (pl.col(self._x_column) >= x0)
+ & (pl.col(self._x_column) <= x1)
+ & (pl.col(self._y_column) >= y0)
+ & (pl.col(self._y_column) <= y1)
  )

  # Apply non-categorical filters if any
@@ -680,15 +863,26 @@ class Heatmap(BaseComponent):

  count = len(filtered)
  last_filtered = filtered
- print(f"[HEATMAP] Level {level_idx}: {count} pts in zoom range", file=sys.stderr)
+ print(
+ f"[HEATMAP] Level {level_idx}: {count} pts in zoom range",
+ file=sys.stderr,
+ )

  if count >= self._min_points:
  # This level has enough detail
  if count > self._min_points:
- # Over limit - downsample to stay at/under max
- # Use ZOOM range for binning (not global) to avoid sparse bins
+ # Over limit - downsample to exactly min_points
+ # Compute optimal bins from ACTUAL zoom region aspect ratio
  zoom_x_range = (x0, x1)
  zoom_y_range = (y0, y1)
+ render_x_bins, render_y_bins = compute_optimal_bins(
+ self._min_points, zoom_x_range, zoom_y_range
+ )
+ print(
+ f"[HEATMAP] Render downsample: {count:,} → {self._min_points:,} pts "
+ f"(bins: {render_x_bins}x{render_y_bins})",
+ file=sys.stderr,
+ )
  if self._use_streaming or self._use_simple_downsample:
  if self._use_simple_downsample:
  return downsample_2d_simple(
@@ -703,8 +897,8 @@ class Heatmap(BaseComponent):
  x_column=self._x_column,
  y_column=self._y_column,
  intensity_column=self._intensity_column,
- x_bins=self._x_bins,
- y_bins=self._y_bins,
+ x_bins=render_x_bins,
+ y_bins=render_y_bins,
  x_range=zoom_x_range,
  y_range=zoom_y_range,
  ).collect()
@@ -715,8 +909,8 @@ class Heatmap(BaseComponent):
  x_column=self._x_column,
  y_column=self._y_column,
  intensity_column=self._intensity_column,
- x_bins=self._x_bins,
- y_bins=self._y_bins,
+ x_bins=render_x_bins,
+ y_bins=render_y_bins,
  ).collect()
  return filtered
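The loop these hunks touch scans levels from coarsest to finest and returns the first one with at least min_points inside the zoom window, trimming any overshoot back to exactly min_points with bins matched to the zoom region. Its control flow, distilled into a hypothetical standalone form:

```python
from typing import Callable, Optional, Sequence, TypeVar

L = TypeVar("L")

def select_level(
    levels: Sequence[L],
    count_in_zoom: Callable[[L], int],
    min_points: int,
    downsample: Callable[[L, int], L],
) -> Optional[L]:
    """Walk levels coarse → fine; return the first with enough points
    in the zoom window, downsampling any overshoot to min_points."""
    last = None
    for level in levels:
        n = count_in_zoom(level)
        last = level
        if n >= min_points:
            return downsample(level, min_points) if n > min_points else level
    return last  # even the finest level is sparse here; return what we have
```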
@@ -740,6 +934,7 @@ class Heatmap(BaseComponent):
  Dict with heatmapData (pandas DataFrame) and _hash for change detection
  """
  import sys
+
  zoom = state.get(self._zoom_identifier)

  # Build columns to select
@@ -761,7 +956,9 @@ class Heatmap(BaseComponent):

  # Get levels based on current state (may use per-filter levels)
  levels, filtered_raw = self._get_levels_for_state(state)
- level_sizes = [len(l) if isinstance(l, pl.DataFrame) else '?' for l in levels]
+ level_sizes = [
+ len(lvl) if isinstance(lvl, pl.DataFrame) else "?" for lvl in levels
+ ]

  # Determine which filters still need to be applied at render time
  # (filters not in categorical_filters need runtime application)
@@ -775,12 +972,15 @@ class Heatmap(BaseComponent):
  # No zoom - use smallest level
  if not levels:
  # No levels available
- print(f"[HEATMAP] No levels available", file=sys.stderr)
- return {'heatmapData': pl.DataFrame().to_pandas(), '_hash': ''}
+ print("[HEATMAP] No levels available", file=sys.stderr)
+ return {"heatmapData": pl.DataFrame().to_pandas(), "_hash": ""}

  data = levels[0]
- using_cat = self._preprocessed_data.get('has_categorical_filters', False)
- print(f"[HEATMAP] No zoom → level 0 ({level_sizes[0]} pts), levels={level_sizes}, categorical={using_cat}", file=sys.stderr)
+ using_cat = self._preprocessed_data.get("has_categorical_filters", False)
+ print(
+ f"[HEATMAP] No zoom → level 0 ({level_sizes[0]} pts), levels={level_sizes}, categorical={using_cat}",
+ file=sys.stderr,
+ )

  # Ensure we have a LazyFrame
  if isinstance(data, pl.DataFrame):
@@ -796,7 +996,9 @@ class Heatmap(BaseComponent):
  filter_defaults=self._filter_defaults,
  )
  # Sort by intensity ascending so high-intensity points are drawn on top
- df_pandas = df_pandas.sort_values(self._intensity_column).reset_index(drop=True)
+ df_pandas = df_pandas.sort_values(self._intensity_column).reset_index(
+ drop=True
+ )
  else:
  # No filters to apply - levels already filtered by categorical filter
  schema_names = data.collect_schema().names()
@@ -817,13 +1019,16 @@ class Heatmap(BaseComponent):
  df_polars = df_polars.select(available_cols)
  # Sort by intensity ascending so high-intensity points are drawn on top
  df_polars = df_polars.sort(self._intensity_column)
- print(f"[HEATMAP] Selected {len(df_polars)} pts for zoom, levels={level_sizes}", file=sys.stderr)
+ print(
+ f"[HEATMAP] Selected {len(df_polars)} pts for zoom, levels={level_sizes}",
+ file=sys.stderr,
+ )
  data_hash = compute_dataframe_hash(df_polars)
  df_pandas = df_polars.to_pandas()

  return {
- 'heatmapData': df_pandas,
- '_hash': data_hash,
+ "heatmapData": df_pandas,
+ "_hash": data_hash,
  }

  def _get_component_args(self) -> Dict[str, Any]:
@@ -834,19 +1039,19 @@ class Heatmap(BaseComponent):
  Dict with all heatmap configuration for Vue
  """
  args: Dict[str, Any] = {
- 'componentType': self._get_vue_component_name(),
- 'xColumn': self._x_column,
- 'yColumn': self._y_column,
- 'intensityColumn': self._intensity_column,
- 'xLabel': self._x_label,
- 'yLabel': self._y_label,
- 'colorscale': self._colorscale,
- 'zoomIdentifier': self._zoom_identifier,
- 'interactivity': self._interactivity,
+ "componentType": self._get_vue_component_name(),
+ "xColumn": self._x_column,
+ "yColumn": self._y_column,
+ "intensityColumn": self._intensity_column,
+ "xLabel": self._x_label,
+ "yLabel": self._y_label,
+ "colorscale": self._colorscale,
+ "zoomIdentifier": self._zoom_identifier,
+ "interactivity": self._interactivity,
  }

  if self._title:
- args['title'] = self._title
+ args["title"] = self._title

  # Add any extra config options
  args.update(self._config)
@@ -858,7 +1063,7 @@ class Heatmap(BaseComponent):
  colorscale: Optional[str] = None,
  x_label: Optional[str] = None,
  y_label: Optional[str] = None,
- ) -> 'Heatmap':
+ ) -> "Heatmap":
  """
  Update heatmap styling.