openms-insight 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openms_insight/components/heatmap.py +280 -137
- openms_insight/core/base.py +22 -9
- openms_insight/preprocessing/compression.py +55 -1
- {openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/METADATA +1 -1
- {openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/RECORD +7 -7
- {openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/WHEEL +0 -0
- {openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/licenses/LICENSE +0 -0
openms_insight/components/heatmap.py CHANGED
@@ -8,6 +8,7 @@ from ..core.base import BaseComponent
 from ..core.registry import register_component
 from ..preprocessing.compression import (
     compute_compression_levels,
+    compute_optimal_bins,
     downsample_2d,
     downsample_2d_simple,
     downsample_2d_streaming,
@@ -76,9 +77,10 @@ class Heatmap(BaseComponent):
         interactivity: Optional[Dict[str, str]] = None,
         cache_path: str = ".",
         regenerate_cache: bool = False,
-        min_points: int = …
-        …
+        min_points: int = 10000,
+        display_aspect_ratio: float = 16 / 9,
+        x_bins: Optional[int] = None,
+        y_bins: Optional[int] = None,
         zoom_identifier: str = "heatmap_zoom",
         title: Optional[str] = None,
         x_label: Optional[str] = None,
@@ -106,10 +108,17 @@ class Heatmap(BaseComponent):
             point's value in the corresponding column.
         cache_path: Base path for cache storage. Default "." (current dir).
         regenerate_cache: If True, regenerate cache even if valid cache exists.
-        min_points: Target …
-        …
+        min_points: Target number of points to display (default: 10000).
+            Cache levels are built at 2× this value; final downsample
+            at render time reduces to exactly min_points.
+        display_aspect_ratio: Expected display width/height ratio for
+            optimal bin computation during caching (default: 16/9).
+            At render time, the actual zoom region's aspect ratio is used.
+        x_bins: Number of bins along x-axis for downsampling. If None
+            (default), auto-computed from display_aspect_ratio such that
+            x_bins × y_bins ≈ 2×min_points with even spatial distribution.
+        y_bins: Number of bins along y-axis for downsampling. If None
+            (default), auto-computed from display_aspect_ratio.
         zoom_identifier: State key for storing zoom range (default: 'heatmap_zoom')
         title: Heatmap title displayed above the plot
         x_label: X-axis label (defaults to x_column)
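
Note on the 2× budget in the docstring above: cache levels keep twice the display budget so the render-time pass always has enough material to cut down to an exact count. A rough sketch of the arithmetic, illustrative only and not library code:

    min_points = 10_000              # points actually drawn
    cache_target = 2 * min_points    # coarsest cached level keeps a 2x buffer

    # Render time: the selected cache level is reduced to at most min_points,
    # so the drawn point count stays stable across zoom levels.
    def points_drawn(level_count: int) -> int:
        return min(level_count, min_points)

    assert points_drawn(cache_target) == 10_000  # buffered level -> exact budget
    assert points_drawn(3_000) == 3_000          # sparse region -> keep everything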
@@ -130,6 +139,7 @@ class Heatmap(BaseComponent):
         self._y_column = y_column
         self._intensity_column = intensity_column
         self._min_points = min_points
+        self._display_aspect_ratio = display_aspect_ratio
         self._x_bins = x_bins
         self._y_bins = y_bins
         self._zoom_identifier = zoom_identifier
@@ -155,6 +165,7 @@ class Heatmap(BaseComponent):
             y_column=y_column,
             intensity_column=intensity_column,
             min_points=min_points,
+            display_aspect_ratio=display_aspect_ratio,
             x_bins=x_bins,
             y_bins=y_bins,
             zoom_identifier=zoom_identifier,
@@ -180,6 +191,7 @@ class Heatmap(BaseComponent):
             "y_column": self._y_column,
             "intensity_column": self._intensity_column,
             "min_points": self._min_points,
+            "display_aspect_ratio": self._display_aspect_ratio,
             "x_bins": self._x_bins,
             "y_bins": self._y_bins,
             "use_simple_downsample": self._use_simple_downsample,
@@ -197,7 +209,10 @@ class Heatmap(BaseComponent):
         self._x_column = config.get("x_column")
         self._y_column = config.get("y_column")
         self._intensity_column = config.get("intensity_column", "intensity")
-        self._min_points = config.get("min_points", …)
+        self._min_points = config.get("min_points", 10000)
+        self._display_aspect_ratio = config.get("display_aspect_ratio", 16 / 9)
+        # x_bins/y_bins are computed during preprocessing and stored in cache
+        # Fallback to old defaults for backward compatibility with old caches
         self._x_bins = config.get("x_bins", 400)
         self._y_bins = config.get("y_bins", 50)
         self._use_simple_downsample = config.get("use_simple_downsample", False)
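
The backward-compatibility comment implies a simple round-trip: configs written by 0.1.4 carry the auto-computed bin counts, while configs from 0.1.3 caches lack the keys and fall back to the historical 400×50 grid. A small illustration with hypothetical config dicts:

    # Hypothetical cached configs, showing the from_config() fallback path:
    new_cache = {"min_points": 10000, "display_aspect_ratio": 16 / 9,
                 "x_bins": 188, "y_bins": 106}   # written by 0.1.4
    old_cache = {"min_points": 10000}            # written by 0.1.3

    for config in (new_cache, old_cache):
        x_bins = config.get("x_bins", 400)  # old default kept for old caches
        y_bins = config.get("y_bins", 50)
        print(x_bins, y_bins)               # 188 106, then 400 50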
@@ -242,14 +257,116 @@ class Heatmap(BaseComponent):
         else:
             self._preprocess_eager()
 
+    def _build_cascading_levels(
+        self,
+        source_data: pl.LazyFrame,
+        level_sizes: list,
+        x_range: tuple,
+        y_range: tuple,
+        cache_dir,
+        prefix: str = "level",
+    ) -> dict:
+        """
+        Build cascading compression levels from source data.
+
+        Each level is built from the previous larger level rather than from
+        raw data. This is efficient (raw data read once) and produces identical
+        results because the downsampling keeps top N highest-intensity points
+        per bin - points surviving at larger levels will also be selected at
+        smaller levels.
+
+        Args:
+            source_data: LazyFrame with raw/filtered data
+            level_sizes: List of target sizes for compressed levels (smallest first)
+            x_range: (x_min, x_max) for consistent bin boundaries
+            y_range: (y_min, y_max) for consistent bin boundaries
+            cache_dir: Path to save parquet files
+            prefix: Filename prefix (e.g., "level" or "cat_level_im_0")
+
+        Returns:
+            Dict with level LazyFrames keyed by "{prefix}_{idx}" and "num_levels"
+        """
+        import sys
+
+        result = {}
+        num_compressed = len(level_sizes)
+
+        # Get total count
+        total = source_data.select(pl.len()).collect().item()
+
+        # First: save full resolution as the largest level
+        full_res_path = cache_dir / f"{prefix}_{num_compressed}.parquet"
+        full_res = source_data.sort([self._x_column, self._y_column])
+        full_res.sink_parquet(full_res_path, compression="zstd")
+        print(
+            f"[HEATMAP] Saved {prefix}_{num_compressed} ({total:,} pts)",
+            file=sys.stderr,
+        )
+
+        # Start cascading from full resolution
+        current_source = pl.scan_parquet(full_res_path)
+        current_size = total
+
+        # Build compressed levels from largest to smallest
+        for i, target_size in enumerate(reversed(level_sizes)):
+            level_idx = num_compressed - 1 - i
+            level_path = cache_dir / f"{prefix}_{level_idx}.parquet"
+
+            # If target size equals or exceeds current, just copy reference
+            if target_size >= current_size:
+                level = current_source
+            elif self._use_simple_downsample:
+                level = downsample_2d_simple(
+                    current_source,
+                    max_points=target_size,
+                    intensity_column=self._intensity_column,
+                )
+            else:
+                level = downsample_2d_streaming(
+                    current_source,
+                    max_points=target_size,
+                    x_column=self._x_column,
+                    y_column=self._y_column,
+                    intensity_column=self._intensity_column,
+                    x_bins=self._x_bins,
+                    y_bins=self._y_bins,
+                    x_range=x_range,
+                    y_range=y_range,
+                )
+
+            # Sort and save immediately
+            level = level.sort([self._x_column, self._y_column])
+            level.sink_parquet(level_path, compression="zstd")
+
+            print(
+                f"[HEATMAP] Saved {prefix}_{level_idx} (target {target_size:,} pts)",
+                file=sys.stderr,
+            )
+
+            # Next iteration uses this level as source (cascading)
+            current_source = pl.scan_parquet(level_path)
+            current_size = target_size
+
+        # Load all levels back as LazyFrames
+        for i in range(num_compressed + 1):
+            level_path = cache_dir / f"{prefix}_{i}.parquet"
+            result[f"{prefix}_{i}"] = pl.scan_parquet(level_path)
+
+        result["num_levels"] = num_compressed + 1
+
+        return result
+
     def _preprocess_with_categorical_filters(self) -> None:
         """
-        Preprocess with per-filter-value compression levels.
+        Preprocess with per-filter-value compression levels using cascading.
 
         For each unique value of each categorical filter, creates separate
-        compression levels …
-        …
-        filter value selected.
+        compression levels using cascading (building smaller levels from larger).
+        This ensures that when a filter is applied at render time, the resulting
+        data has ~min_points regardless of the filter value selected.
+
+        Uses cascading downsampling for efficiency - each level is built from
+        the previous larger level rather than from raw data.
 
         Data is sorted by x, y columns for efficient range query predicate pushdown.
 
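
The correctness claim in the docstring (cascading equals a single pass) holds because top-N-by-intensity selection per bin is stable under composition when the bin grid stays fixed, which the method guarantees by passing the same x_range/y_range to every level. A self-contained check of that invariant in plain Python, simplified to a per-bin quota instead of a global point budget:

    from collections import defaultdict

    def top_k_per_bin(points, k):
        # points: (bin_id, intensity) pairs; keep the k most intense per bin
        bins = defaultdict(list)
        for p in points:
            bins[p[0]].append(p)
        kept = []
        for members in bins.values():
            kept.extend(sorted(members, key=lambda p: p[1], reverse=True)[:k])
        return sorted(kept)

    points = [(b, i) for b in range(4) for i in range(100)]
    direct = top_k_per_bin(points, 5)                       # straight from raw data
    cascaded = top_k_per_bin(top_k_per_bin(points, 50), 5)  # via a larger level
    assert direct == cascaded  # survivors at k=5 also survived at k=50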
@@ -261,6 +378,7 @@ class Heatmap(BaseComponent):
         import sys
 
         # Get data ranges (for the full dataset)
+        # These ranges are used for ALL levels to ensure consistent binning
         x_range, y_range = get_data_range(
             self._raw_data,
             self._x_column,
@@ -269,10 +387,31 @@ class Heatmap(BaseComponent):
         self._preprocessed_data["x_range"] = x_range
         self._preprocessed_data["y_range"] = y_range
 
+        # Compute optimal bins if not provided
+        # Cache at 2×min_points, use display_aspect_ratio for bin computation
+        cache_target = 2 * self._min_points
+        if self._x_bins is None or self._y_bins is None:
+            # Use display aspect ratio (not data aspect ratio) for optimal bins
+            self._x_bins, self._y_bins = compute_optimal_bins(
+                cache_target,
+                (0, self._display_aspect_ratio),  # Fake x_range matching aspect
+                (0, 1.0),  # Fake y_range
+            )
+            print(
+                f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+                f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+                f"display aspect: {self._display_aspect_ratio:.2f})",
+                file=sys.stderr,
+            )
+
         # Get total count
         total = self._raw_data.select(pl.len()).collect().item()
         self._preprocessed_data["total"] = total
 
+        # Create cache directory for immediate level saving
+        cache_dir = self._cache_dir / "preprocessed"
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
         # Store metadata about categorical filters
         self._preprocessed_data["has_categorical_filters"] = True
         self._preprocessed_data["categorical_filter_values"] = {}
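
The synthetic ranges passed to compute_optimal_bins deserve a note: the function only looks at the ratio of the spans, so a unit-height box whose width equals the display aspect makes the grid match the screen rather than the data's units. With the defaults, the call resolves to roughly:

    # Illustrative call; the values follow from the defaults above.
    x_bins, y_bins = compute_optimal_bins(
        20_000,        # cache_target = 2 * min_points
        (0, 16 / 9),   # fake x_range: span equals the display aspect ratio
        (0, 1.0),      # fake y_range: span of 1
    )
    # aspect 16/9 -> x_bins = 188, y_bins = 106; 188 * 106 = 19,928 ≈ 20,000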
@@ -309,7 +448,7 @@ class Heatmap(BaseComponent):
             unique_values
         )
 
-        # Create compression levels for each filter value
+        # Create compression levels for each filter value using cascading
        for filter_value in unique_values:
             # Filter data to this value
             filtered_data = self._raw_data.filter(
@@ -317,9 +456,9 @@ class Heatmap(BaseComponent):
             )
             filtered_total = filtered_data.select(pl.len()).collect().item()
 
-            # Compute level sizes for this filtered subset
+            # Compute level sizes for this filtered subset (2× for cache buffer)
             level_sizes = compute_compression_levels(
-                …
+                cache_target, filtered_total
             )
 
             print(
@@ -332,94 +471,71 @@ class Heatmap(BaseComponent):
                 f"cat_level_sizes_{filter_id}_{filter_value}"
             ] = level_sizes
 
-            # Build …
-            …
-                    intensity_column=self._intensity_column,
-                )
-            else:
-                level = downsample_2d_streaming(
-                    filtered_data,
-                    max_points=target_size,
-                    x_column=self._x_column,
-                    y_column=self._y_column,
-                    intensity_column=self._intensity_column,
-                    x_bins=self._x_bins,
-                    y_bins=self._y_bins,
-                    x_range=x_range,
-                    y_range=y_range,
-                )
-
-            # Sort by x, y for efficient range query predicate pushdown
-            level = level.sort([self._x_column, self._y_column])
-            # Store LazyFrame for streaming to disk
-            level_key = f"cat_level_{filter_id}_{filter_value}_{level_idx}"
-            self._preprocessed_data[level_key] = level  # Keep lazy
-
-            # Add full resolution as final level (for zoom fallback)
-            # Also sorted for consistent predicate pushdown behavior
-            num_compressed = len(level_sizes)
-            full_res_key = f"cat_level_{filter_id}_{filter_value}_{num_compressed}"
-            self._preprocessed_data[full_res_key] = filtered_data.sort(
-                [self._x_column, self._y_column]
+            # Build cascading levels using helper
+            prefix = f"cat_level_{filter_id}_{filter_value}"
+            levels_result = self._build_cascading_levels(
+                source_data=filtered_data,
+                level_sizes=level_sizes,
+                x_range=x_range,
+                y_range=y_range,
+                cache_dir=cache_dir,
+                prefix=prefix,
             )
-            …
+
+            # Copy results to preprocessed_data
+            for key, value in levels_result.items():
+                if key == "num_levels":
+                    self._preprocessed_data[
+                        f"cat_num_levels_{filter_id}_{filter_value}"
+                    ] = value
+                else:
+                    self._preprocessed_data[key] = value
 
         # Also create global levels for when no categorical filter is selected
-        # (fallback to standard behavior)
-        level_sizes = compute_compression_levels(…)
+        # (fallback to standard behavior) - using cascading with 2× cache buffer
+        level_sizes = compute_compression_levels(cache_target, total)
         self._preprocessed_data["level_sizes"] = level_sizes
 
-        …
+        # Build global cascading levels using helper
+        levels_result = self._build_cascading_levels(
+            source_data=self._raw_data,
+            level_sizes=level_sizes,
+            x_range=x_range,
+            y_range=y_range,
+            cache_dir=cache_dir,
+            prefix="level",
+        )
+
+        # Copy results to preprocessed_data
+        for key, value in levels_result.items():
+            if key == "num_levels":
+                self._preprocessed_data["num_levels"] = value
             else:
-                …
-                    self._raw_data,
-                    max_points=size,
-                    x_column=self._x_column,
-                    y_column=self._y_column,
-                    intensity_column=self._intensity_column,
-                    x_bins=self._x_bins,
-                    y_bins=self._y_bins,
-                    x_range=x_range,
-                    y_range=y_range,
-                )
-            # Sort by x, y for efficient range query predicate pushdown
-            level = level.sort([self._x_column, self._y_column])
-            self._preprocessed_data[f"level_{i}"] = level  # Keep lazy
+                self._preprocessed_data[key] = value
 
-        # …
-        num_compressed = len(level_sizes)
-        self._preprocessed_data[f"level_{num_compressed}"] = self._raw_data.sort(
-            [self._x_column, self._y_column]
-        )
-        …
+        # Mark that files are already saved
+        self._preprocessed_data["_files_already_saved"] = True
 
     def _preprocess_streaming(self) -> None:
         """
-        Streaming preprocessing - …
+        Streaming preprocessing with cascading - builds smaller levels from larger.
+
+        Uses cascading downsampling: each level is built from the previous larger
+        level rather than from raw data. This is more efficient (raw data read once)
+        and produces identical results because the downsampling algorithm keeps
+        the TOP N highest-intensity points per bin - points that survive at a larger
+        level will also be selected at smaller levels.
+
+        Levels are saved to disk immediately after creation, then read back as the
+        source for the next smaller level. This keeps memory low while enabling
+        cascading.
 
-        Builds lazy query plans that are streamed to disk via sink_parquet().
         Data is sorted by x, y columns for efficient range query predicate pushdown.
         """
+        import sys
+
         # Get data ranges (minimal collect - just 4 values)
+        # These ranges are used for ALL levels to ensure consistent binning
         x_range, y_range = get_data_range(
             self._raw_data,
             self._x_column,
@@ -428,55 +544,55 @@ class Heatmap(BaseComponent):
         self._preprocessed_data["x_range"] = x_range
         self._preprocessed_data["y_range"] = y_range
 
+        # Compute optimal bins if not provided
+        # Cache at 2×min_points, use display_aspect_ratio for bin computation
+        cache_target = 2 * self._min_points
+        if self._x_bins is None or self._y_bins is None:
+            # Use display aspect ratio (not data aspect ratio) for optimal bins
+            # This ensures even distribution in the expected display dimensions
+            self._x_bins, self._y_bins = compute_optimal_bins(
+                cache_target,
+                (0, self._display_aspect_ratio),  # Fake x_range matching aspect
+                (0, 1.0),  # Fake y_range
+            )
+            print(
+                f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+                f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+                f"display aspect: {self._display_aspect_ratio:.2f})",
+                file=sys.stderr,
+            )
+
         # Get total count
         total = self._raw_data.select(pl.len()).collect().item()
         self._preprocessed_data["total"] = total
 
-        # Compute target sizes for levels
-        level_sizes = compute_compression_levels(…)
+        # Compute target sizes for levels (use 2×min_points for smallest cache level)
+        level_sizes = compute_compression_levels(cache_target, total)
         self._preprocessed_data["level_sizes"] = level_sizes
 
-        # …
-        self.…
+        # Create cache directory for immediate level saving
+        cache_dir = self._cache_dir / "preprocessed"
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        # Build cascading levels using helper
+        levels_result = self._build_cascading_levels(
+            source_data=self._raw_data,
+            level_sizes=level_sizes,
+            x_range=x_range,
+            y_range=y_range,
+            cache_dir=cache_dir,
+            prefix="level",
+        )
 
-        …
-        …
-            if …
-                …
-            elif self._use_simple_downsample:
-                level = downsample_2d_simple(
-                    self._raw_data,
-                    max_points=size,
-                    intensity_column=self._intensity_column,
-                )
+        # Copy results to preprocessed_data
+        for key, value in levels_result.items():
+            if key == "num_levels":
+                self._preprocessed_data["num_levels"] = value
            else:
-                …
-                    self._raw_data,
-                    max_points=size,
-                    x_column=self._x_column,
-                    y_column=self._y_column,
-                    intensity_column=self._intensity_column,
-                    x_bins=self._x_bins,
-                    y_bins=self._y_bins,
-                    x_range=x_range,
-                    y_range=y_range,
-                )
-            # Sort by x, y for efficient range query predicate pushdown
-            # This clusters spatially close points together in row groups
-            level = level.sort([self._x_column, self._y_column])
-            # Store LazyFrame for streaming to disk
-            # Base class will use sink_parquet() to stream without full materialization
-            self._preprocessed_data[f"level_{i}"] = level  # Keep lazy
+                self._preprocessed_data[key] = value
 
-        # …
-        num_compressed = len(level_sizes)
-        self._preprocessed_data[f"level_{num_compressed}"] = self._raw_data.sort(
-            [self._x_column, self._y_column]
-        )
-
-        # Store number of levels for reconstruction (includes full resolution)
-        self._preprocessed_data["num_levels"] = num_compressed + 1
+        # Mark that files are already saved (base class should skip saving)
+        self._preprocessed_data["_files_already_saved"] = True
 
     def _preprocess_eager(self) -> None:
         """
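
The save-then-rescan pattern this docstring describes maps onto a short polars loop. A condensed sketch of the cascade (the shrink callable stands in for downsample_2d_streaming, cache_dir is a pathlib.Path, and sizes are smallest-first as produced by compute_compression_levels):

    import polars as pl

    def cascade(source: pl.LazyFrame, sizes: list, cache_dir, shrink) -> None:
        # Write level_N (full resolution) first, then derive each smaller
        # level from the previous one instead of from the raw data.
        n = len(sizes)
        source.sink_parquet(cache_dir / f"level_{n}.parquet")
        current = pl.scan_parquet(cache_dir / f"level_{n}.parquet")
        for i, target in enumerate(reversed(sizes)):
            idx = n - 1 - i
            shrink(current, target).sink_parquet(cache_dir / f"level_{idx}.parquet")
            # Re-open lazily so the next pass streams from disk, not memory
            current = pl.scan_parquet(cache_dir / f"level_{idx}.parquet")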
@@ -486,6 +602,8 @@ class Heatmap(BaseComponent):
         downsampling for better spatial distribution.
         Data is sorted by x, y columns for efficient range query predicate pushdown.
         """
+        import sys
+
         # Get data ranges
         x_range, y_range = get_data_range(
             self._raw_data,
@@ -495,12 +613,29 @@ class Heatmap(BaseComponent):
         self._preprocessed_data["x_range"] = x_range
         self._preprocessed_data["y_range"] = y_range
 
+        # Compute optimal bins if not provided
+        # Cache at 2×min_points, use display_aspect_ratio for bin computation
+        cache_target = 2 * self._min_points
+        if self._x_bins is None or self._y_bins is None:
+            # Use display aspect ratio (not data aspect ratio) for optimal bins
+            self._x_bins, self._y_bins = compute_optimal_bins(
+                cache_target,
+                (0, self._display_aspect_ratio),  # Fake x_range matching aspect
+                (0, 1.0),  # Fake y_range
+            )
+            print(
+                f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+                f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+                f"display aspect: {self._display_aspect_ratio:.2f})",
+                file=sys.stderr,
+            )
+
         # Get total count
         total = self._raw_data.select(pl.len()).collect().item()
         self._preprocessed_data["total"] = total
 
-        # Compute compression level target sizes
-        level_sizes = compute_compression_levels(…)
+        # Compute compression level target sizes (2× for cache buffer)
+        level_sizes = compute_compression_levels(cache_target, total)
         self._preprocessed_data["level_sizes"] = level_sizes
 
         # Build levels from largest to smallest
@@ -736,10 +871,18 @@ class Heatmap(BaseComponent):
         if count >= self._min_points:
             # This level has enough detail
             if count > self._min_points:
-                # Over limit - downsample to …
-                # …
+                # Over limit - downsample to exactly min_points
+                # Compute optimal bins from ACTUAL zoom region aspect ratio
                 zoom_x_range = (x0, x1)
                 zoom_y_range = (y0, y1)
+                render_x_bins, render_y_bins = compute_optimal_bins(
+                    self._min_points, zoom_x_range, zoom_y_range
+                )
+                print(
+                    f"[HEATMAP] Render downsample: {count:,} → {self._min_points:,} pts "
+                    f"(bins: {render_x_bins}x{render_y_bins})",
+                    file=sys.stderr,
+                )
                 if self._use_streaming or self._use_simple_downsample:
                     if self._use_simple_downsample:
                         return downsample_2d_simple(
@@ -754,8 +897,8 @@ class Heatmap(BaseComponent):
                             x_column=self._x_column,
                             y_column=self._y_column,
                             intensity_column=self._intensity_column,
-                            x_bins=…,
-                            y_bins=…,
+                            x_bins=render_x_bins,
+                            y_bins=render_y_bins,
                             x_range=zoom_x_range,
                             y_range=zoom_y_range,
                         ).collect()
@@ -766,8 +909,8 @@ class Heatmap(BaseComponent):
                     x_column=self._x_column,
                     y_column=self._y_column,
                     intensity_column=self._intensity_column,
-                    x_bins=…,
-                    y_bins=…,
+                    x_bins=render_x_bins,
+                    y_bins=render_y_bins,
                 ).collect()
                 return filtered
 
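
At render time the bin grid tracks the zoom window's real shape instead of the 16:9 assumption baked into the cache, so a wide selection gets a wide grid and a tall selection a tall one. A worked example with hypothetical zoom bounds:

    # Hypothetical zoom region: x-span 400, y-span 50 -> aspect ratio 8:1
    zoom_x_range = (100.0, 500.0)
    zoom_y_range = (0.0, 50.0)
    x_bins, y_bins = compute_optimal_bins(10_000, zoom_x_range, zoom_y_range)
    # y_bins = int(sqrt(10000 / 8)) = 35, x_bins = int(sqrt(10000 * 8)) = 282
    assert (x_bins, y_bins) == (282, 35)
    assert x_bins * y_bins <= 10_000  # truncation keeps the grid under budget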
openms_insight/core/base.py CHANGED
@@ -318,6 +318,9 @@ class BaseComponent(ABC):
             "data_values": {},
         }
 
+        # Check if files were already saved during preprocessing (e.g., cascading)
+        files_already_saved = self._preprocessed_data.pop("_files_already_saved", False)
+
         # Save preprocessed data with type optimization for efficient transfer
         # Float64→Float32 reduces Arrow payload size
         # Int64→Int32 (when safe) avoids BigInt overhead in JavaScript
@@ -325,18 +328,28 @@ class BaseComponent(ABC):
             if isinstance(value, pl.LazyFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                …
+
+                if files_already_saved and filepath.exists():
+                    # File was saved during preprocessing (cascading) - just register it
+                    manifest["data_files"][key] = filename
+                else:
+                    # Apply streaming-safe optimization (Float64→Float32 only)
+                    # Int64 bounds checking would require collect(), breaking streaming
+                    value = optimize_for_transfer_lazy(value)
+                    value.sink_parquet(filepath, compression="zstd")
+                    manifest["data_files"][key] = filename
             elif isinstance(value, pl.DataFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                …
+
+                if files_already_saved and filepath.exists():
+                    # File was saved during preprocessing - just register it
+                    manifest["data_files"][key] = filename
+                else:
+                    # Full optimization including Int64→Int32 with bounds checking
+                    value = optimize_for_transfer(value)
+                    value.write_parquet(filepath, compression="zstd")
+                    manifest["data_files"][key] = filename
             elif self._is_json_serializable(value):
                 manifest["data_values"][key] = value
 
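
The handshake between preprocessing and saving is a single sentinel key that pop() removes before serialization, so it never leaks into the manifest. A minimal sketch of the protocol, with the optimize/sink steps elided (illustrative, not library code):

    from pathlib import Path

    def save_preprocessed(preprocessed: dict, out_dir: Path) -> dict:
        manifest = {"data_files": {}, "data_values": {}}
        # pop() keeps the sentinel out of the saved payload
        already_saved = preprocessed.pop("_files_already_saved", False)
        for key, value in preprocessed.items():
            path = out_dir / f"{key}.parquet"
            if already_saved and path.exists():
                manifest["data_files"][key] = path.name  # register, don't rewrite
            else:
                ...  # optimize and sink/write the frame as in the diff above
                manifest["data_files"][key] = path.name
        return manifest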
openms_insight/preprocessing/compression.py CHANGED
@@ -6,7 +6,8 @@ data, enabling efficient visualization of datasets with millions of points.
 Supports both streaming (lazy) and eager downsampling approaches.
 """
 
-…
+import math
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import polars as pl
@@ -19,6 +20,59 @@ except ImportError:
     HAS_SCIPY = False
 
 
+def compute_optimal_bins(
+    target_points: int,
+    x_range: Tuple[float, float],
+    y_range: Tuple[float, float],
+) -> Tuple[int, int]:
+    """
+    Compute optimal x_bins, y_bins for even spatial distribution.
+
+    The bin grid matches the data's aspect ratio so bins are approximately
+    square in data space. Total bins ≈ target_points for 1 point per bin.
+
+    Solves the system:
+        x_bins × y_bins = target_points
+        x_bins / y_bins = aspect_ratio
+
+    Solution:
+        y_bins = sqrt(target_points / aspect_ratio)
+        x_bins = sqrt(target_points × aspect_ratio)
+
+    Args:
+        target_points: Target number of bins (and thus max points with 1 per bin)
+        x_range: (x_min, x_max) data range
+        y_range: (y_min, y_max) data range
+
+    Returns:
+        (x_bins, y_bins) tuple
+
+    Examples:
+        >>> compute_optimal_bins(10000, (0, 1000), (0, 100))  # 10:1 aspect
+        (316, 31)
+        >>> compute_optimal_bins(10000, (0, 100), (0, 100))  # 1:1 aspect
+        (100, 100)
+    """
+    x_span = x_range[1] - x_range[0]
+    y_span = y_range[1] - y_range[0]
+
+    # Handle edge cases
+    if y_span < 1e-10:
+        y_span = x_span if x_span > 1e-10 else 1.0
+    if x_span < 1e-10:
+        x_span = y_span
+
+    aspect_ratio = x_span / y_span
+
+    # Clamp to reasonable bounds (avoid extreme rectangles)
+    aspect_ratio = max(0.05, min(20.0, aspect_ratio))
+
+    y_bins = max(1, int(math.sqrt(target_points / aspect_ratio)))
+    x_bins = max(1, int(math.sqrt(target_points * aspect_ratio)))
+
+    return x_bins, y_bins
+
+
 def compute_compression_levels(min_size: int, total: int) -> List[int]:
     """
     Compute logarithmically-spaced compression level target sizes.
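
The closed-form in the docstring can be sanity-checked quickly, including the aspect clamp; a condensed re-statement for the check (illustrative, mirrors the function above with the degenerate-span handling omitted):

    import math

    def optimal_bins(target, x_range, y_range):
        x_span, y_span = x_range[1] - x_range[0], y_range[1] - y_range[0]
        aspect = max(0.05, min(20.0, x_span / y_span))
        return (max(1, int(math.sqrt(target * aspect))),
                max(1, int(math.sqrt(target / aspect))))

    assert optimal_bins(10_000, (0, 1000), (0, 100)) == (316, 31)   # 10:1
    assert optimal_bins(10_000, (0, 100), (0, 100)) == (100, 100)   # 1:1
    # A 1000:1 span ratio is clamped to 20:1, keeping the grid from
    # degenerating into a single row of bins:
    assert optimal_bins(10_000, (0, 1e6), (0, 1e3)) == (447, 22)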
{openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openms-insight
-Version: 0.1.3
+Version: 0.1.4
 Summary: Interactive visualization components for mass spectrometry data in Streamlit
 Project-URL: Homepage, https://github.com/t0mdavid-m/OpenMS-Insight
 Project-URL: Documentation, https://github.com/t0mdavid-m/OpenMS-Insight#readme
{openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/RECORD CHANGED
@@ -1,17 +1,17 @@
 openms_insight/__init__.py,sha256=Iv9w0J_7J3pMsyvM4xaYDMWt6IvrtAt6WqOmJ-_tUxk,1044
 openms_insight/components/__init__.py,sha256=T9mUxfgFUiHILmXh1VjcGVlnRvuxRMqi_GJJYOmJKwY,177
-openms_insight/components/heatmap.py,sha256=…
+openms_insight/components/heatmap.py,sha256=LigtpPbAPQpfjFljMWoEPPAc3t27Bl1ekr5uaR1Ctuk,44090
 openms_insight/components/lineplot.py,sha256=I-JPvDzCr3Nu8Boc1V4D8QQ1bHgTqvM6CbeoIe7zJ-s,30896
 openms_insight/components/sequenceview.py,sha256=0pDOE0xeoc1-85QZNGdNwwoBwXi-5MFfeb9pCcOi6rc,30274
 openms_insight/components/table.py,sha256=wmq1rjGVe4Ef0SAf5p85pfVCeyLlVevZnxBc9EIg2uk,16458
 openms_insight/core/__init__.py,sha256=EPjKX_FFQRgO8mWHs59I-o0BiuzEMzEU1Pfu9YOfLC4,338
-openms_insight/core/base.py,sha256=…
+openms_insight/core/base.py,sha256=P2cOrPvPIzxfYQ7xMn9e0BlyKEMrhOCgD9FAtyxTiCc,19408
 openms_insight/core/cache.py,sha256=3fnPDWjuWUnxazK2XflcUIeRZZPQ3N45kAKYu-xGBKw,1197
 openms_insight/core/registry.py,sha256=Hak80Jqhx0qa4gbd1YolNZnM6xBrS8I4U_X7zC0bQ8Y,2108
 openms_insight/core/state.py,sha256=_vNYxYHYFgIigbkqYwkIO6cBGFJyF2VN9dr7CBEAQbY,6873
 openms_insight/core/subprocess_preprocess.py,sha256=m9FbAAFy9Do1Exlh-m4Wo-LDwv6yHlEI4klz5OVwemc,3133
 openms_insight/preprocessing/__init__.py,sha256=hXKTI9zHtMtHojqXq_0V62xfNokozpnpRAwEnxs81fM,461
-openms_insight/preprocessing/compression.py,sha256=…
+openms_insight/preprocessing/compression.py,sha256=T4YbX9PUlfTfPit_kpuLZn8hYpqLYu3xtTme_CG2ymc,12241
 openms_insight/preprocessing/filtering.py,sha256=fkmaIXfR5hfjyWfaMYqaeybMHaZjvUZYaKCqvxPOWMQ,14152
 openms_insight/rendering/__init__.py,sha256=ApHvKeh87yY4GTIEai-tCeIXpNbwOXWlmcmIwMMRZYc,198
 openms_insight/rendering/bridge.py,sha256=i8cZq_ra13XpuV1KT0qC6Jf4VCAe4BGrLE-ybrFHwZE,19408
@@ -22,7 +22,7 @@ openms_insight/js-component/dist/assets/materialdesignicons-webfont.eot,sha256=C
 openms_insight/js-component/dist/assets/materialdesignicons-webfont.ttf,sha256=YeirpaTpgf4iz3yOi82-oAR251xiw38Bv37jM2HWhCg,1307660
 openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff,sha256=pZKKDVwvYk5G-Y2bFcL2AEU3f3xZTdeKF1kTLqO0Y-s,587984
 openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff2,sha256=Zi_vqPL4qVwYWI0hd0eJwQfGTnccvmWmmvRikcQxGvw,403216
-openms_insight-0.1.3.dist-info/METADATA,sha256=…
-openms_insight-0.1.3.dist-info/WHEEL,sha256=…
-openms_insight-0.1.3.dist-info/licenses/LICENSE,sha256=…
-openms_insight-0.1.3.dist-info/RECORD,,
+openms_insight-0.1.4.dist-info/METADATA,sha256=_c_eGoMj7wCxAWE5CHC6T2Emri6DEZRXwrQJ-RNptrI,12807
+openms_insight-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+openms_insight-0.1.4.dist-info/licenses/LICENSE,sha256=INFF4rOMmpah7Oi14hLqu7NTOsx56KRRNChAAUcfh2E,1823
+openms_insight-0.1.4.dist-info/RECORD,,
{openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/WHEEL
File without changes
{openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/licenses/LICENSE
File without changes