PyPI - openms-insight - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

openms-insight 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

openms_insight/components/heatmap.py +64 -36
openms_insight/components/lineplot.py +16 -3
openms_insight/components/table.py +16 -3
openms_insight/core/base.py +43 -23
openms_insight/core/subprocess_preprocess.py +96 -0
openms_insight/js-component/dist/assets/index.css +1 -1
openms_insight/js-component/dist/assets/index.js +1 -1
openms_insight/preprocessing/filtering.py +8 -15
openms_insight/rendering/bridge.py +88 -27
{openms_insight-0.1.0.dist-info → openms_insight-0.1.1.dist-info}/METADATA +12 -12
{openms_insight-0.1.0.dist-info → openms_insight-0.1.1.dist-info}/RECORD +13 -12
{openms_insight-0.1.0.dist-info → openms_insight-0.1.1.dist-info}/WHEEL +0 -0
{openms_insight-0.1.0.dist-info → openms_insight-0.1.1.dist-info}/licenses/LICENSE +0 -0

openms_insight/components/heatmap.py CHANGED Viewed

@@ -69,6 +69,7 @@ class Heatmap(BaseComponent):
         x_column: str,
         y_column: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         intensity_column: str = 'intensity',
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
@@ -97,6 +98,7 @@ class Heatmap(BaseComponent):
             x_column: Name of column for x-axis values
             y_column: Name of column for y-axis values
             data: Polars LazyFrame with heatmap data. Optional if cache exists.
+            data_path: Path to parquet file (preferred for large datasets).
             intensity_column: Name of column for intensity/color values
             filters: Mapping of identifier names to column names for filtering
             interactivity: Mapping of identifier names to column names for clicks.
@@ -142,11 +144,27 @@ class Heatmap(BaseComponent):
         super().__init__(
             cache_id=cache_id,
             data=data,
+            data_path=data_path,
             filters=filters,
             filter_defaults=filter_defaults,
             interactivity=interactivity,
             cache_path=cache_path,
             regenerate_cache=regenerate_cache,
+            # Pass component-specific params for subprocess recreation
+            x_column=x_column,
+            y_column=y_column,
+            intensity_column=intensity_column,
+            min_points=min_points,
+            x_bins=x_bins,
+            y_bins=y_bins,
+            zoom_identifier=zoom_identifier,
+            title=title,
+            x_label=x_label,
+            y_label=y_label,
+            colorscale=colorscale,
+            use_simple_downsample=use_simple_downsample,
+            use_streaming=use_streaming,
+            categorical_filters=categorical_filters,
             **kwargs
         )
@@ -271,9 +289,8 @@ class Heatmap(BaseComponent):
                 # Store level sizes for this filter value
                 self._preprocessed_data[f'cat_level_sizes_{filter_id}_{filter_value}'] = level_sizes
-                self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = len(level_sizes)
-                # Build each level
+                # Build each compressed level
                 for level_idx, target_size in enumerate(level_sizes):
                     # If target size equals total, skip downsampling - use all data
                     if target_size >= filtered_total:
@@ -297,15 +314,20 @@ class Heatmap(BaseComponent):
                             y_range=y_range,
                         )
-                    # Collect and store
+                    # Store LazyFrame for streaming to disk
                     level_key = f'cat_level_{filter_id}_{filter_value}_{level_idx}'
-                    self._preprocessed_data[level_key] = level.collect()
+                    self._preprocessed_data[level_key] = level  # Keep lazy
+                # Add full resolution as final level (for zoom fallback)
+                num_compressed = len(level_sizes)
+                full_res_key = f'cat_level_{filter_id}_{filter_value}_{num_compressed}'
+                self._preprocessed_data[full_res_key] = filtered_data
+                self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = num_compressed + 1
         # Also create global levels for when no categorical filter is selected
         # (fallback to standard behavior)
         level_sizes = compute_compression_levels(self._min_points, total)
         self._preprocessed_data['level_sizes'] = level_sizes
-        self._preprocessed_data['num_levels'] = len(level_sizes)
         for i, size in enumerate(level_sizes):
             # If target size equals total, skip downsampling - use all data
@@ -329,13 +351,18 @@ class Heatmap(BaseComponent):
                     x_range=x_range,
                     y_range=y_range,
                 )
-            self._preprocessed_data[f'level_{i}'] = level.collect()
+            self._preprocessed_data[f'level_{i}'] = level  # Keep lazy
+        # Add full resolution as final level (for zoom fallback)
+        num_compressed = len(level_sizes)
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+        self._preprocessed_data['num_levels'] = num_compressed + 1
     def _preprocess_streaming(self) -> None:
         """
-        Streaming preprocessing - levels stay lazy until render.
+        Streaming preprocessing - levels stay lazy through caching.
-        Builds lazy query plans and collects them for caching.
+        Builds lazy query plans that are streamed to disk via sink_parquet().
         """
         # Get data ranges (minimal collect - just 4 values)
         x_range, y_range = get_data_range(
@@ -379,12 +406,16 @@ class Heatmap(BaseComponent):
                     x_range=x_range,
                     y_range=y_range,
                 )
-            # Collect and store as DataFrame for caching
-            # Base class will serialize these to parquet
-            self._preprocessed_data[f'level_{i}'] = level.collect()
+            # Store LazyFrame for streaming to disk
+            # Base class will use sink_parquet() to stream without full materialization
+            self._preprocessed_data[f'level_{i}'] = level  # Keep lazy
-        # Store number of levels for reconstruction
-        self._preprocessed_data['num_levels'] = len(level_sizes)
+        # Add full resolution as final level (for zoom fallback)
+        num_compressed = len(level_sizes)
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+        # Store number of levels for reconstruction (includes full resolution)
+        self._preprocessed_data['num_levels'] = num_compressed + 1
     def _preprocess_eager(self) -> None:
         """
@@ -434,16 +465,21 @@ class Heatmap(BaseComponent):
                         x_bins=self._x_bins,
                         y_bins=self._y_bins,
                     )
-                # Collect for caching - store with reversed index
+                # Store LazyFrame for streaming to disk
                 level_idx = len(level_sizes) - 1 - i
                 if isinstance(downsampled, pl.LazyFrame):
-                    self._preprocessed_data[f'level_{level_idx}'] = downsampled.collect()
+                    self._preprocessed_data[f'level_{level_idx}'] = downsampled  # Keep lazy
                 else:
-                    self._preprocessed_data[f'level_{level_idx}'] = downsampled
+                    # DataFrame from downsample_2d - convert back to lazy
+                    self._preprocessed_data[f'level_{level_idx}'] = downsampled.lazy()
                 current = downsampled
-        # Store number of levels for reconstruction
-        self._preprocessed_data['num_levels'] = len(level_sizes)
+        # Add full resolution as final level (for zoom fallback)
+        num_compressed = len(level_sizes)
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+        # Store number of levels for reconstruction (includes full resolution)
+        self._preprocessed_data['num_levels'] = num_compressed + 1
     def _get_levels(self) -> list:
         """
@@ -460,10 +496,6 @@ class Heatmap(BaseComponent):
             if level_data is not None:
                 levels.append(level_data)
-        # Add full resolution at end (if raw data available)
-        if self._raw_data is not None:
-            levels.append(self._raw_data)
         return levels
     def _get_categorical_levels(
@@ -496,13 +528,7 @@ class Heatmap(BaseComponent):
             if level_data is not None:
                 levels.append(level_data)
-        # Get filtered raw data for full resolution (if available)
-        filtered_raw = None
-        if self._raw_data is not None and filter_id in self._filters:
-            column_name = self._filters[filter_id]
-            filtered_raw = self._raw_data.filter(pl.col(column_name) == filter_value)
-        return levels, filtered_raw
+        return levels, None  # Full resolution included in cached levels
     def _get_levels_for_state(self, state: Dict[str, Any]) -> Tuple[list, Optional[pl.LazyFrame]]:
         """
@@ -630,10 +656,11 @@ class Heatmap(BaseComponent):
             if count >= self._min_points:
                 # This level has enough detail
-                if count > self._min_points * 2:
-                    # Still too many - downsample further
-                    x_range = self._preprocessed_data.get('x_range')
-                    y_range = self._preprocessed_data.get('y_range')
+                if count > self._min_points:
+                    # Over limit - downsample to stay at/under max
+                    # Use ZOOM range for binning (not global) to avoid sparse bins
+                    zoom_x_range = (x0, x1)
+                    zoom_y_range = (y0, y1)
                     if self._use_streaming or self._use_simple_downsample:
                         if self._use_simple_downsample:
                             return downsample_2d_simple(
@@ -650,8 +677,8 @@ class Heatmap(BaseComponent):
                                 intensity_column=self._intensity_column,
                                 x_bins=self._x_bins,
                                 y_bins=self._y_bins,
-                                x_range=x_range,
-                                y_range=y_range,
+                                x_range=zoom_x_range,
+                                y_range=zoom_y_range,
                             ).collect()
                     else:
                         return downsample_2d(
@@ -744,7 +771,8 @@ class Heatmap(BaseComponent):
                 df_pandas = df_pandas.sort_values(self._intensity_column).reset_index(drop=True)
             else:
                 # No filters to apply - levels already filtered by categorical filter
-                available_cols = [c for c in columns_to_select if c in data.columns]
+                schema_names = data.collect_schema().names()
+                available_cols = [c for c in columns_to_select if c in schema_names]
                 df_polars = data.select(available_cols).collect()
                 # Sort by intensity ascending so high-intensity points are drawn on top
                 df_polars = df_polars.sort(self._intensity_column)

openms_insight/components/lineplot.py CHANGED Viewed

@@ -45,6 +45,7 @@ class LinePlot(BaseComponent):
         self,
         cache_id: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
         interactivity: Optional[Dict[str, str]] = None,
@@ -68,6 +69,7 @@ class LinePlot(BaseComponent):
             cache_id: Unique identifier for this component's cache (MANDATORY).
                 Creates a folder {cache_path}/{cache_id}/ for cached data.
             data: Polars LazyFrame with plot data. Optional if cache exists.
+            data_path: Path to parquet file (preferred for large datasets).
             filters: Mapping of identifier names to column names for filtering.
                 Example: {'spectrum': 'scan_id'}
                 When 'spectrum' selection exists, plot shows only data where
@@ -116,11 +118,22 @@ class LinePlot(BaseComponent):
         super().__init__(
             cache_id=cache_id,
             data=data,
+            data_path=data_path,
             filters=filters,
             filter_defaults=filter_defaults,
             interactivity=interactivity,
             cache_path=cache_path,
             regenerate_cache=regenerate_cache,
+            # Pass component-specific params for subprocess recreation
+            x_column=x_column,
+            y_column=y_column,
+            title=title,
+            x_label=x_label,
+            y_label=y_label,
+            highlight_column=highlight_column,
+            annotation_column=annotation_column,
+            styling=styling,
+            config=config,
             **kwargs
         )
@@ -208,9 +221,9 @@ class LinePlot(BaseComponent):
             'annotation_column': self._annotation_column,
         }
-        # Collect data for caching (filter happens at render time)
-        # Base class will serialize this to parquet
-        self._preprocessed_data['data'] = data.collect()
+        # Store LazyFrame for streaming to disk (filter happens at render time)
+        # Base class will use sink_parquet() to stream without full materialization
+        self._preprocessed_data['data'] = data  # Keep lazy
     def _get_vue_component_name(self) -> str:
         """Return the Vue component name."""

openms_insight/components/table.py CHANGED Viewed

@@ -49,6 +49,7 @@ class Table(BaseComponent):
         self,
         cache_id: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
         interactivity: Optional[Dict[str, str]] = None,
@@ -72,6 +73,7 @@ class Table(BaseComponent):
             cache_id: Unique identifier for this component's cache (MANDATORY).
                 Creates a folder {cache_path}/{cache_id}/ for cached data.
             data: Polars LazyFrame with table data. Optional if cache exists.
+            data_path: Path to parquet file (preferred for large datasets).
             filters: Mapping of identifier names to column names for filtering.
                 Example: {'spectrum': 'scan_id'}
                 When 'spectrum' selection exists, table shows only rows where
@@ -120,11 +122,22 @@ class Table(BaseComponent):
         super().__init__(
             cache_id=cache_id,
             data=data,
+            data_path=data_path,
             filters=filters,
             filter_defaults=filter_defaults,
             interactivity=interactivity,
             cache_path=cache_path,
             regenerate_cache=regenerate_cache,
+            # Pass component-specific params for subprocess recreation
+            column_definitions=column_definitions,
+            title=title,
+            index_field=index_field,
+            go_to_fields=go_to_fields,
+            layout=layout,
+            default_row=default_row,
+            initial_sort=initial_sort,
+            pagination=pagination,
+            page_size=page_size,
             **kwargs
         )
@@ -204,9 +217,9 @@ class Table(BaseComponent):
         # Store column definitions in preprocessed data for serialization
         self._preprocessed_data['column_definitions'] = self._column_definitions
-        # Collect data for caching (filter happens at render time)
-        # Base class will serialize this to parquet with optimized row groups
-        self._preprocessed_data['data'] = data.collect()
+        # Store LazyFrame for streaming to disk (filter happens at render time)
+        # Base class will use sink_parquet() to stream without full materialization
+        self._preprocessed_data['data'] = data  # Keep lazy
     def _get_columns_to_select(self) -> Optional[List[str]]:
         """Get list of columns needed for this table."""

openms_insight/core/base.py CHANGED Viewed

@@ -43,6 +43,7 @@ class BaseComponent(ABC):
         self,
         cache_id: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
         interactivity: Optional[Dict[str, str]] = None,
@@ -57,6 +58,9 @@ class BaseComponent(ABC):
             cache_id: Unique identifier for this component's cache (MANDATORY).
                 Creates a folder {cache_path}/{cache_id}/ for cached data.
             data: Polars LazyFrame with source data. Optional if cache exists.
+            data_path: Path to parquet file with source data. Preferred over
+                data= for large datasets as preprocessing runs in a subprocess
+                to ensure memory is released after cache creation.
             filters: Mapping of identifier names to column names for filtering.
                 Example: {'spectrum': 'scan_id'}
                 When 'spectrum' selection exists, component filters data where
@@ -73,6 +77,10 @@ class BaseComponent(ABC):
             regenerate_cache: If True, regenerate cache even if valid cache exists.
             **kwargs: Component-specific configuration options
         """
+        # Validate inputs
+        if data is not None and data_path is not None:
+            raise ValueError("Provide either 'data' or 'data_path', not both")
         self._cache_id = cache_id
         self._cache_dir = get_cache_dir(cache_path, cache_id)
         self._filters = filters or {}
@@ -83,18 +91,33 @@ class BaseComponent(ABC):
         # Check if we should load from cache or preprocess
         if regenerate_cache or not self._is_cache_valid():
-            if data is None:
+            if data is None and data_path is None:
                 raise CacheMissError(
                     f"Cache not found at '{self._cache_dir}' and no data provided. "
-                    f"Either provide data= or ensure cache exists from a previous run."
+                    f"Either provide data=, data_path=, or ensure cache exists."
+                )
+            if data_path is not None:
+                # Subprocess preprocessing - memory released after cache creation
+                from .subprocess_preprocess import preprocess_component
+                preprocess_component(
+                    type(self),
+                    data_path=data_path,
+                    cache_id=cache_id,
+                    cache_path=cache_path,
+                    filters=filters,
+                    filter_defaults=filter_defaults,
+                    interactivity=interactivity,
+                    **kwargs
                 )
-            self._raw_data = data
-            # Validate columns exist in data
-            self._validate_mappings()
-            # Run component-specific preprocessing
-            self._preprocess()
-            # Save to cache for next time
-            self._save_to_cache()
+                self._raw_data = None
+                self._load_from_cache()
+            else:
+                # In-process preprocessing (backward compatible)
+                self._raw_data = data
+                self._validate_mappings()
+                self._preprocess()
+                self._save_to_cache()
         else:
             # Load from valid cache
             self._raw_data = None
@@ -231,28 +254,18 @@ class BaseComponent(ABC):
             "data_values": {},
         }
-        # Save preprocessed data
-        row_group_size = self._get_row_group_size()
+        # Save preprocessed data - stream LazyFrames directly to disk
         for key, value in self._preprocessed_data.items():
             if isinstance(value, pl.LazyFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                value.collect().write_parquet(
-                    filepath,
-                    compression='zstd',
-                    statistics=True,
-                    row_group_size=row_group_size,
-                )
+                # Stream directly to disk without full materialization
+                value.sink_parquet(filepath, compression='zstd')
                 manifest["data_files"][key] = filename
             elif isinstance(value, pl.DataFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                value.write_parquet(
-                    filepath,
-                    compression='zstd',
-                    statistics=True,
-                    row_group_size=row_group_size,
-                )
+                value.write_parquet(filepath, compression='zstd')
                 manifest["data_files"][key] = filename
             elif self._is_json_serializable(value):
                 manifest["data_values"][key] = value
@@ -261,6 +274,13 @@ class BaseComponent(ABC):
         with open(self._get_manifest_path(), "w") as f:
             json.dump(manifest, f, indent=2)
+        # Release memory - data is now safely on disk
+        self._preprocessed_data = {}
+        self._raw_data = None
+        # Reload as lazy scan_parquet() references
+        self._load_from_cache()
     def _is_json_serializable(self, value: Any) -> bool:
         """Check if value can be JSON serialized."""
         try:

openms_insight/core/subprocess_preprocess.py ADDED Viewed

@@ -0,0 +1,96 @@
+"""Subprocess-based preprocessing to ensure memory is released after cache creation.
+When preprocessing large datasets (especially heatmaps with millions of points),
+memory allocators like mimalloc retain freed memory. Running preprocessing in a
+subprocess ensures all memory is returned to the OS when the subprocess exits.
+"""
+import multiprocessing
+import os
+import traceback
+from typing import Any, Dict, Type
+def _preprocess_worker(
+    component_class: Type,
+    data_path: str,
+    kwargs: Dict[str, Any],
+    error_queue: multiprocessing.Queue,
+) -> None:
+    """Worker function that runs in subprocess to do preprocessing."""
+    try:
+        import polars as pl
+        # Set mimalloc to release memory aggressively (in case not inherited)
+        os.environ.setdefault("MIMALLOC_PURGE_DELAY", "0")
+        # Create component with data - this triggers preprocessing and cache save
+        data = pl.scan_parquet(data_path)
+        component_class(data=data, **kwargs)
+        # Subprocess exits here, releasing all memory
+        error_queue.put(None)
+    except Exception as e:
+        # Send exception info back to parent process
+        error_queue.put((type(e).__name__, str(e), traceback.format_exc()))
+def preprocess_component(
+    component_class: Type,
+    data_path: str,
+    cache_id: str,
+    cache_path: str,
+    **kwargs,
+) -> None:
+    """
+    Run component preprocessing in a subprocess to guarantee memory release.
+    This is an internal function called by BaseComponent when data_path is
+    provided. Users should use the component constructor directly:
+        heatmap = Heatmap(
+            data_path="/path/to/data.parquet",
+            cache_id="my_heatmap",
+            cache_path="/path/to/cache",
+            x_column="rt",
+            y_column="mz",
+            intensity_column="intensity",
+        )
+    Args:
+        component_class: The component class (e.g., Heatmap, Table)
+        data_path: Path to the parquet file containing the data
+        cache_id: Unique identifier for the cache
+        cache_path: Directory for cache storage
+        **kwargs: Additional arguments passed to component constructor
+    """
+    # Prepare kwargs for subprocess
+    worker_kwargs = {
+        "cache_id": cache_id,
+        "cache_path": cache_path,
+        **kwargs,
+    }
+    # Use spawn to get a fresh process (fork might copy memory)
+    ctx = multiprocessing.get_context("spawn")
+    error_queue = ctx.Queue()
+    process = ctx.Process(
+        target=_preprocess_worker,
+        args=(component_class, data_path, worker_kwargs, error_queue),
+    )
+    process.start()
+    process.join()
+    # Check for errors from subprocess
+    if not error_queue.empty():
+        error_info = error_queue.get_nowait()
+        if error_info is not None:
+            exc_type, exc_msg, exc_tb = error_info
+            raise RuntimeError(
+                f"Subprocess preprocessing failed with {exc_type}: {exc_msg}\n"
+                f"Subprocess traceback:\n{exc_tb}"
+            )
+    if process.exitcode != 0:
+        raise RuntimeError(
+            f"Preprocessing failed with exit code {process.exitcode}"
+        )

openms-insight 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

openms-insight 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl