openms-insight 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {openms_insight-0.1.0 → openms_insight-0.1.1}/PKG-INFO +12 -12
  2. {openms_insight-0.1.0 → openms_insight-0.1.1}/README.md +11 -11
  3. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/heatmap.py +64 -36
  4. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/lineplot.py +16 -3
  5. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/table.py +16 -3
  6. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/base.py +43 -23
  7. openms_insight-0.1.1/openms_insight/core/subprocess_preprocess.py +96 -0
  8. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/index.css +1 -1
  9. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/index.js +1 -1
  10. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/preprocessing/filtering.py +8 -15
  11. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/rendering/bridge.py +88 -27
  12. {openms_insight-0.1.0 → openms_insight-0.1.1}/pyproject.toml +1 -1
  13. {openms_insight-0.1.0 → openms_insight-0.1.1}/.gitignore +0 -0
  14. {openms_insight-0.1.0 → openms_insight-0.1.1}/LICENSE +0 -0
  15. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/__init__.py +0 -0
  16. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/__init__.py +0 -0
  17. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/sequenceview.py +0 -0
  18. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/__init__.py +0 -0
  19. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/cache.py +0 -0
  20. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/registry.py +0 -0
  21. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/state.py +0 -0
  22. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.eot +0 -0
  23. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.ttf +0 -0
  24. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff +0 -0
  25. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff2 +0 -0
  26. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/index.html +0 -0
  27. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/preprocessing/__init__.py +0 -0
  28. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/preprocessing/compression.py +0 -0
  29. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/rendering/__init__.py +0 -0
{openms_insight-0.1.0 → openms_insight-0.1.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openms-insight
-Version: 0.1.0
+Version: 0.1.1
 Summary: Interactive visualization components for mass spectrometry data in Streamlit
 Project-URL: Homepage, https://github.com/t0mdavid-m/OpenMS-Insight
 Project-URL: Documentation, https://github.com/t0mdavid-m/OpenMS-Insight#readme
@@ -43,7 +43,7 @@ Interactive visualization components for mass spectrometry data in Streamlit, ba
 ## Features
 
 - **Cross-component selection linking** via shared identifiers
-- **Polars LazyFrame support** for efficient data handling
+- **Memory-efficient preprocessing** via subprocess isolation
 - **Automatic disk caching** with config-based invalidation
 - **Table component** (Tabulator.js) with filtering, sorting, go-to, pagination
 - **Line plot component** (Plotly.js) with highlighting, annotations, zoom
@@ -60,7 +60,6 @@ pip install openms-insight
 
 ```python
 import streamlit as st
-import polars as pl
 from openms_insight import Table, LinePlot, StateManager
 
 # Create state manager for cross-component linking
@@ -69,7 +68,7 @@ state_manager = StateManager()
 # Create a table - clicking a row sets the 'item' selection
 table = Table(
     cache_id="items_table",
-    data=pl.scan_parquet("items.parquet"),
+    data_path="items.parquet",
     interactivity={'item': 'item_id'},
     column_definitions=[
         {'field': 'item_id', 'title': 'ID', 'sorter': 'number'},
@@ -81,7 +80,7 @@ table(state_manager=state_manager)
 # Create a linked plot - filters by the selected 'item'
 plot = LinePlot(
     cache_id="values_plot",
-    data=pl.scan_parquet("values.parquet"),
+    data_path="values.parquet",
    filters={'item': 'item_id'},
     x_column='x',
     y_column='y',
@@ -100,14 +99,14 @@ Components communicate through **identifiers** using two mechanisms:
 # Master table: no filters, sets 'spectrum' on click
 master = Table(
     cache_id="spectra",
-    data=spectra_data,
+    data_path="spectra.parquet",
     interactivity={'spectrum': 'scan_id'},  # Click -> sets spectrum=scan_id
 )
 
 # Detail table: filters by 'spectrum', sets 'peak' on click
 detail = Table(
     cache_id="peaks",
-    data=peaks_data,
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},  # Filters where scan_id = selected spectrum
     interactivity={'peak': 'peak_id'},  # Click -> sets peak=peak_id
 )
@@ -115,7 +114,7 @@ detail = Table(
 # Plot: filters by 'spectrum', highlights selected 'peak'
 plot = LinePlot(
     cache_id="plot",
-    data=peaks_data,
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},
     interactivity={'peak': 'peak_id'},
     x_column='mass',
@@ -134,7 +133,7 @@ Interactive table using Tabulator.js with filtering dialogs, sorting, pagination
 ```python
 Table(
     cache_id="spectra_table",
-    data=pl.scan_parquet("spectra.parquet"),
+    data_path="spectra.parquet",
     interactivity={'spectrum': 'scan_id'},
     column_definitions=[
         {'field': 'scan_id', 'title': 'Scan', 'sorter': 'number'},
@@ -156,7 +155,7 @@ Stick-style line plot using Plotly.js for mass spectra visualization.
 ```python
 LinePlot(
     cache_id="spectrum_plot",
-    data=pl.scan_parquet("peaks.parquet"),
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},
     interactivity={'peak': 'peak_id'},
     x_column='mass',
@@ -176,7 +175,7 @@ LinePlot(
 ```python
 Heatmap(
     cache_id="peaks_heatmap",
-    data=pl.scan_parquet("all_peaks.parquet"),
+    data_path="all_peaks.parquet",
     x_column='retention_time',
     y_column='mass',
     intensity_column='intensity',
@@ -213,7 +212,8 @@ All components accept these common arguments:
 | Argument | Type | Default | Description |
 |----------|------|---------|-------------|
 | `cache_id` | `str` | **Required** | Unique identifier for disk cache |
-| `data` | `pl.LazyFrame` | `None` | Polars LazyFrame with source data |
+| `data_path` | `str` | `None` | Path to parquet file (preferred - uses subprocess for memory efficiency) |
+| `data` | `pl.LazyFrame` | `None` | Polars LazyFrame (alternative to data_path, in-process preprocessing) |
 | `filters` | `Dict[str, str]` | `None` | Map identifier -> column for filtering |
 | `interactivity` | `Dict[str, str]` | `None` | Map identifier -> column for click actions |
 | `cache_path` | `str` | `"."` | Base directory for cache storage |
{openms_insight-0.1.0 → openms_insight-0.1.1}/README.md

@@ -8,7 +8,7 @@ Interactive visualization components for mass spectrometry data in Streamlit, ba
 ## Features
 
 - **Cross-component selection linking** via shared identifiers
-- **Polars LazyFrame support** for efficient data handling
+- **Memory-efficient preprocessing** via subprocess isolation
 - **Automatic disk caching** with config-based invalidation
 - **Table component** (Tabulator.js) with filtering, sorting, go-to, pagination
 - **Line plot component** (Plotly.js) with highlighting, annotations, zoom
@@ -25,7 +25,6 @@ pip install openms-insight
 
 ```python
 import streamlit as st
-import polars as pl
 from openms_insight import Table, LinePlot, StateManager
 
 # Create state manager for cross-component linking
@@ -34,7 +33,7 @@ state_manager = StateManager()
 # Create a table - clicking a row sets the 'item' selection
 table = Table(
     cache_id="items_table",
-    data=pl.scan_parquet("items.parquet"),
+    data_path="items.parquet",
     interactivity={'item': 'item_id'},
     column_definitions=[
         {'field': 'item_id', 'title': 'ID', 'sorter': 'number'},
@@ -46,7 +45,7 @@ table(state_manager=state_manager)
 # Create a linked plot - filters by the selected 'item'
 plot = LinePlot(
     cache_id="values_plot",
-    data=pl.scan_parquet("values.parquet"),
+    data_path="values.parquet",
     filters={'item': 'item_id'},
     x_column='x',
     y_column='y',
@@ -65,14 +64,14 @@ Components communicate through **identifiers** using two mechanisms:
 # Master table: no filters, sets 'spectrum' on click
 master = Table(
     cache_id="spectra",
-    data=spectra_data,
+    data_path="spectra.parquet",
     interactivity={'spectrum': 'scan_id'},  # Click -> sets spectrum=scan_id
 )
 
 # Detail table: filters by 'spectrum', sets 'peak' on click
 detail = Table(
     cache_id="peaks",
-    data=peaks_data,
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},  # Filters where scan_id = selected spectrum
     interactivity={'peak': 'peak_id'},  # Click -> sets peak=peak_id
 )
@@ -80,7 +79,7 @@ detail = Table(
 # Plot: filters by 'spectrum', highlights selected 'peak'
 plot = LinePlot(
     cache_id="plot",
-    data=peaks_data,
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},
     interactivity={'peak': 'peak_id'},
     x_column='mass',
@@ -99,7 +98,7 @@ Interactive table using Tabulator.js with filtering dialogs, sorting, pagination
 ```python
 Table(
     cache_id="spectra_table",
-    data=pl.scan_parquet("spectra.parquet"),
+    data_path="spectra.parquet",
     interactivity={'spectrum': 'scan_id'},
     column_definitions=[
         {'field': 'scan_id', 'title': 'Scan', 'sorter': 'number'},
@@ -121,7 +120,7 @@ Stick-style line plot using Plotly.js for mass spectra visualization.
 ```python
 LinePlot(
     cache_id="spectrum_plot",
-    data=pl.scan_parquet("peaks.parquet"),
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},
     interactivity={'peak': 'peak_id'},
     x_column='mass',
@@ -141,7 +140,7 @@ LinePlot(
 ```python
 Heatmap(
     cache_id="peaks_heatmap",
-    data=pl.scan_parquet("all_peaks.parquet"),
+    data_path="all_peaks.parquet",
     x_column='retention_time',
     y_column='mass',
     intensity_column='intensity',
@@ -178,7 +177,8 @@ All components accept these common arguments:
 | Argument | Type | Default | Description |
 |----------|------|---------|-------------|
 | `cache_id` | `str` | **Required** | Unique identifier for disk cache |
-| `data` | `pl.LazyFrame` | `None` | Polars LazyFrame with source data |
+| `data_path` | `str` | `None` | Path to parquet file (preferred - uses subprocess for memory efficiency) |
+| `data` | `pl.LazyFrame` | `None` | Polars LazyFrame (alternative to data_path, in-process preprocessing) |
 | `filters` | `Dict[str, str]` | `None` | Map identifier -> column for filtering |
 | `interactivity` | `Dict[str, str]` | `None` | Map identifier -> column for click actions |
 | `cache_path` | `str` | `"."` | Base directory for cache storage |
{openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/heatmap.py

@@ -69,6 +69,7 @@ class Heatmap(BaseComponent):
         x_column: str,
         y_column: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         intensity_column: str = 'intensity',
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
@@ -97,6 +98,7 @@ class Heatmap(BaseComponent):
             x_column: Name of column for x-axis values
             y_column: Name of column for y-axis values
             data: Polars LazyFrame with heatmap data. Optional if cache exists.
+            data_path: Path to parquet file (preferred for large datasets).
             intensity_column: Name of column for intensity/color values
             filters: Mapping of identifier names to column names for filtering
             interactivity: Mapping of identifier names to column names for clicks.
@@ -142,11 +144,27 @@ class Heatmap(BaseComponent):
         super().__init__(
             cache_id=cache_id,
             data=data,
+            data_path=data_path,
             filters=filters,
             filter_defaults=filter_defaults,
             interactivity=interactivity,
             cache_path=cache_path,
             regenerate_cache=regenerate_cache,
+            # Pass component-specific params for subprocess recreation
+            x_column=x_column,
+            y_column=y_column,
+            intensity_column=intensity_column,
+            min_points=min_points,
+            x_bins=x_bins,
+            y_bins=y_bins,
+            zoom_identifier=zoom_identifier,
+            title=title,
+            x_label=x_label,
+            y_label=y_label,
+            colorscale=colorscale,
+            use_simple_downsample=use_simple_downsample,
+            use_streaming=use_streaming,
+            categorical_filters=categorical_filters,
             **kwargs
         )
 
@@ -271,9 +289,8 @@ class Heatmap(BaseComponent):
 
             # Store level sizes for this filter value
             self._preprocessed_data[f'cat_level_sizes_{filter_id}_{filter_value}'] = level_sizes
-            self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = len(level_sizes)
 
-            # Build each level
+            # Build each compressed level
            for level_idx, target_size in enumerate(level_sizes):
                 # If target size equals total, skip downsampling - use all data
                 if target_size >= filtered_total:
@@ -297,15 +314,20 @@ class Heatmap(BaseComponent):
                     y_range=y_range,
                 )
 
-                # Collect and store
+                # Store LazyFrame for streaming to disk
                 level_key = f'cat_level_{filter_id}_{filter_value}_{level_idx}'
-                self._preprocessed_data[level_key] = level.collect()
+                self._preprocessed_data[level_key] = level  # Keep lazy
+
+            # Add full resolution as final level (for zoom fallback)
+            num_compressed = len(level_sizes)
+            full_res_key = f'cat_level_{filter_id}_{filter_value}_{num_compressed}'
+            self._preprocessed_data[full_res_key] = filtered_data
+            self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = num_compressed + 1
 
         # Also create global levels for when no categorical filter is selected
         # (fallback to standard behavior)
         level_sizes = compute_compression_levels(self._min_points, total)
         self._preprocessed_data['level_sizes'] = level_sizes
-        self._preprocessed_data['num_levels'] = len(level_sizes)
 
         for i, size in enumerate(level_sizes):
             # If target size equals total, skip downsampling - use all data
@@ -329,13 +351,18 @@ class Heatmap(BaseComponent):
                 x_range=x_range,
                 y_range=y_range,
             )
-            self._preprocessed_data[f'level_{i}'] = level.collect()
+            self._preprocessed_data[f'level_{i}'] = level  # Keep lazy
+
+        # Add full resolution as final level (for zoom fallback)
+        num_compressed = len(level_sizes)
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+        self._preprocessed_data['num_levels'] = num_compressed + 1
 
     def _preprocess_streaming(self) -> None:
         """
-        Streaming preprocessing - levels stay lazy until render.
+        Streaming preprocessing - levels stay lazy through caching.
 
-        Builds lazy query plans and collects them for caching.
+        Builds lazy query plans that are streamed to disk via sink_parquet().
         """
         # Get data ranges (minimal collect - just 4 values)
         x_range, y_range = get_data_range(
@@ -379,12 +406,16 @@ class Heatmap(BaseComponent):
                 x_range=x_range,
                 y_range=y_range,
             )
-            # Collect and store as DataFrame for caching
-            # Base class will serialize these to parquet
-            self._preprocessed_data[f'level_{i}'] = level.collect()
+            # Store LazyFrame for streaming to disk
+            # Base class will use sink_parquet() to stream without full materialization
+            self._preprocessed_data[f'level_{i}'] = level  # Keep lazy
 
-        # Store number of levels for reconstruction
-        self._preprocessed_data['num_levels'] = len(level_sizes)
+        # Add full resolution as final level (for zoom fallback)
+        num_compressed = len(level_sizes)
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+
+        # Store number of levels for reconstruction (includes full resolution)
+        self._preprocessed_data['num_levels'] = num_compressed + 1
 
     def _preprocess_eager(self) -> None:
         """
@@ -434,16 +465,21 @@ class Heatmap(BaseComponent):
                 x_bins=self._x_bins,
                 y_bins=self._y_bins,
             )
-            # Collect for caching - store with reversed index
+            # Store LazyFrame for streaming to disk
             level_idx = len(level_sizes) - 1 - i
             if isinstance(downsampled, pl.LazyFrame):
-                self._preprocessed_data[f'level_{level_idx}'] = downsampled.collect()
+                self._preprocessed_data[f'level_{level_idx}'] = downsampled  # Keep lazy
             else:
-                self._preprocessed_data[f'level_{level_idx}'] = downsampled
+                # DataFrame from downsample_2d - convert back to lazy
+                self._preprocessed_data[f'level_{level_idx}'] = downsampled.lazy()
             current = downsampled
 
-        # Store number of levels for reconstruction
-        self._preprocessed_data['num_levels'] = len(level_sizes)
+        # Add full resolution as final level (for zoom fallback)
+        num_compressed = len(level_sizes)
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+
+        # Store number of levels for reconstruction (includes full resolution)
+        self._preprocessed_data['num_levels'] = num_compressed + 1
 
     def _get_levels(self) -> list:
         """
@@ -460,10 +496,6 @@
             if level_data is not None:
                 levels.append(level_data)
 
-        # Add full resolution at end (if raw data available)
-        if self._raw_data is not None:
-            levels.append(self._raw_data)
-
         return levels
 
     def _get_categorical_levels(
@@ -496,13 +528,7 @@
             if level_data is not None:
                 levels.append(level_data)
 
-        # Get filtered raw data for full resolution (if available)
-        filtered_raw = None
-        if self._raw_data is not None and filter_id in self._filters:
-            column_name = self._filters[filter_id]
-            filtered_raw = self._raw_data.filter(pl.col(column_name) == filter_value)
-
-        return levels, filtered_raw
+        return levels, None  # Full resolution included in cached levels
 
     def _get_levels_for_state(self, state: Dict[str, Any]) -> Tuple[list, Optional[pl.LazyFrame]]:
         """
@@ -630,10 +656,11 @@
 
             if count >= self._min_points:
                 # This level has enough detail
-                if count > self._min_points * 2:
-                    # Still too many - downsample further
-                    x_range = self._preprocessed_data.get('x_range')
-                    y_range = self._preprocessed_data.get('y_range')
+                if count > self._min_points:
+                    # Over limit - downsample to stay at/under max
+                    # Use ZOOM range for binning (not global) to avoid sparse bins
+                    zoom_x_range = (x0, x1)
+                    zoom_y_range = (y0, y1)
                     if self._use_streaming or self._use_simple_downsample:
                         if self._use_simple_downsample:
                             return downsample_2d_simple(
@@ -650,8 +677,8 @@
                             intensity_column=self._intensity_column,
                             x_bins=self._x_bins,
                             y_bins=self._y_bins,
-                            x_range=x_range,
-                            y_range=y_range,
+                            x_range=zoom_x_range,
+                            y_range=zoom_y_range,
                         ).collect()
                     else:
                         return downsample_2d(
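
Reviewer note: the switch from the cached global ranges to the zoom window matters for bin geometry. With a fixed bin count, binning a small viewport against the global extent collapses everything on screen into a handful of cells. A back-of-the-envelope illustration (all numbers invented, not from the package):

```python
x_bins = 100
global_x_range = (0.0, 10_000.0)   # full retention-time extent (invented)
zoom_x_range = (4_000.0, 4_100.0)  # current viewport (invented)

# Global-range binning: each bin spans 100 units, so the whole
# 100-unit viewport lands in roughly one bin -> a single sparse blob.
global_bin_width = (global_x_range[1] - global_x_range[0]) / x_bins  # 100.0

# Zoom-range binning: 100 bins of 1 unit each across the viewport,
# so detail matches what is actually visible.
zoom_bin_width = (zoom_x_range[1] - zoom_x_range[0]) / x_bins        # 1.0
```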
@@ -744,7 +771,8 @@
             df_pandas = df_pandas.sort_values(self._intensity_column).reset_index(drop=True)
         else:
             # No filters to apply - levels already filtered by categorical filter
-            available_cols = [c for c in columns_to_select if c in data.columns]
+            schema_names = data.collect_schema().names()
+            available_cols = [c for c in columns_to_select if c in schema_names]
             df_polars = data.select(available_cols).collect()
             # Sort by intensity ascending so high-intensity points are drawn on top
             df_polars = df_polars.sort(self._intensity_column)
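
Reviewer note: the `data.columns` change is a Polars API detail. On a `LazyFrame`, `.columns` resolves the schema implicitly, and recent Polars releases flag it with a `PerformanceWarning`; `collect_schema().names()` is the explicit equivalent and reads no data. For example:

```python
import polars as pl

lf = pl.scan_parquet("peaks.parquet")

# Implicit (warns on recent Polars in hot paths): lf.columns
# Explicit, warning-free, still lazy:
schema_names = lf.collect_schema().names()
available = [c for c in ('mass', 'intensity', 'peak_id') if c in schema_names]
```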
{openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/lineplot.py

@@ -45,6 +45,7 @@ class LinePlot(BaseComponent):
         self,
         cache_id: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
         interactivity: Optional[Dict[str, str]] = None,
@@ -68,6 +69,7 @@ class LinePlot(BaseComponent):
             cache_id: Unique identifier for this component's cache (MANDATORY).
                 Creates a folder {cache_path}/{cache_id}/ for cached data.
             data: Polars LazyFrame with plot data. Optional if cache exists.
+            data_path: Path to parquet file (preferred for large datasets).
             filters: Mapping of identifier names to column names for filtering.
                 Example: {'spectrum': 'scan_id'}
                 When 'spectrum' selection exists, plot shows only data where
@@ -116,11 +118,22 @@ class LinePlot(BaseComponent):
         super().__init__(
             cache_id=cache_id,
             data=data,
+            data_path=data_path,
             filters=filters,
             filter_defaults=filter_defaults,
             interactivity=interactivity,
             cache_path=cache_path,
             regenerate_cache=regenerate_cache,
+            # Pass component-specific params for subprocess recreation
+            x_column=x_column,
+            y_column=y_column,
+            title=title,
+            x_label=x_label,
+            y_label=y_label,
+            highlight_column=highlight_column,
+            annotation_column=annotation_column,
+            styling=styling,
+            config=config,
             **kwargs
         )
 
@@ -208,9 +221,9 @@ class LinePlot(BaseComponent):
             'annotation_column': self._annotation_column,
         }
 
-        # Collect data for caching (filter happens at render time)
-        # Base class will serialize this to parquet
-        self._preprocessed_data['data'] = data.collect()
+        # Store LazyFrame for streaming to disk (filter happens at render time)
+        # Base class will use sink_parquet() to stream without full materialization
+        self._preprocessed_data['data'] = data  # Keep lazy
 
     def _get_vue_component_name(self) -> str:
         """Return the Vue component name."""
{openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/table.py

@@ -49,6 +49,7 @@ class Table(BaseComponent):
         self,
         cache_id: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
         interactivity: Optional[Dict[str, str]] = None,
@@ -72,6 +73,7 @@ class Table(BaseComponent):
             cache_id: Unique identifier for this component's cache (MANDATORY).
                 Creates a folder {cache_path}/{cache_id}/ for cached data.
             data: Polars LazyFrame with table data. Optional if cache exists.
+            data_path: Path to parquet file (preferred for large datasets).
             filters: Mapping of identifier names to column names for filtering.
                 Example: {'spectrum': 'scan_id'}
                 When 'spectrum' selection exists, table shows only rows where
@@ -120,11 +122,22 @@ class Table(BaseComponent):
         super().__init__(
             cache_id=cache_id,
             data=data,
+            data_path=data_path,
             filters=filters,
             filter_defaults=filter_defaults,
             interactivity=interactivity,
             cache_path=cache_path,
             regenerate_cache=regenerate_cache,
+            # Pass component-specific params for subprocess recreation
+            column_definitions=column_definitions,
+            title=title,
+            index_field=index_field,
+            go_to_fields=go_to_fields,
+            layout=layout,
+            default_row=default_row,
+            initial_sort=initial_sort,
+            pagination=pagination,
+            page_size=page_size,
             **kwargs
         )
 
@@ -204,9 +217,9 @@ class Table(BaseComponent):
         # Store column definitions in preprocessed data for serialization
         self._preprocessed_data['column_definitions'] = self._column_definitions
 
-        # Collect data for caching (filter happens at render time)
-        # Base class will serialize this to parquet with optimized row groups
-        self._preprocessed_data['data'] = data.collect()
+        # Store LazyFrame for streaming to disk (filter happens at render time)
+        # Base class will use sink_parquet() to stream without full materialization
+        self._preprocessed_data['data'] = data  # Keep lazy
 
     def _get_columns_to_select(self) -> Optional[List[str]]:
         """Get list of columns needed for this table."""
{openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/base.py

@@ -43,6 +43,7 @@ class BaseComponent(ABC):
         self,
         cache_id: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
         interactivity: Optional[Dict[str, str]] = None,
@@ -57,6 +58,9 @@ class BaseComponent(ABC):
             cache_id: Unique identifier for this component's cache (MANDATORY).
                 Creates a folder {cache_path}/{cache_id}/ for cached data.
             data: Polars LazyFrame with source data. Optional if cache exists.
+            data_path: Path to parquet file with source data. Preferred over
+                data= for large datasets as preprocessing runs in a subprocess
+                to ensure memory is released after cache creation.
             filters: Mapping of identifier names to column names for filtering.
                 Example: {'spectrum': 'scan_id'}
                 When 'spectrum' selection exists, component filters data where
@@ -73,6 +77,10 @@ class BaseComponent(ABC):
             regenerate_cache: If True, regenerate cache even if valid cache exists.
             **kwargs: Component-specific configuration options
         """
+        # Validate inputs
+        if data is not None and data_path is not None:
+            raise ValueError("Provide either 'data' or 'data_path', not both")
+
         self._cache_id = cache_id
         self._cache_dir = get_cache_dir(cache_path, cache_id)
         self._filters = filters or {}
@@ -83,18 +91,33 @@ class BaseComponent(ABC):
 
         # Check if we should load from cache or preprocess
         if regenerate_cache or not self._is_cache_valid():
-            if data is None:
+            if data is None and data_path is None:
                 raise CacheMissError(
                     f"Cache not found at '{self._cache_dir}' and no data provided. "
-                    f"Either provide data= or ensure cache exists from a previous run."
+                    f"Either provide data=, data_path=, or ensure cache exists."
+                )
+
+            if data_path is not None:
+                # Subprocess preprocessing - memory released after cache creation
+                from .subprocess_preprocess import preprocess_component
+                preprocess_component(
+                    type(self),
+                    data_path=data_path,
+                    cache_id=cache_id,
+                    cache_path=cache_path,
+                    filters=filters,
+                    filter_defaults=filter_defaults,
+                    interactivity=interactivity,
+                    **kwargs
                 )
-            self._raw_data = data
-            # Validate columns exist in data
-            self._validate_mappings()
-            # Run component-specific preprocessing
-            self._preprocess()
-            # Save to cache for next time
-            self._save_to_cache()
+                self._raw_data = None
+                self._load_from_cache()
+            else:
+                # In-process preprocessing (backward compatible)
+                self._raw_data = data
+                self._validate_mappings()
+                self._preprocess()
+                self._save_to_cache()
         else:
             # Load from valid cache
             self._raw_data = None
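
Reviewer note: the new `openms_insight/core/subprocess_preprocess.py` module (+96 lines) is listed above but its body is not displayed in this diff. The following is therefore only a plausible sketch of how `preprocess_component` could work: spawn a child process that rebuilds the component with `data=pl.scan_parquet(data_path)`, let it run the in-process branch (`_preprocess()` then `_save_to_cache()`), and exit so the OS reclaims all preprocessing memory. Everything beyond the `preprocess_component` name and its call site is an assumption.

```python
# Hypothetical sketch - the real module is not shown in this diff.
import multiprocessing as mp
import polars as pl

def _worker(component_cls, data_path: str, kwargs: dict) -> None:
    # Building the component with data= takes the in-process branch:
    # _validate_mappings() -> _preprocess() -> _save_to_cache().
    component_cls(data=pl.scan_parquet(data_path), regenerate_cache=True, **kwargs)

def preprocess_component(component_cls, data_path: str, **kwargs) -> None:
    # A spawned child gets a clean heap; on exit, everything it
    # allocated during preprocessing is returned to the OS.
    ctx = mp.get_context("spawn")
    proc = ctx.Process(target=_worker, args=(component_cls, data_path, kwargs))
    proc.start()
    proc.join()
    if proc.exitcode != 0:
        raise RuntimeError(f"Preprocessing subprocess exited with code {proc.exitcode}")
```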
@@ -231,28 +254,18 @@
             "data_values": {},
         }
 
-        # Save preprocessed data
-        row_group_size = self._get_row_group_size()
+        # Save preprocessed data - stream LazyFrames directly to disk
         for key, value in self._preprocessed_data.items():
             if isinstance(value, pl.LazyFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                value.collect().write_parquet(
-                    filepath,
-                    compression='zstd',
-                    statistics=True,
-                    row_group_size=row_group_size,
-                )
+                # Stream directly to disk without full materialization
+                value.sink_parquet(filepath, compression='zstd')
                 manifest["data_files"][key] = filename
             elif isinstance(value, pl.DataFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                value.write_parquet(
-                    filepath,
-                    compression='zstd',
-                    statistics=True,
-                    row_group_size=row_group_size,
-                )
+                value.write_parquet(filepath, compression='zstd')
                 manifest["data_files"][key] = filename
             elif self._is_json_serializable(value):
                 manifest["data_values"][key] = value
@@ -261,6 +274,13 @@
         with open(self._get_manifest_path(), "w") as f:
             json.dump(manifest, f, indent=2)
 
+        # Release memory - data is now safely on disk
+        self._preprocessed_data = {}
+        self._raw_data = None
+
+        # Reload as lazy scan_parquet() references
+        self._load_from_cache()
+
     def _is_json_serializable(self, value: Any) -> bool:
         """Check if value can be JSON serialized."""
         try:
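
Reviewer note: reloading immediately after the sink means the component finishes construction holding only lazy `scan_parquet()` views of its own cache. `_load_from_cache` itself is untouched by this diff; the sketch below is a guess at what it plausibly does, inferred from the manifest structure written in `_save_to_cache()` (the `preprocessed` directory name and the use of `_get_manifest_path` are assumptions).

```python
# Hypothetical sketch of the untouched _load_from_cache().
import json
import polars as pl

def _load_from_cache(self) -> None:
    with open(self._get_manifest_path()) as f:
        manifest = json.load(f)

    # Small JSON-serializable values come back as plain Python objects.
    self._preprocessed_data = dict(manifest["data_values"])

    # Parquet-backed values come back as lazy scans - no I/O happens
    # until a render actually collects them.
    preprocessed_dir = self._cache_dir / "preprocessed"
    for key, filename in manifest["data_files"].items():
        self._preprocessed_data[key] = pl.scan_parquet(preprocessed_dir / filename)
```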