openms-insight 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {openms_insight-0.1.0 → openms_insight-0.1.1}/PKG-INFO +12 -12
  2. {openms_insight-0.1.0 → openms_insight-0.1.1}/README.md +11 -11
  3. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/heatmap.py +64 -36
  4. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/lineplot.py +16 -3
  5. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/table.py +16 -3
  6. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/base.py +43 -23
  7. openms_insight-0.1.1/openms_insight/core/subprocess_preprocess.py +96 -0
  8. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/index.css +1 -1
  9. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/index.js +1 -1
  10. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/preprocessing/filtering.py +8 -15
  11. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/rendering/bridge.py +88 -27
  12. {openms_insight-0.1.0 → openms_insight-0.1.1}/pyproject.toml +1 -1
  13. {openms_insight-0.1.0 → openms_insight-0.1.1}/.gitignore +0 -0
  14. {openms_insight-0.1.0 → openms_insight-0.1.1}/LICENSE +0 -0
  15. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/__init__.py +0 -0
  16. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/__init__.py +0 -0
  17. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/sequenceview.py +0 -0
  18. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/__init__.py +0 -0
  19. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/cache.py +0 -0
  20. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/registry.py +0 -0
  21. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/state.py +0 -0
  22. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.eot +0 -0
  23. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.ttf +0 -0
  24. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff +0 -0
  25. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff2 +0 -0
  26. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/index.html +0 -0
  27. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/preprocessing/__init__.py +0 -0
  28. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/preprocessing/compression.py +0 -0
  29. {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/rendering/__init__.py +0 -0
{openms_insight-0.1.0 → openms_insight-0.1.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openms-insight
-Version: 0.1.0
+Version: 0.1.1
 Summary: Interactive visualization components for mass spectrometry data in Streamlit
 Project-URL: Homepage, https://github.com/t0mdavid-m/OpenMS-Insight
 Project-URL: Documentation, https://github.com/t0mdavid-m/OpenMS-Insight#readme
@@ -43,7 +43,7 @@ Interactive visualization components for mass spectrometry data in Streamlit, ba
 ## Features
 
 - **Cross-component selection linking** via shared identifiers
-- **Polars LazyFrame support** for efficient data handling
+- **Memory-efficient preprocessing** via subprocess isolation
 - **Automatic disk caching** with config-based invalidation
 - **Table component** (Tabulator.js) with filtering, sorting, go-to, pagination
 - **Line plot component** (Plotly.js) with highlighting, annotations, zoom
@@ -60,7 +60,6 @@ pip install openms-insight
 
 ```python
 import streamlit as st
-import polars as pl
 from openms_insight import Table, LinePlot, StateManager
 
 # Create state manager for cross-component linking
@@ -69,7 +68,7 @@ state_manager = StateManager()
 # Create a table - clicking a row sets the 'item' selection
 table = Table(
     cache_id="items_table",
-    data=pl.scan_parquet("items.parquet"),
+    data_path="items.parquet",
     interactivity={'item': 'item_id'},
     column_definitions=[
         {'field': 'item_id', 'title': 'ID', 'sorter': 'number'},
@@ -81,7 +80,7 @@ table(state_manager=state_manager)
 # Create a linked plot - filters by the selected 'item'
 plot = LinePlot(
     cache_id="values_plot",
-    data=pl.scan_parquet("values.parquet"),
+    data_path="values.parquet",
    filters={'item': 'item_id'},
     x_column='x',
     y_column='y',
@@ -100,14 +99,14 @@ Components communicate through **identifiers** using two mechanisms:
 # Master table: no filters, sets 'spectrum' on click
 master = Table(
     cache_id="spectra",
-    data=spectra_data,
+    data_path="spectra.parquet",
     interactivity={'spectrum': 'scan_id'},  # Click -> sets spectrum=scan_id
 )
 
 # Detail table: filters by 'spectrum', sets 'peak' on click
 detail = Table(
     cache_id="peaks",
-    data=peaks_data,
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},  # Filters where scan_id = selected spectrum
     interactivity={'peak': 'peak_id'},  # Click -> sets peak=peak_id
 )
@@ -115,7 +114,7 @@ detail = Table(
 # Plot: filters by 'spectrum', highlights selected 'peak'
 plot = LinePlot(
     cache_id="plot",
-    data=peaks_data,
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},
     interactivity={'peak': 'peak_id'},
     x_column='mass',
@@ -134,7 +133,7 @@ Interactive table using Tabulator.js with filtering dialogs, sorting, pagination
 ```python
 Table(
     cache_id="spectra_table",
-    data=pl.scan_parquet("spectra.parquet"),
+    data_path="spectra.parquet",
     interactivity={'spectrum': 'scan_id'},
     column_definitions=[
         {'field': 'scan_id', 'title': 'Scan', 'sorter': 'number'},
@@ -156,7 +155,7 @@ Stick-style line plot using Plotly.js for mass spectra visualization.
 ```python
 LinePlot(
     cache_id="spectrum_plot",
-    data=pl.scan_parquet("peaks.parquet"),
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},
     interactivity={'peak': 'peak_id'},
     x_column='mass',
@@ -176,7 +175,7 @@ LinePlot(
 ```python
 Heatmap(
     cache_id="peaks_heatmap",
-    data=pl.scan_parquet("all_peaks.parquet"),
+    data_path="all_peaks.parquet",
     x_column='retention_time',
     y_column='mass',
     intensity_column='intensity',
@@ -213,7 +212,8 @@ All components accept these common arguments:
 | Argument | Type | Default | Description |
 |----------|------|---------|-------------|
 | `cache_id` | `str` | **Required** | Unique identifier for disk cache |
-| `data` | `pl.LazyFrame` | `None` | Polars LazyFrame with source data |
+| `data_path` | `str` | `None` | Path to parquet file (preferred - uses subprocess for memory efficiency) |
+| `data` | `pl.LazyFrame` | `None` | Polars LazyFrame (alternative to data_path, in-process preprocessing) |
 | `filters` | `Dict[str, str]` | `None` | Map identifier -> column for filtering |
 | `interactivity` | `Dict[str, str]` | `None` | Map identifier -> column for click actions |
 | `cache_path` | `str` | `"."` | Base directory for cache storage |
{openms_insight-0.1.0 → openms_insight-0.1.1}/README.md

@@ -8,7 +8,7 @@ Interactive visualization components for mass spectrometry data in Streamlit, ba
 ## Features
 
 - **Cross-component selection linking** via shared identifiers
-- **Polars LazyFrame support** for efficient data handling
+- **Memory-efficient preprocessing** via subprocess isolation
 - **Automatic disk caching** with config-based invalidation
 - **Table component** (Tabulator.js) with filtering, sorting, go-to, pagination
 - **Line plot component** (Plotly.js) with highlighting, annotations, zoom
@@ -25,7 +25,6 @@ pip install openms-insight
 
 ```python
 import streamlit as st
-import polars as pl
 from openms_insight import Table, LinePlot, StateManager
 
 # Create state manager for cross-component linking
@@ -34,7 +33,7 @@ state_manager = StateManager()
 # Create a table - clicking a row sets the 'item' selection
 table = Table(
     cache_id="items_table",
-    data=pl.scan_parquet("items.parquet"),
+    data_path="items.parquet",
     interactivity={'item': 'item_id'},
     column_definitions=[
         {'field': 'item_id', 'title': 'ID', 'sorter': 'number'},
@@ -46,7 +45,7 @@ table(state_manager=state_manager)
 # Create a linked plot - filters by the selected 'item'
 plot = LinePlot(
     cache_id="values_plot",
-    data=pl.scan_parquet("values.parquet"),
+    data_path="values.parquet",
     filters={'item': 'item_id'},
     x_column='x',
     y_column='y',
@@ -65,14 +64,14 @@ Components communicate through **identifiers** using two mechanisms:
 # Master table: no filters, sets 'spectrum' on click
 master = Table(
     cache_id="spectra",
-    data=spectra_data,
+    data_path="spectra.parquet",
     interactivity={'spectrum': 'scan_id'},  # Click -> sets spectrum=scan_id
 )
 
 # Detail table: filters by 'spectrum', sets 'peak' on click
 detail = Table(
     cache_id="peaks",
-    data=peaks_data,
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},  # Filters where scan_id = selected spectrum
     interactivity={'peak': 'peak_id'},  # Click -> sets peak=peak_id
 )
@@ -80,7 +79,7 @@ detail = Table(
 # Plot: filters by 'spectrum', highlights selected 'peak'
 plot = LinePlot(
     cache_id="plot",
-    data=peaks_data,
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},
     interactivity={'peak': 'peak_id'},
     x_column='mass',
@@ -99,7 +98,7 @@ Interactive table using Tabulator.js with filtering dialogs, sorting, pagination
 ```python
 Table(
     cache_id="spectra_table",
-    data=pl.scan_parquet("spectra.parquet"),
+    data_path="spectra.parquet",
     interactivity={'spectrum': 'scan_id'},
     column_definitions=[
         {'field': 'scan_id', 'title': 'Scan', 'sorter': 'number'},
@@ -121,7 +120,7 @@ Stick-style line plot using Plotly.js for mass spectra visualization.
 ```python
 LinePlot(
     cache_id="spectrum_plot",
-    data=pl.scan_parquet("peaks.parquet"),
+    data_path="peaks.parquet",
     filters={'spectrum': 'scan_id'},
     interactivity={'peak': 'peak_id'},
     x_column='mass',
@@ -141,7 +140,7 @@ LinePlot(
 ```python
 Heatmap(
     cache_id="peaks_heatmap",
-    data=pl.scan_parquet("all_peaks.parquet"),
+    data_path="all_peaks.parquet",
     x_column='retention_time',
     y_column='mass',
     intensity_column='intensity',
@@ -178,7 +177,8 @@ All components accept these common arguments:
 | Argument | Type | Default | Description |
 |----------|------|---------|-------------|
 | `cache_id` | `str` | **Required** | Unique identifier for disk cache |
-| `data` | `pl.LazyFrame` | `None` | Polars LazyFrame with source data |
+| `data_path` | `str` | `None` | Path to parquet file (preferred - uses subprocess for memory efficiency) |
+| `data` | `pl.LazyFrame` | `None` | Polars LazyFrame (alternative to data_path, in-process preprocessing) |
 | `filters` | `Dict[str, str]` | `None` | Map identifier -> column for filtering |
 | `interactivity` | `Dict[str, str]` | `None` | Map identifier -> column for click actions |
 | `cache_path` | `str` | `"."` | Base directory for cache storage |
{openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/heatmap.py

@@ -69,6 +69,7 @@ class Heatmap(BaseComponent):
         x_column: str,
         y_column: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         intensity_column: str = 'intensity',
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
@@ -97,6 +98,7 @@ class Heatmap(BaseComponent):
             x_column: Name of column for x-axis values
             y_column: Name of column for y-axis values
             data: Polars LazyFrame with heatmap data. Optional if cache exists.
+            data_path: Path to parquet file (preferred for large datasets).
             intensity_column: Name of column for intensity/color values
             filters: Mapping of identifier names to column names for filtering
             interactivity: Mapping of identifier names to column names for clicks.
@@ -142,11 +144,27 @@ class Heatmap(BaseComponent):
         super().__init__(
             cache_id=cache_id,
             data=data,
+            data_path=data_path,
             filters=filters,
             filter_defaults=filter_defaults,
             interactivity=interactivity,
             cache_path=cache_path,
             regenerate_cache=regenerate_cache,
+            # Pass component-specific params for subprocess recreation
+            x_column=x_column,
+            y_column=y_column,
+            intensity_column=intensity_column,
+            min_points=min_points,
+            x_bins=x_bins,
+            y_bins=y_bins,
+            zoom_identifier=zoom_identifier,
+            title=title,
+            x_label=x_label,
+            y_label=y_label,
+            colorscale=colorscale,
+            use_simple_downsample=use_simple_downsample,
+            use_streaming=use_streaming,
+            categorical_filters=categorical_filters,
             **kwargs
         )
 
@@ -271,9 +289,8 @@ class Heatmap(BaseComponent):
 
             # Store level sizes for this filter value
             self._preprocessed_data[f'cat_level_sizes_{filter_id}_{filter_value}'] = level_sizes
-            self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = len(level_sizes)
 
-            # Build each level
+            # Build each compressed level
            for level_idx, target_size in enumerate(level_sizes):
                 # If target size equals total, skip downsampling - use all data
                 if target_size >= filtered_total:
@@ -297,15 +314,20 @@ class Heatmap(BaseComponent):
                     y_range=y_range,
                 )
 
-                # Collect and store
+                # Store LazyFrame for streaming to disk
                 level_key = f'cat_level_{filter_id}_{filter_value}_{level_idx}'
-                self._preprocessed_data[level_key] = level.collect()
+                self._preprocessed_data[level_key] = level  # Keep lazy
+
+            # Add full resolution as final level (for zoom fallback)
+            num_compressed = len(level_sizes)
+            full_res_key = f'cat_level_{filter_id}_{filter_value}_{num_compressed}'
+            self._preprocessed_data[full_res_key] = filtered_data
+            self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = num_compressed + 1
 
         # Also create global levels for when no categorical filter is selected
         # (fallback to standard behavior)
         level_sizes = compute_compression_levels(self._min_points, total)
         self._preprocessed_data['level_sizes'] = level_sizes
-        self._preprocessed_data['num_levels'] = len(level_sizes)
 
         for i, size in enumerate(level_sizes):
             # If target size equals total, skip downsampling - use all data
@@ -329,13 +351,18 @@ class Heatmap(BaseComponent):
                 x_range=x_range,
                 y_range=y_range,
             )
-            self._preprocessed_data[f'level_{i}'] = level.collect()
+            self._preprocessed_data[f'level_{i}'] = level  # Keep lazy
+
+        # Add full resolution as final level (for zoom fallback)
+        num_compressed = len(level_sizes)
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+        self._preprocessed_data['num_levels'] = num_compressed + 1
 
     def _preprocess_streaming(self) -> None:
         """
-        Streaming preprocessing - levels stay lazy until render.
+        Streaming preprocessing - levels stay lazy through caching.
 
-        Builds lazy query plans and collects them for caching.
+        Builds lazy query plans that are streamed to disk via sink_parquet().
         """
         # Get data ranges (minimal collect - just 4 values)
         x_range, y_range = get_data_range(
@@ -379,12 +406,16 @@ class Heatmap(BaseComponent):
                 x_range=x_range,
                 y_range=y_range,
             )
-            # Collect and store as DataFrame for caching
-            # Base class will serialize these to parquet
-            self._preprocessed_data[f'level_{i}'] = level.collect()
+            # Store LazyFrame for streaming to disk
+            # Base class will use sink_parquet() to stream without full materialization
+            self._preprocessed_data[f'level_{i}'] = level  # Keep lazy
 
-        # Store number of levels for reconstruction
-        self._preprocessed_data['num_levels'] = len(level_sizes)
+        # Add full resolution as final level (for zoom fallback)
+        num_compressed = len(level_sizes)
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+
+        # Store number of levels for reconstruction (includes full resolution)
+        self._preprocessed_data['num_levels'] = num_compressed + 1
 
     def _preprocess_eager(self) -> None:
         """
@@ -434,16 +465,21 @@ class Heatmap(BaseComponent):
                 x_bins=self._x_bins,
                 y_bins=self._y_bins,
             )
-            # Collect for caching - store with reversed index
+            # Store LazyFrame for streaming to disk
             level_idx = len(level_sizes) - 1 - i
             if isinstance(downsampled, pl.LazyFrame):
-                self._preprocessed_data[f'level_{level_idx}'] = downsampled.collect()
+                self._preprocessed_data[f'level_{level_idx}'] = downsampled  # Keep lazy
             else:
-                self._preprocessed_data[f'level_{level_idx}'] = downsampled
+                # DataFrame from downsample_2d - convert back to lazy
+                self._preprocessed_data[f'level_{level_idx}'] = downsampled.lazy()
             current = downsampled
 
-        # Store number of levels for reconstruction
-        self._preprocessed_data['num_levels'] = len(level_sizes)
+        # Add full resolution as final level (for zoom fallback)
+        num_compressed = len(level_sizes)
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+
+        # Store number of levels for reconstruction (includes full resolution)
+        self._preprocessed_data['num_levels'] = num_compressed + 1
 
     def _get_levels(self) -> list:
         """
@@ -460,10 +496,6 @@
             if level_data is not None:
                 levels.append(level_data)
 
-        # Add full resolution at end (if raw data available)
-        if self._raw_data is not None:
-            levels.append(self._raw_data)
-
         return levels
 
     def _get_categorical_levels(
@@ -496,13 +528,7 @@
             if level_data is not None:
                 levels.append(level_data)
 
-        # Get filtered raw data for full resolution (if available)
-        filtered_raw = None
-        if self._raw_data is not None and filter_id in self._filters:
-            column_name = self._filters[filter_id]
-            filtered_raw = self._raw_data.filter(pl.col(column_name) == filter_value)
-
-        return levels, filtered_raw
+        return levels, None  # Full resolution included in cached levels
 
     def _get_levels_for_state(self, state: Dict[str, Any]) -> Tuple[list, Optional[pl.LazyFrame]]:
         """
@@ -630,10 +656,11 @@
 
             if count >= self._min_points:
                 # This level has enough detail
-                if count > self._min_points * 2:
-                    # Still too many - downsample further
-                    x_range = self._preprocessed_data.get('x_range')
-                    y_range = self._preprocessed_data.get('y_range')
+                if count > self._min_points:
+                    # Over limit - downsample to stay at/under max
+                    # Use ZOOM range for binning (not global) to avoid sparse bins
+                    zoom_x_range = (x0, x1)
+                    zoom_y_range = (y0, y1)
                     if self._use_streaming or self._use_simple_downsample:
                         if self._use_simple_downsample:
                             return downsample_2d_simple(
@@ -650,8 +677,8 @@
                             intensity_column=self._intensity_column,
                             x_bins=self._x_bins,
                             y_bins=self._y_bins,
-                            x_range=x_range,
-                            y_range=y_range,
+                            x_range=zoom_x_range,
+                            y_range=zoom_y_range,
                         ).collect()
                     else:
                         return downsample_2d(
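
Reviewer note: the switch from the cached global ranges to the zoom window matters for bin geometry. With a fixed bin count, binning a small viewport against the global extent collapses everything on screen into a handful of cells. A back-of-the-envelope illustration (all numbers invented, not from the package):

```python
x_bins = 100
global_x_range = (0.0, 10_000.0)   # full retention-time extent (invented)
zoom_x_range = (4_000.0, 4_100.0)  # current viewport (invented)

# Global-range binning: each bin spans 100 units, so the whole
# 100-unit viewport lands in roughly one bin -> a single sparse blob.
global_bin_width = (global_x_range[1] - global_x_range[0]) / x_bins  # 100.0

# Zoom-range binning: 100 bins of 1 unit each across the viewport,
# so detail matches what is actually visible.
zoom_bin_width = (zoom_x_range[1] - zoom_x_range[0]) / x_bins        # 1.0
```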
@@ -744,7 +771,8 @@
             df_pandas = df_pandas.sort_values(self._intensity_column).reset_index(drop=True)
         else:
             # No filters to apply - levels already filtered by categorical filter
-            available_cols = [c for c in columns_to_select if c in data.columns]
+            schema_names = data.collect_schema().names()
+            available_cols = [c for c in columns_to_select if c in schema_names]
             df_polars = data.select(available_cols).collect()
             # Sort by intensity ascending so high-intensity points are drawn on top
             df_polars = df_polars.sort(self._intensity_column)
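
Reviewer note: the `data.columns` change is a Polars API detail. On a `LazyFrame`, `.columns` resolves the schema implicitly, and recent Polars releases flag it with a `PerformanceWarning`; `collect_schema().names()` is the explicit equivalent and reads no data. For example:

```python
import polars as pl

lf = pl.scan_parquet("peaks.parquet")

# Implicit (warns on recent Polars in hot paths): lf.columns
# Explicit, warning-free, still lazy:
schema_names = lf.collect_schema().names()
available = [c for c in ('mass', 'intensity', 'peak_id') if c in schema_names]
```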
{openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/lineplot.py

@@ -45,6 +45,7 @@ class LinePlot(BaseComponent):
         self,
         cache_id: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
         interactivity: Optional[Dict[str, str]] = None,
@@ -68,6 +69,7 @@ class LinePlot(BaseComponent):
             cache_id: Unique identifier for this component's cache (MANDATORY).
                 Creates a folder {cache_path}/{cache_id}/ for cached data.
             data: Polars LazyFrame with plot data. Optional if cache exists.
+            data_path: Path to parquet file (preferred for large datasets).
             filters: Mapping of identifier names to column names for filtering.
                 Example: {'spectrum': 'scan_id'}
                 When 'spectrum' selection exists, plot shows only data where
@@ -116,11 +118,22 @@ class LinePlot(BaseComponent):
         super().__init__(
             cache_id=cache_id,
             data=data,
+            data_path=data_path,
             filters=filters,
             filter_defaults=filter_defaults,
             interactivity=interactivity,
             cache_path=cache_path,
             regenerate_cache=regenerate_cache,
+            # Pass component-specific params for subprocess recreation
+            x_column=x_column,
+            y_column=y_column,
+            title=title,
+            x_label=x_label,
+            y_label=y_label,
+            highlight_column=highlight_column,
+            annotation_column=annotation_column,
+            styling=styling,
+            config=config,
             **kwargs
         )
 
@@ -208,9 +221,9 @@ class LinePlot(BaseComponent):
             'annotation_column': self._annotation_column,
         }
 
-        # Collect data for caching (filter happens at render time)
-        # Base class will serialize this to parquet
-        self._preprocessed_data['data'] = data.collect()
+        # Store LazyFrame for streaming to disk (filter happens at render time)
+        # Base class will use sink_parquet() to stream without full materialization
+        self._preprocessed_data['data'] = data  # Keep lazy
 
     def _get_vue_component_name(self) -> str:
         """Return the Vue component name."""
{openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/table.py

@@ -49,6 +49,7 @@ class Table(BaseComponent):
         self,
         cache_id: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
         interactivity: Optional[Dict[str, str]] = None,
@@ -72,6 +73,7 @@ class Table(BaseComponent):
             cache_id: Unique identifier for this component's cache (MANDATORY).
                 Creates a folder {cache_path}/{cache_id}/ for cached data.
             data: Polars LazyFrame with table data. Optional if cache exists.
+            data_path: Path to parquet file (preferred for large datasets).
             filters: Mapping of identifier names to column names for filtering.
                 Example: {'spectrum': 'scan_id'}
                 When 'spectrum' selection exists, table shows only rows where
@@ -120,11 +122,22 @@ class Table(BaseComponent):
         super().__init__(
             cache_id=cache_id,
             data=data,
+            data_path=data_path,
             filters=filters,
             filter_defaults=filter_defaults,
             interactivity=interactivity,
             cache_path=cache_path,
             regenerate_cache=regenerate_cache,
+            # Pass component-specific params for subprocess recreation
+            column_definitions=column_definitions,
+            title=title,
+            index_field=index_field,
+            go_to_fields=go_to_fields,
+            layout=layout,
+            default_row=default_row,
+            initial_sort=initial_sort,
+            pagination=pagination,
+            page_size=page_size,
             **kwargs
         )
 
@@ -204,9 +217,9 @@ class Table(BaseComponent):
         # Store column definitions in preprocessed data for serialization
         self._preprocessed_data['column_definitions'] = self._column_definitions
 
-        # Collect data for caching (filter happens at render time)
-        # Base class will serialize this to parquet with optimized row groups
-        self._preprocessed_data['data'] = data.collect()
+        # Store LazyFrame for streaming to disk (filter happens at render time)
+        # Base class will use sink_parquet() to stream without full materialization
+        self._preprocessed_data['data'] = data  # Keep lazy
 
     def _get_columns_to_select(self) -> Optional[List[str]]:
         """Get list of columns needed for this table."""
{openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/base.py

@@ -43,6 +43,7 @@ class BaseComponent(ABC):
         self,
         cache_id: str,
         data: Optional[pl.LazyFrame] = None,
+        data_path: Optional[str] = None,
         filters: Optional[Dict[str, str]] = None,
         filter_defaults: Optional[Dict[str, Any]] = None,
         interactivity: Optional[Dict[str, str]] = None,
@@ -57,6 +58,9 @@ class BaseComponent(ABC):
             cache_id: Unique identifier for this component's cache (MANDATORY).
                 Creates a folder {cache_path}/{cache_id}/ for cached data.
             data: Polars LazyFrame with source data. Optional if cache exists.
+            data_path: Path to parquet file with source data. Preferred over
+                data= for large datasets as preprocessing runs in a subprocess
+                to ensure memory is released after cache creation.
             filters: Mapping of identifier names to column names for filtering.
                 Example: {'spectrum': 'scan_id'}
                 When 'spectrum' selection exists, component filters data where
@@ -73,6 +77,10 @@ class BaseComponent(ABC):
             regenerate_cache: If True, regenerate cache even if valid cache exists.
             **kwargs: Component-specific configuration options
         """
+        # Validate inputs
+        if data is not None and data_path is not None:
+            raise ValueError("Provide either 'data' or 'data_path', not both")
+
         self._cache_id = cache_id
         self._cache_dir = get_cache_dir(cache_path, cache_id)
         self._filters = filters or {}
@@ -83,18 +91,33 @@ class BaseComponent(ABC):
 
         # Check if we should load from cache or preprocess
         if regenerate_cache or not self._is_cache_valid():
-            if data is None:
+            if data is None and data_path is None:
                 raise CacheMissError(
                     f"Cache not found at '{self._cache_dir}' and no data provided. "
-                    f"Either provide data= or ensure cache exists from a previous run."
+                    f"Either provide data=, data_path=, or ensure cache exists."
+                )
+
+            if data_path is not None:
+                # Subprocess preprocessing - memory released after cache creation
+                from .subprocess_preprocess import preprocess_component
+                preprocess_component(
+                    type(self),
+                    data_path=data_path,
+                    cache_id=cache_id,
+                    cache_path=cache_path,
+                    filters=filters,
+                    filter_defaults=filter_defaults,
+                    interactivity=interactivity,
+                    **kwargs
                 )
-            self._raw_data = data
-            # Validate columns exist in data
-            self._validate_mappings()
-            # Run component-specific preprocessing
-            self._preprocess()
-            # Save to cache for next time
-            self._save_to_cache()
+                self._raw_data = None
+                self._load_from_cache()
+            else:
+                # In-process preprocessing (backward compatible)
+                self._raw_data = data
+                self._validate_mappings()
+                self._preprocess()
+                self._save_to_cache()
         else:
             # Load from valid cache
             self._raw_data = None
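
Reviewer note: the new `openms_insight/core/subprocess_preprocess.py` module (+96 lines) is listed above but its body is not displayed in this diff. The following is therefore only a plausible sketch of how `preprocess_component` could work: spawn a child process that rebuilds the component with `data=pl.scan_parquet(data_path)`, let it run the in-process branch (`_preprocess()` then `_save_to_cache()`), and exit so the OS reclaims all preprocessing memory. Everything beyond the `preprocess_component` name and its call site is an assumption.

```python
# Hypothetical sketch - the real module is not shown in this diff.
import multiprocessing as mp
import polars as pl

def _worker(component_cls, data_path: str, kwargs: dict) -> None:
    # Building the component with data= takes the in-process branch:
    # _validate_mappings() -> _preprocess() -> _save_to_cache().
    component_cls(data=pl.scan_parquet(data_path), regenerate_cache=True, **kwargs)

def preprocess_component(component_cls, data_path: str, **kwargs) -> None:
    # A spawned child gets a clean heap; on exit, everything it
    # allocated during preprocessing is returned to the OS.
    ctx = mp.get_context("spawn")
    proc = ctx.Process(target=_worker, args=(component_cls, data_path, kwargs))
    proc.start()
    proc.join()
    if proc.exitcode != 0:
        raise RuntimeError(f"Preprocessing subprocess exited with code {proc.exitcode}")
```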
@@ -231,28 +254,18 @@
             "data_values": {},
         }
 
-        # Save preprocessed data
-        row_group_size = self._get_row_group_size()
+        # Save preprocessed data - stream LazyFrames directly to disk
         for key, value in self._preprocessed_data.items():
             if isinstance(value, pl.LazyFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                value.collect().write_parquet(
-                    filepath,
-                    compression='zstd',
-                    statistics=True,
-                    row_group_size=row_group_size,
-                )
+                # Stream directly to disk without full materialization
+                value.sink_parquet(filepath, compression='zstd')
                 manifest["data_files"][key] = filename
             elif isinstance(value, pl.DataFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                value.write_parquet(
-                    filepath,
-                    compression='zstd',
-                    statistics=True,
-                    row_group_size=row_group_size,
-                )
+                value.write_parquet(filepath, compression='zstd')
                 manifest["data_files"][key] = filename
             elif self._is_json_serializable(value):
                 manifest["data_values"][key] = value
@@ -261,6 +274,13 @@
         with open(self._get_manifest_path(), "w") as f:
             json.dump(manifest, f, indent=2)
 
+        # Release memory - data is now safely on disk
+        self._preprocessed_data = {}
+        self._raw_data = None
+
+        # Reload as lazy scan_parquet() references
+        self._load_from_cache()
+
     def _is_json_serializable(self, value: Any) -> bool:
         """Check if value can be JSON serialized."""
         try:
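
Reviewer note: reloading immediately after the sink means the component finishes construction holding only lazy `scan_parquet()` views of its own cache. `_load_from_cache` itself is untouched by this diff; the sketch below is a guess at what it plausibly does, inferred from the manifest structure written in `_save_to_cache()` (the `preprocessed` directory name and the use of `_get_manifest_path` are assumptions).

```python
# Hypothetical sketch of the untouched _load_from_cache().
import json
import polars as pl

def _load_from_cache(self) -> None:
    with open(self._get_manifest_path()) as f:
        manifest = json.load(f)

    # Small JSON-serializable values come back as plain Python objects.
    self._preprocessed_data = dict(manifest["data_values"])

    # Parquet-backed values come back as lazy scans - no I/O happens
    # until a render actually collects them.
    preprocessed_dir = self._cache_dir / "preprocessed"
    for key, filename in manifest["data_files"].items():
        self._preprocessed_data[key] = pl.scan_parquet(preprocessed_dir / filename)
```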