openms-insight 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openms_insight-0.1.0 → openms_insight-0.1.1}/PKG-INFO +12 -12
- {openms_insight-0.1.0 → openms_insight-0.1.1}/README.md +11 -11
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/heatmap.py +64 -36
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/lineplot.py +16 -3
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/table.py +16 -3
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/base.py +43 -23
- openms_insight-0.1.1/openms_insight/core/subprocess_preprocess.py +96 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/index.css +1 -1
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/index.js +1 -1
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/preprocessing/filtering.py +8 -15
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/rendering/bridge.py +88 -27
- {openms_insight-0.1.0 → openms_insight-0.1.1}/pyproject.toml +1 -1
- {openms_insight-0.1.0 → openms_insight-0.1.1}/.gitignore +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/LICENSE +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/__init__.py +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/__init__.py +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/components/sequenceview.py +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/__init__.py +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/cache.py +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/registry.py +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/core/state.py +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.eot +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.ttf +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff2 +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/js-component/dist/index.html +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/preprocessing/__init__.py +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/preprocessing/compression.py +0 -0
- {openms_insight-0.1.0 → openms_insight-0.1.1}/openms_insight/rendering/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: openms-insight
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Interactive visualization components for mass spectrometry data in Streamlit
|
|
5
5
|
Project-URL: Homepage, https://github.com/t0mdavid-m/OpenMS-Insight
|
|
6
6
|
Project-URL: Documentation, https://github.com/t0mdavid-m/OpenMS-Insight#readme
|
|
@@ -43,7 +43,7 @@ Interactive visualization components for mass spectrometry data in Streamlit, ba
|
|
|
43
43
|
## Features
|
|
44
44
|
|
|
45
45
|
- **Cross-component selection linking** via shared identifiers
|
|
46
|
-
- **
|
|
46
|
+
- **Memory-efficient preprocessing** via subprocess isolation
|
|
47
47
|
- **Automatic disk caching** with config-based invalidation
|
|
48
48
|
- **Table component** (Tabulator.js) with filtering, sorting, go-to, pagination
|
|
49
49
|
- **Line plot component** (Plotly.js) with highlighting, annotations, zoom
|
|
@@ -60,7 +60,6 @@ pip install openms-insight
|
|
|
60
60
|
|
|
61
61
|
```python
|
|
62
62
|
import streamlit as st
|
|
63
|
-
import polars as pl
|
|
64
63
|
from openms_insight import Table, LinePlot, StateManager
|
|
65
64
|
|
|
66
65
|
# Create state manager for cross-component linking
|
|
@@ -69,7 +68,7 @@ state_manager = StateManager()
|
|
|
69
68
|
# Create a table - clicking a row sets the 'item' selection
|
|
70
69
|
table = Table(
|
|
71
70
|
cache_id="items_table",
|
|
72
|
-
|
|
71
|
+
data_path="items.parquet",
|
|
73
72
|
interactivity={'item': 'item_id'},
|
|
74
73
|
column_definitions=[
|
|
75
74
|
{'field': 'item_id', 'title': 'ID', 'sorter': 'number'},
|
|
@@ -81,7 +80,7 @@ table(state_manager=state_manager)
|
|
|
81
80
|
# Create a linked plot - filters by the selected 'item'
|
|
82
81
|
plot = LinePlot(
|
|
83
82
|
cache_id="values_plot",
|
|
84
|
-
|
|
83
|
+
data_path="values.parquet",
|
|
85
84
|
filters={'item': 'item_id'},
|
|
86
85
|
x_column='x',
|
|
87
86
|
y_column='y',
|
|
@@ -100,14 +99,14 @@ Components communicate through **identifiers** using two mechanisms:
|
|
|
100
99
|
# Master table: no filters, sets 'spectrum' on click
|
|
101
100
|
master = Table(
|
|
102
101
|
cache_id="spectra",
|
|
103
|
-
|
|
102
|
+
data_path="spectra.parquet",
|
|
104
103
|
interactivity={'spectrum': 'scan_id'}, # Click -> sets spectrum=scan_id
|
|
105
104
|
)
|
|
106
105
|
|
|
107
106
|
# Detail table: filters by 'spectrum', sets 'peak' on click
|
|
108
107
|
detail = Table(
|
|
109
108
|
cache_id="peaks",
|
|
110
|
-
|
|
109
|
+
data_path="peaks.parquet",
|
|
111
110
|
filters={'spectrum': 'scan_id'}, # Filters where scan_id = selected spectrum
|
|
112
111
|
interactivity={'peak': 'peak_id'}, # Click -> sets peak=peak_id
|
|
113
112
|
)
|
|
@@ -115,7 +114,7 @@ detail = Table(
|
|
|
115
114
|
# Plot: filters by 'spectrum', highlights selected 'peak'
|
|
116
115
|
plot = LinePlot(
|
|
117
116
|
cache_id="plot",
|
|
118
|
-
|
|
117
|
+
data_path="peaks.parquet",
|
|
119
118
|
filters={'spectrum': 'scan_id'},
|
|
120
119
|
interactivity={'peak': 'peak_id'},
|
|
121
120
|
x_column='mass',
|
|
@@ -134,7 +133,7 @@ Interactive table using Tabulator.js with filtering dialogs, sorting, pagination
|
|
|
134
133
|
```python
|
|
135
134
|
Table(
|
|
136
135
|
cache_id="spectra_table",
|
|
137
|
-
|
|
136
|
+
data_path="spectra.parquet",
|
|
138
137
|
interactivity={'spectrum': 'scan_id'},
|
|
139
138
|
column_definitions=[
|
|
140
139
|
{'field': 'scan_id', 'title': 'Scan', 'sorter': 'number'},
|
|
@@ -156,7 +155,7 @@ Stick-style line plot using Plotly.js for mass spectra visualization.
|
|
|
156
155
|
```python
|
|
157
156
|
LinePlot(
|
|
158
157
|
cache_id="spectrum_plot",
|
|
159
|
-
|
|
158
|
+
data_path="peaks.parquet",
|
|
160
159
|
filters={'spectrum': 'scan_id'},
|
|
161
160
|
interactivity={'peak': 'peak_id'},
|
|
162
161
|
x_column='mass',
|
|
@@ -176,7 +175,7 @@ LinePlot(
|
|
|
176
175
|
```python
|
|
177
176
|
Heatmap(
|
|
178
177
|
cache_id="peaks_heatmap",
|
|
179
|
-
|
|
178
|
+
data_path="all_peaks.parquet",
|
|
180
179
|
x_column='retention_time',
|
|
181
180
|
y_column='mass',
|
|
182
181
|
intensity_column='intensity',
|
|
@@ -213,7 +212,8 @@ All components accept these common arguments:
|
|
|
213
212
|
| Argument | Type | Default | Description |
|
|
214
213
|
|----------|------|---------|-------------|
|
|
215
214
|
| `cache_id` | `str` | **Required** | Unique identifier for disk cache |
|
|
216
|
-
| `
|
|
215
|
+
| `data_path` | `str` | `None` | Path to parquet file (preferred - uses subprocess for memory efficiency) |
|
|
216
|
+
| `data` | `pl.LazyFrame` | `None` | Polars LazyFrame (alternative to data_path, in-process preprocessing) |
|
|
217
217
|
| `filters` | `Dict[str, str]` | `None` | Map identifier -> column for filtering |
|
|
218
218
|
| `interactivity` | `Dict[str, str]` | `None` | Map identifier -> column for click actions |
|
|
219
219
|
| `cache_path` | `str` | `"."` | Base directory for cache storage |
|
|
@@ -8,7 +8,7 @@ Interactive visualization components for mass spectrometry data in Streamlit, ba
|
|
|
8
8
|
## Features
|
|
9
9
|
|
|
10
10
|
- **Cross-component selection linking** via shared identifiers
|
|
11
|
-
- **
|
|
11
|
+
- **Memory-efficient preprocessing** via subprocess isolation
|
|
12
12
|
- **Automatic disk caching** with config-based invalidation
|
|
13
13
|
- **Table component** (Tabulator.js) with filtering, sorting, go-to, pagination
|
|
14
14
|
- **Line plot component** (Plotly.js) with highlighting, annotations, zoom
|
|
@@ -25,7 +25,6 @@ pip install openms-insight
|
|
|
25
25
|
|
|
26
26
|
```python
|
|
27
27
|
import streamlit as st
|
|
28
|
-
import polars as pl
|
|
29
28
|
from openms_insight import Table, LinePlot, StateManager
|
|
30
29
|
|
|
31
30
|
# Create state manager for cross-component linking
|
|
@@ -34,7 +33,7 @@ state_manager = StateManager()
|
|
|
34
33
|
# Create a table - clicking a row sets the 'item' selection
|
|
35
34
|
table = Table(
|
|
36
35
|
cache_id="items_table",
|
|
37
|
-
|
|
36
|
+
data_path="items.parquet",
|
|
38
37
|
interactivity={'item': 'item_id'},
|
|
39
38
|
column_definitions=[
|
|
40
39
|
{'field': 'item_id', 'title': 'ID', 'sorter': 'number'},
|
|
@@ -46,7 +45,7 @@ table(state_manager=state_manager)
|
|
|
46
45
|
# Create a linked plot - filters by the selected 'item'
|
|
47
46
|
plot = LinePlot(
|
|
48
47
|
cache_id="values_plot",
|
|
49
|
-
|
|
48
|
+
data_path="values.parquet",
|
|
50
49
|
filters={'item': 'item_id'},
|
|
51
50
|
x_column='x',
|
|
52
51
|
y_column='y',
|
|
@@ -65,14 +64,14 @@ Components communicate through **identifiers** using two mechanisms:
|
|
|
65
64
|
# Master table: no filters, sets 'spectrum' on click
|
|
66
65
|
master = Table(
|
|
67
66
|
cache_id="spectra",
|
|
68
|
-
|
|
67
|
+
data_path="spectra.parquet",
|
|
69
68
|
interactivity={'spectrum': 'scan_id'}, # Click -> sets spectrum=scan_id
|
|
70
69
|
)
|
|
71
70
|
|
|
72
71
|
# Detail table: filters by 'spectrum', sets 'peak' on click
|
|
73
72
|
detail = Table(
|
|
74
73
|
cache_id="peaks",
|
|
75
|
-
|
|
74
|
+
data_path="peaks.parquet",
|
|
76
75
|
filters={'spectrum': 'scan_id'}, # Filters where scan_id = selected spectrum
|
|
77
76
|
interactivity={'peak': 'peak_id'}, # Click -> sets peak=peak_id
|
|
78
77
|
)
|
|
@@ -80,7 +79,7 @@ detail = Table(
|
|
|
80
79
|
# Plot: filters by 'spectrum', highlights selected 'peak'
|
|
81
80
|
plot = LinePlot(
|
|
82
81
|
cache_id="plot",
|
|
83
|
-
|
|
82
|
+
data_path="peaks.parquet",
|
|
84
83
|
filters={'spectrum': 'scan_id'},
|
|
85
84
|
interactivity={'peak': 'peak_id'},
|
|
86
85
|
x_column='mass',
|
|
@@ -99,7 +98,7 @@ Interactive table using Tabulator.js with filtering dialogs, sorting, pagination
|
|
|
99
98
|
```python
|
|
100
99
|
Table(
|
|
101
100
|
cache_id="spectra_table",
|
|
102
|
-
|
|
101
|
+
data_path="spectra.parquet",
|
|
103
102
|
interactivity={'spectrum': 'scan_id'},
|
|
104
103
|
column_definitions=[
|
|
105
104
|
{'field': 'scan_id', 'title': 'Scan', 'sorter': 'number'},
|
|
@@ -121,7 +120,7 @@ Stick-style line plot using Plotly.js for mass spectra visualization.
|
|
|
121
120
|
```python
|
|
122
121
|
LinePlot(
|
|
123
122
|
cache_id="spectrum_plot",
|
|
124
|
-
|
|
123
|
+
data_path="peaks.parquet",
|
|
125
124
|
filters={'spectrum': 'scan_id'},
|
|
126
125
|
interactivity={'peak': 'peak_id'},
|
|
127
126
|
x_column='mass',
|
|
@@ -141,7 +140,7 @@ LinePlot(
|
|
|
141
140
|
```python
|
|
142
141
|
Heatmap(
|
|
143
142
|
cache_id="peaks_heatmap",
|
|
144
|
-
|
|
143
|
+
data_path="all_peaks.parquet",
|
|
145
144
|
x_column='retention_time',
|
|
146
145
|
y_column='mass',
|
|
147
146
|
intensity_column='intensity',
|
|
@@ -178,7 +177,8 @@ All components accept these common arguments:
|
|
|
178
177
|
| Argument | Type | Default | Description |
|
|
179
178
|
|----------|------|---------|-------------|
|
|
180
179
|
| `cache_id` | `str` | **Required** | Unique identifier for disk cache |
|
|
181
|
-
| `
|
|
180
|
+
| `data_path` | `str` | `None` | Path to parquet file (preferred - uses subprocess for memory efficiency) |
|
|
181
|
+
| `data` | `pl.LazyFrame` | `None` | Polars LazyFrame (alternative to data_path, in-process preprocessing) |
|
|
182
182
|
| `filters` | `Dict[str, str]` | `None` | Map identifier -> column for filtering |
|
|
183
183
|
| `interactivity` | `Dict[str, str]` | `None` | Map identifier -> column for click actions |
|
|
184
184
|
| `cache_path` | `str` | `"."` | Base directory for cache storage |
|
|
@@ -69,6 +69,7 @@ class Heatmap(BaseComponent):
|
|
|
69
69
|
x_column: str,
|
|
70
70
|
y_column: str,
|
|
71
71
|
data: Optional[pl.LazyFrame] = None,
|
|
72
|
+
data_path: Optional[str] = None,
|
|
72
73
|
intensity_column: str = 'intensity',
|
|
73
74
|
filters: Optional[Dict[str, str]] = None,
|
|
74
75
|
filter_defaults: Optional[Dict[str, Any]] = None,
|
|
@@ -97,6 +98,7 @@ class Heatmap(BaseComponent):
|
|
|
97
98
|
x_column: Name of column for x-axis values
|
|
98
99
|
y_column: Name of column for y-axis values
|
|
99
100
|
data: Polars LazyFrame with heatmap data. Optional if cache exists.
|
|
101
|
+
data_path: Path to parquet file (preferred for large datasets).
|
|
100
102
|
intensity_column: Name of column for intensity/color values
|
|
101
103
|
filters: Mapping of identifier names to column names for filtering
|
|
102
104
|
interactivity: Mapping of identifier names to column names for clicks.
|
|
@@ -142,11 +144,27 @@ class Heatmap(BaseComponent):
|
|
|
142
144
|
super().__init__(
|
|
143
145
|
cache_id=cache_id,
|
|
144
146
|
data=data,
|
|
147
|
+
data_path=data_path,
|
|
145
148
|
filters=filters,
|
|
146
149
|
filter_defaults=filter_defaults,
|
|
147
150
|
interactivity=interactivity,
|
|
148
151
|
cache_path=cache_path,
|
|
149
152
|
regenerate_cache=regenerate_cache,
|
|
153
|
+
# Pass component-specific params for subprocess recreation
|
|
154
|
+
x_column=x_column,
|
|
155
|
+
y_column=y_column,
|
|
156
|
+
intensity_column=intensity_column,
|
|
157
|
+
min_points=min_points,
|
|
158
|
+
x_bins=x_bins,
|
|
159
|
+
y_bins=y_bins,
|
|
160
|
+
zoom_identifier=zoom_identifier,
|
|
161
|
+
title=title,
|
|
162
|
+
x_label=x_label,
|
|
163
|
+
y_label=y_label,
|
|
164
|
+
colorscale=colorscale,
|
|
165
|
+
use_simple_downsample=use_simple_downsample,
|
|
166
|
+
use_streaming=use_streaming,
|
|
167
|
+
categorical_filters=categorical_filters,
|
|
150
168
|
**kwargs
|
|
151
169
|
)
|
|
152
170
|
|
|
@@ -271,9 +289,8 @@ class Heatmap(BaseComponent):
|
|
|
271
289
|
|
|
272
290
|
# Store level sizes for this filter value
|
|
273
291
|
self._preprocessed_data[f'cat_level_sizes_{filter_id}_{filter_value}'] = level_sizes
|
|
274
|
-
self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = len(level_sizes)
|
|
275
292
|
|
|
276
|
-
# Build each level
|
|
293
|
+
# Build each compressed level
|
|
277
294
|
for level_idx, target_size in enumerate(level_sizes):
|
|
278
295
|
# If target size equals total, skip downsampling - use all data
|
|
279
296
|
if target_size >= filtered_total:
|
|
@@ -297,15 +314,20 @@ class Heatmap(BaseComponent):
|
|
|
297
314
|
y_range=y_range,
|
|
298
315
|
)
|
|
299
316
|
|
|
300
|
-
#
|
|
317
|
+
# Store LazyFrame for streaming to disk
|
|
301
318
|
level_key = f'cat_level_{filter_id}_{filter_value}_{level_idx}'
|
|
302
|
-
self._preprocessed_data[level_key] = level
|
|
319
|
+
self._preprocessed_data[level_key] = level # Keep lazy
|
|
320
|
+
|
|
321
|
+
# Add full resolution as final level (for zoom fallback)
|
|
322
|
+
num_compressed = len(level_sizes)
|
|
323
|
+
full_res_key = f'cat_level_{filter_id}_{filter_value}_{num_compressed}'
|
|
324
|
+
self._preprocessed_data[full_res_key] = filtered_data
|
|
325
|
+
self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = num_compressed + 1
|
|
303
326
|
|
|
304
327
|
# Also create global levels for when no categorical filter is selected
|
|
305
328
|
# (fallback to standard behavior)
|
|
306
329
|
level_sizes = compute_compression_levels(self._min_points, total)
|
|
307
330
|
self._preprocessed_data['level_sizes'] = level_sizes
|
|
308
|
-
self._preprocessed_data['num_levels'] = len(level_sizes)
|
|
309
331
|
|
|
310
332
|
for i, size in enumerate(level_sizes):
|
|
311
333
|
# If target size equals total, skip downsampling - use all data
|
|
@@ -329,13 +351,18 @@ class Heatmap(BaseComponent):
|
|
|
329
351
|
x_range=x_range,
|
|
330
352
|
y_range=y_range,
|
|
331
353
|
)
|
|
332
|
-
self._preprocessed_data[f'level_{i}'] = level
|
|
354
|
+
self._preprocessed_data[f'level_{i}'] = level # Keep lazy
|
|
355
|
+
|
|
356
|
+
# Add full resolution as final level (for zoom fallback)
|
|
357
|
+
num_compressed = len(level_sizes)
|
|
358
|
+
self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
|
|
359
|
+
self._preprocessed_data['num_levels'] = num_compressed + 1
|
|
333
360
|
|
|
334
361
|
def _preprocess_streaming(self) -> None:
|
|
335
362
|
"""
|
|
336
|
-
Streaming preprocessing - levels stay lazy
|
|
363
|
+
Streaming preprocessing - levels stay lazy through caching.
|
|
337
364
|
|
|
338
|
-
Builds lazy query plans
|
|
365
|
+
Builds lazy query plans that are streamed to disk via sink_parquet().
|
|
339
366
|
"""
|
|
340
367
|
# Get data ranges (minimal collect - just 4 values)
|
|
341
368
|
x_range, y_range = get_data_range(
|
|
@@ -379,12 +406,16 @@ class Heatmap(BaseComponent):
|
|
|
379
406
|
x_range=x_range,
|
|
380
407
|
y_range=y_range,
|
|
381
408
|
)
|
|
382
|
-
#
|
|
383
|
-
# Base class will
|
|
384
|
-
self._preprocessed_data[f'level_{i}'] = level
|
|
409
|
+
# Store LazyFrame for streaming to disk
|
|
410
|
+
# Base class will use sink_parquet() to stream without full materialization
|
|
411
|
+
self._preprocessed_data[f'level_{i}'] = level # Keep lazy
|
|
385
412
|
|
|
386
|
-
#
|
|
387
|
-
|
|
413
|
+
# Add full resolution as final level (for zoom fallback)
|
|
414
|
+
num_compressed = len(level_sizes)
|
|
415
|
+
self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
|
|
416
|
+
|
|
417
|
+
# Store number of levels for reconstruction (includes full resolution)
|
|
418
|
+
self._preprocessed_data['num_levels'] = num_compressed + 1
|
|
388
419
|
|
|
389
420
|
def _preprocess_eager(self) -> None:
|
|
390
421
|
"""
|
|
@@ -434,16 +465,21 @@ class Heatmap(BaseComponent):
|
|
|
434
465
|
x_bins=self._x_bins,
|
|
435
466
|
y_bins=self._y_bins,
|
|
436
467
|
)
|
|
437
|
-
#
|
|
468
|
+
# Store LazyFrame for streaming to disk
|
|
438
469
|
level_idx = len(level_sizes) - 1 - i
|
|
439
470
|
if isinstance(downsampled, pl.LazyFrame):
|
|
440
|
-
self._preprocessed_data[f'level_{level_idx}'] = downsampled
|
|
471
|
+
self._preprocessed_data[f'level_{level_idx}'] = downsampled # Keep lazy
|
|
441
472
|
else:
|
|
442
|
-
|
|
473
|
+
# DataFrame from downsample_2d - convert back to lazy
|
|
474
|
+
self._preprocessed_data[f'level_{level_idx}'] = downsampled.lazy()
|
|
443
475
|
current = downsampled
|
|
444
476
|
|
|
445
|
-
#
|
|
446
|
-
|
|
477
|
+
# Add full resolution as final level (for zoom fallback)
|
|
478
|
+
num_compressed = len(level_sizes)
|
|
479
|
+
self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
|
|
480
|
+
|
|
481
|
+
# Store number of levels for reconstruction (includes full resolution)
|
|
482
|
+
self._preprocessed_data['num_levels'] = num_compressed + 1
|
|
447
483
|
|
|
448
484
|
def _get_levels(self) -> list:
|
|
449
485
|
"""
|
|
@@ -460,10 +496,6 @@ class Heatmap(BaseComponent):
|
|
|
460
496
|
if level_data is not None:
|
|
461
497
|
levels.append(level_data)
|
|
462
498
|
|
|
463
|
-
# Add full resolution at end (if raw data available)
|
|
464
|
-
if self._raw_data is not None:
|
|
465
|
-
levels.append(self._raw_data)
|
|
466
|
-
|
|
467
499
|
return levels
|
|
468
500
|
|
|
469
501
|
def _get_categorical_levels(
|
|
@@ -496,13 +528,7 @@ class Heatmap(BaseComponent):
|
|
|
496
528
|
if level_data is not None:
|
|
497
529
|
levels.append(level_data)
|
|
498
530
|
|
|
499
|
-
|
|
500
|
-
filtered_raw = None
|
|
501
|
-
if self._raw_data is not None and filter_id in self._filters:
|
|
502
|
-
column_name = self._filters[filter_id]
|
|
503
|
-
filtered_raw = self._raw_data.filter(pl.col(column_name) == filter_value)
|
|
504
|
-
|
|
505
|
-
return levels, filtered_raw
|
|
531
|
+
return levels, None # Full resolution included in cached levels
|
|
506
532
|
|
|
507
533
|
def _get_levels_for_state(self, state: Dict[str, Any]) -> Tuple[list, Optional[pl.LazyFrame]]:
|
|
508
534
|
"""
|
|
@@ -630,10 +656,11 @@ class Heatmap(BaseComponent):
|
|
|
630
656
|
|
|
631
657
|
if count >= self._min_points:
|
|
632
658
|
# This level has enough detail
|
|
633
|
-
if count > self._min_points
|
|
634
|
-
#
|
|
635
|
-
|
|
636
|
-
|
|
659
|
+
if count > self._min_points:
|
|
660
|
+
# Over limit - downsample to stay at/under max
|
|
661
|
+
# Use ZOOM range for binning (not global) to avoid sparse bins
|
|
662
|
+
zoom_x_range = (x0, x1)
|
|
663
|
+
zoom_y_range = (y0, y1)
|
|
637
664
|
if self._use_streaming or self._use_simple_downsample:
|
|
638
665
|
if self._use_simple_downsample:
|
|
639
666
|
return downsample_2d_simple(
|
|
@@ -650,8 +677,8 @@ class Heatmap(BaseComponent):
|
|
|
650
677
|
intensity_column=self._intensity_column,
|
|
651
678
|
x_bins=self._x_bins,
|
|
652
679
|
y_bins=self._y_bins,
|
|
653
|
-
x_range=
|
|
654
|
-
y_range=
|
|
680
|
+
x_range=zoom_x_range,
|
|
681
|
+
y_range=zoom_y_range,
|
|
655
682
|
).collect()
|
|
656
683
|
else:
|
|
657
684
|
return downsample_2d(
|
|
@@ -744,7 +771,8 @@ class Heatmap(BaseComponent):
|
|
|
744
771
|
df_pandas = df_pandas.sort_values(self._intensity_column).reset_index(drop=True)
|
|
745
772
|
else:
|
|
746
773
|
# No filters to apply - levels already filtered by categorical filter
|
|
747
|
-
|
|
774
|
+
schema_names = data.collect_schema().names()
|
|
775
|
+
available_cols = [c for c in columns_to_select if c in schema_names]
|
|
748
776
|
df_polars = data.select(available_cols).collect()
|
|
749
777
|
# Sort by intensity ascending so high-intensity points are drawn on top
|
|
750
778
|
df_polars = df_polars.sort(self._intensity_column)
|
|
@@ -45,6 +45,7 @@ class LinePlot(BaseComponent):
|
|
|
45
45
|
self,
|
|
46
46
|
cache_id: str,
|
|
47
47
|
data: Optional[pl.LazyFrame] = None,
|
|
48
|
+
data_path: Optional[str] = None,
|
|
48
49
|
filters: Optional[Dict[str, str]] = None,
|
|
49
50
|
filter_defaults: Optional[Dict[str, Any]] = None,
|
|
50
51
|
interactivity: Optional[Dict[str, str]] = None,
|
|
@@ -68,6 +69,7 @@ class LinePlot(BaseComponent):
|
|
|
68
69
|
cache_id: Unique identifier for this component's cache (MANDATORY).
|
|
69
70
|
Creates a folder {cache_path}/{cache_id}/ for cached data.
|
|
70
71
|
data: Polars LazyFrame with plot data. Optional if cache exists.
|
|
72
|
+
data_path: Path to parquet file (preferred for large datasets).
|
|
71
73
|
filters: Mapping of identifier names to column names for filtering.
|
|
72
74
|
Example: {'spectrum': 'scan_id'}
|
|
73
75
|
When 'spectrum' selection exists, plot shows only data where
|
|
@@ -116,11 +118,22 @@ class LinePlot(BaseComponent):
|
|
|
116
118
|
super().__init__(
|
|
117
119
|
cache_id=cache_id,
|
|
118
120
|
data=data,
|
|
121
|
+
data_path=data_path,
|
|
119
122
|
filters=filters,
|
|
120
123
|
filter_defaults=filter_defaults,
|
|
121
124
|
interactivity=interactivity,
|
|
122
125
|
cache_path=cache_path,
|
|
123
126
|
regenerate_cache=regenerate_cache,
|
|
127
|
+
# Pass component-specific params for subprocess recreation
|
|
128
|
+
x_column=x_column,
|
|
129
|
+
y_column=y_column,
|
|
130
|
+
title=title,
|
|
131
|
+
x_label=x_label,
|
|
132
|
+
y_label=y_label,
|
|
133
|
+
highlight_column=highlight_column,
|
|
134
|
+
annotation_column=annotation_column,
|
|
135
|
+
styling=styling,
|
|
136
|
+
config=config,
|
|
124
137
|
**kwargs
|
|
125
138
|
)
|
|
126
139
|
|
|
@@ -208,9 +221,9 @@ class LinePlot(BaseComponent):
|
|
|
208
221
|
'annotation_column': self._annotation_column,
|
|
209
222
|
}
|
|
210
223
|
|
|
211
|
-
#
|
|
212
|
-
# Base class will
|
|
213
|
-
self._preprocessed_data['data'] = data
|
|
224
|
+
# Store LazyFrame for streaming to disk (filter happens at render time)
|
|
225
|
+
# Base class will use sink_parquet() to stream without full materialization
|
|
226
|
+
self._preprocessed_data['data'] = data # Keep lazy
|
|
214
227
|
|
|
215
228
|
def _get_vue_component_name(self) -> str:
|
|
216
229
|
"""Return the Vue component name."""
|
|
@@ -49,6 +49,7 @@ class Table(BaseComponent):
|
|
|
49
49
|
self,
|
|
50
50
|
cache_id: str,
|
|
51
51
|
data: Optional[pl.LazyFrame] = None,
|
|
52
|
+
data_path: Optional[str] = None,
|
|
52
53
|
filters: Optional[Dict[str, str]] = None,
|
|
53
54
|
filter_defaults: Optional[Dict[str, Any]] = None,
|
|
54
55
|
interactivity: Optional[Dict[str, str]] = None,
|
|
@@ -72,6 +73,7 @@ class Table(BaseComponent):
|
|
|
72
73
|
cache_id: Unique identifier for this component's cache (MANDATORY).
|
|
73
74
|
Creates a folder {cache_path}/{cache_id}/ for cached data.
|
|
74
75
|
data: Polars LazyFrame with table data. Optional if cache exists.
|
|
76
|
+
data_path: Path to parquet file (preferred for large datasets).
|
|
75
77
|
filters: Mapping of identifier names to column names for filtering.
|
|
76
78
|
Example: {'spectrum': 'scan_id'}
|
|
77
79
|
When 'spectrum' selection exists, table shows only rows where
|
|
@@ -120,11 +122,22 @@ class Table(BaseComponent):
|
|
|
120
122
|
super().__init__(
|
|
121
123
|
cache_id=cache_id,
|
|
122
124
|
data=data,
|
|
125
|
+
data_path=data_path,
|
|
123
126
|
filters=filters,
|
|
124
127
|
filter_defaults=filter_defaults,
|
|
125
128
|
interactivity=interactivity,
|
|
126
129
|
cache_path=cache_path,
|
|
127
130
|
regenerate_cache=regenerate_cache,
|
|
131
|
+
# Pass component-specific params for subprocess recreation
|
|
132
|
+
column_definitions=column_definitions,
|
|
133
|
+
title=title,
|
|
134
|
+
index_field=index_field,
|
|
135
|
+
go_to_fields=go_to_fields,
|
|
136
|
+
layout=layout,
|
|
137
|
+
default_row=default_row,
|
|
138
|
+
initial_sort=initial_sort,
|
|
139
|
+
pagination=pagination,
|
|
140
|
+
page_size=page_size,
|
|
128
141
|
**kwargs
|
|
129
142
|
)
|
|
130
143
|
|
|
@@ -204,9 +217,9 @@ class Table(BaseComponent):
|
|
|
204
217
|
# Store column definitions in preprocessed data for serialization
|
|
205
218
|
self._preprocessed_data['column_definitions'] = self._column_definitions
|
|
206
219
|
|
|
207
|
-
#
|
|
208
|
-
# Base class will
|
|
209
|
-
self._preprocessed_data['data'] = data
|
|
220
|
+
# Store LazyFrame for streaming to disk (filter happens at render time)
|
|
221
|
+
# Base class will use sink_parquet() to stream without full materialization
|
|
222
|
+
self._preprocessed_data['data'] = data # Keep lazy
|
|
210
223
|
|
|
211
224
|
def _get_columns_to_select(self) -> Optional[List[str]]:
|
|
212
225
|
"""Get list of columns needed for this table."""
|
|
@@ -43,6 +43,7 @@ class BaseComponent(ABC):
|
|
|
43
43
|
self,
|
|
44
44
|
cache_id: str,
|
|
45
45
|
data: Optional[pl.LazyFrame] = None,
|
|
46
|
+
data_path: Optional[str] = None,
|
|
46
47
|
filters: Optional[Dict[str, str]] = None,
|
|
47
48
|
filter_defaults: Optional[Dict[str, Any]] = None,
|
|
48
49
|
interactivity: Optional[Dict[str, str]] = None,
|
|
@@ -57,6 +58,9 @@ class BaseComponent(ABC):
|
|
|
57
58
|
cache_id: Unique identifier for this component's cache (MANDATORY).
|
|
58
59
|
Creates a folder {cache_path}/{cache_id}/ for cached data.
|
|
59
60
|
data: Polars LazyFrame with source data. Optional if cache exists.
|
|
61
|
+
data_path: Path to parquet file with source data. Preferred over
|
|
62
|
+
data= for large datasets as preprocessing runs in a subprocess
|
|
63
|
+
to ensure memory is released after cache creation.
|
|
60
64
|
filters: Mapping of identifier names to column names for filtering.
|
|
61
65
|
Example: {'spectrum': 'scan_id'}
|
|
62
66
|
When 'spectrum' selection exists, component filters data where
|
|
@@ -73,6 +77,10 @@ class BaseComponent(ABC):
|
|
|
73
77
|
regenerate_cache: If True, regenerate cache even if valid cache exists.
|
|
74
78
|
**kwargs: Component-specific configuration options
|
|
75
79
|
"""
|
|
80
|
+
# Validate inputs
|
|
81
|
+
if data is not None and data_path is not None:
|
|
82
|
+
raise ValueError("Provide either 'data' or 'data_path', not both")
|
|
83
|
+
|
|
76
84
|
self._cache_id = cache_id
|
|
77
85
|
self._cache_dir = get_cache_dir(cache_path, cache_id)
|
|
78
86
|
self._filters = filters or {}
|
|
@@ -83,18 +91,33 @@ class BaseComponent(ABC):
|
|
|
83
91
|
|
|
84
92
|
# Check if we should load from cache or preprocess
|
|
85
93
|
if regenerate_cache or not self._is_cache_valid():
|
|
86
|
-
if data is None:
|
|
94
|
+
if data is None and data_path is None:
|
|
87
95
|
raise CacheMissError(
|
|
88
96
|
f"Cache not found at '{self._cache_dir}' and no data provided. "
|
|
89
|
-
f"Either provide data
|
|
97
|
+
f"Either provide data=, data_path=, or ensure cache exists."
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
if data_path is not None:
|
|
101
|
+
# Subprocess preprocessing - memory released after cache creation
|
|
102
|
+
from .subprocess_preprocess import preprocess_component
|
|
103
|
+
preprocess_component(
|
|
104
|
+
type(self),
|
|
105
|
+
data_path=data_path,
|
|
106
|
+
cache_id=cache_id,
|
|
107
|
+
cache_path=cache_path,
|
|
108
|
+
filters=filters,
|
|
109
|
+
filter_defaults=filter_defaults,
|
|
110
|
+
interactivity=interactivity,
|
|
111
|
+
**kwargs
|
|
90
112
|
)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
113
|
+
self._raw_data = None
|
|
114
|
+
self._load_from_cache()
|
|
115
|
+
else:
|
|
116
|
+
# In-process preprocessing (backward compatible)
|
|
117
|
+
self._raw_data = data
|
|
118
|
+
self._validate_mappings()
|
|
119
|
+
self._preprocess()
|
|
120
|
+
self._save_to_cache()
|
|
98
121
|
else:
|
|
99
122
|
# Load from valid cache
|
|
100
123
|
self._raw_data = None
|
|
@@ -231,28 +254,18 @@ class BaseComponent(ABC):
|
|
|
231
254
|
"data_values": {},
|
|
232
255
|
}
|
|
233
256
|
|
|
234
|
-
# Save preprocessed data
|
|
235
|
-
row_group_size = self._get_row_group_size()
|
|
257
|
+
# Save preprocessed data - stream LazyFrames directly to disk
|
|
236
258
|
for key, value in self._preprocessed_data.items():
|
|
237
259
|
if isinstance(value, pl.LazyFrame):
|
|
238
260
|
filename = f"{key}.parquet"
|
|
239
261
|
filepath = preprocessed_dir / filename
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
compression='zstd',
|
|
243
|
-
statistics=True,
|
|
244
|
-
row_group_size=row_group_size,
|
|
245
|
-
)
|
|
262
|
+
# Stream directly to disk without full materialization
|
|
263
|
+
value.sink_parquet(filepath, compression='zstd')
|
|
246
264
|
manifest["data_files"][key] = filename
|
|
247
265
|
elif isinstance(value, pl.DataFrame):
|
|
248
266
|
filename = f"{key}.parquet"
|
|
249
267
|
filepath = preprocessed_dir / filename
|
|
250
|
-
value.write_parquet(
|
|
251
|
-
filepath,
|
|
252
|
-
compression='zstd',
|
|
253
|
-
statistics=True,
|
|
254
|
-
row_group_size=row_group_size,
|
|
255
|
-
)
|
|
268
|
+
value.write_parquet(filepath, compression='zstd')
|
|
256
269
|
manifest["data_files"][key] = filename
|
|
257
270
|
elif self._is_json_serializable(value):
|
|
258
271
|
manifest["data_values"][key] = value
|
|
@@ -261,6 +274,13 @@ class BaseComponent(ABC):
|
|
|
261
274
|
with open(self._get_manifest_path(), "w") as f:
|
|
262
275
|
json.dump(manifest, f, indent=2)
|
|
263
276
|
|
|
277
|
+
# Release memory - data is now safely on disk
|
|
278
|
+
self._preprocessed_data = {}
|
|
279
|
+
self._raw_data = None
|
|
280
|
+
|
|
281
|
+
# Reload as lazy scan_parquet() references
|
|
282
|
+
self._load_from_cache()
|
|
283
|
+
|
|
264
284
|
def _is_json_serializable(self, value: Any) -> bool:
|
|
265
285
|
"""Check if value can be JSON serialized."""
|
|
266
286
|
try:
|